Rename remaining GMX_ACCELERATION to GMX_CPU_ACCELERATION
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecCSTab_VdwNone_GeomW4W4_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_single
38  * Electrostatics interaction: CubicSplineTable
39  * VdW interaction:            None
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             shX,shY,shZ,rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset1;
67     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
68     int              vdwioffset2;
69     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
70     int              vdwioffset3;
71     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
72     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
73     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
74     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
75     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
76     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
77     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
78     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
79     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
80     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
81     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
82     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
83     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
84     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
85     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
86     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
87     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
88     real             *charge;
89     __m128i          vfitab;
90     __m128i          ifour       = _mm_set1_epi32(4);
91     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
92     real             *vftab;
93     __m128           dummy_mask,cutoff_mask;
94     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
95     __m128           one     = _mm_set1_ps(1.0);
96     __m128           two     = _mm_set1_ps(2.0);
97     x                = xx[0];
98     f                = ff[0];
99
100     nri              = nlist->nri;
101     iinr             = nlist->iinr;
102     jindex           = nlist->jindex;
103     jjnr             = nlist->jjnr;
104     shiftidx         = nlist->shift;
105     gid              = nlist->gid;
106     shiftvec         = fr->shift_vec[0];
107     fshift           = fr->fshift[0];
108     facel            = _mm_set1_ps(fr->epsfac);
109     charge           = mdatoms->chargeA;
110
111     vftab            = kernel_data->table_elec->data;
112     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
113
114     /* Setup water-specific parameters */
115     inr              = nlist->iinr[0];
116     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
117     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
118     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
119
120     jq1              = _mm_set1_ps(charge[inr+1]);
121     jq2              = _mm_set1_ps(charge[inr+2]);
122     jq3              = _mm_set1_ps(charge[inr+3]);
123     qq11             = _mm_mul_ps(iq1,jq1);
124     qq12             = _mm_mul_ps(iq1,jq2);
125     qq13             = _mm_mul_ps(iq1,jq3);
126     qq21             = _mm_mul_ps(iq2,jq1);
127     qq22             = _mm_mul_ps(iq2,jq2);
128     qq23             = _mm_mul_ps(iq2,jq3);
129     qq31             = _mm_mul_ps(iq3,jq1);
130     qq32             = _mm_mul_ps(iq3,jq2);
131     qq33             = _mm_mul_ps(iq3,jq3);
132
133     /* Avoid stupid compiler warnings */
134     jnrA = jnrB = jnrC = jnrD = 0;
135     j_coord_offsetA = 0;
136     j_coord_offsetB = 0;
137     j_coord_offsetC = 0;
138     j_coord_offsetD = 0;
139
140     outeriter        = 0;
141     inneriter        = 0;
142
143     /* Start outer loop over neighborlists */
144     for(iidx=0; iidx<nri; iidx++)
145     {
146         /* Load shift vector for this list */
147         i_shift_offset   = DIM*shiftidx[iidx];
148         shX              = shiftvec[i_shift_offset+XX];
149         shY              = shiftvec[i_shift_offset+YY];
150         shZ              = shiftvec[i_shift_offset+ZZ];
151
152         /* Load limits for loop over neighbors */
153         j_index_start    = jindex[iidx];
154         j_index_end      = jindex[iidx+1];
155
156         /* Get outer coordinate index */
157         inr              = iinr[iidx];
158         i_coord_offset   = DIM*inr;
159
160         /* Load i particle coords and add shift vector */
161         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
162         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
163         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
164         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
165         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
166         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
167         ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
168         iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
169         iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
170
171         fix1             = _mm_setzero_ps();
172         fiy1             = _mm_setzero_ps();
173         fiz1             = _mm_setzero_ps();
174         fix2             = _mm_setzero_ps();
175         fiy2             = _mm_setzero_ps();
176         fiz2             = _mm_setzero_ps();
177         fix3             = _mm_setzero_ps();
178         fiy3             = _mm_setzero_ps();
179         fiz3             = _mm_setzero_ps();
180
181         /* Reset potential sums */
182         velecsum         = _mm_setzero_ps();
183
184         /* Start inner kernel loop */
185         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
186         {
187
188             /* Get j neighbor index, and coordinate index */
189             jnrA             = jjnr[jidx];
190             jnrB             = jjnr[jidx+1];
191             jnrC             = jjnr[jidx+2];
192             jnrD             = jjnr[jidx+3];
193
194             j_coord_offsetA  = DIM*jnrA;
195             j_coord_offsetB  = DIM*jnrB;
196             j_coord_offsetC  = DIM*jnrC;
197             j_coord_offsetD  = DIM*jnrD;
198
199             /* load j atom coordinates */
200             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
201                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
202                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
203
204             /* Calculate displacement vector */
205             dx11             = _mm_sub_ps(ix1,jx1);
206             dy11             = _mm_sub_ps(iy1,jy1);
207             dz11             = _mm_sub_ps(iz1,jz1);
208             dx12             = _mm_sub_ps(ix1,jx2);
209             dy12             = _mm_sub_ps(iy1,jy2);
210             dz12             = _mm_sub_ps(iz1,jz2);
211             dx13             = _mm_sub_ps(ix1,jx3);
212             dy13             = _mm_sub_ps(iy1,jy3);
213             dz13             = _mm_sub_ps(iz1,jz3);
214             dx21             = _mm_sub_ps(ix2,jx1);
215             dy21             = _mm_sub_ps(iy2,jy1);
216             dz21             = _mm_sub_ps(iz2,jz1);
217             dx22             = _mm_sub_ps(ix2,jx2);
218             dy22             = _mm_sub_ps(iy2,jy2);
219             dz22             = _mm_sub_ps(iz2,jz2);
220             dx23             = _mm_sub_ps(ix2,jx3);
221             dy23             = _mm_sub_ps(iy2,jy3);
222             dz23             = _mm_sub_ps(iz2,jz3);
223             dx31             = _mm_sub_ps(ix3,jx1);
224             dy31             = _mm_sub_ps(iy3,jy1);
225             dz31             = _mm_sub_ps(iz3,jz1);
226             dx32             = _mm_sub_ps(ix3,jx2);
227             dy32             = _mm_sub_ps(iy3,jy2);
228             dz32             = _mm_sub_ps(iz3,jz2);
229             dx33             = _mm_sub_ps(ix3,jx3);
230             dy33             = _mm_sub_ps(iy3,jy3);
231             dz33             = _mm_sub_ps(iz3,jz3);
232
233             /* Calculate squared distance and things based on it */
234             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
235             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
236             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
237             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
238             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
239             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
240             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
241             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
242             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
243
244             rinv11           = gmx_mm_invsqrt_ps(rsq11);
245             rinv12           = gmx_mm_invsqrt_ps(rsq12);
246             rinv13           = gmx_mm_invsqrt_ps(rsq13);
247             rinv21           = gmx_mm_invsqrt_ps(rsq21);
248             rinv22           = gmx_mm_invsqrt_ps(rsq22);
249             rinv23           = gmx_mm_invsqrt_ps(rsq23);
250             rinv31           = gmx_mm_invsqrt_ps(rsq31);
251             rinv32           = gmx_mm_invsqrt_ps(rsq32);
252             rinv33           = gmx_mm_invsqrt_ps(rsq33);
253
254             fjx1             = _mm_setzero_ps();
255             fjy1             = _mm_setzero_ps();
256             fjz1             = _mm_setzero_ps();
257             fjx2             = _mm_setzero_ps();
258             fjy2             = _mm_setzero_ps();
259             fjz2             = _mm_setzero_ps();
260             fjx3             = _mm_setzero_ps();
261             fjy3             = _mm_setzero_ps();
262             fjz3             = _mm_setzero_ps();
263
264             /**************************
265              * CALCULATE INTERACTIONS *
266              **************************/
267
268             r11              = _mm_mul_ps(rsq11,rinv11);
269
270             /* Calculate table index by multiplying r with table scale and truncate to integer */
271             rt               = _mm_mul_ps(r11,vftabscale);
272             vfitab           = _mm_cvttps_epi32(rt);
273             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
274             vfitab           = _mm_slli_epi32(vfitab,2);
275
276             /* CUBIC SPLINE TABLE ELECTROSTATICS */
277             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
278             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
279             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
280             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
281             _MM_TRANSPOSE4_PS(Y,F,G,H);
282             Heps             = _mm_mul_ps(vfeps,H);
283             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
284             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
285             velec            = _mm_mul_ps(qq11,VV);
286             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
287             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
288
289             /* Update potential sum for this i atom from the interaction with this j atom. */
290             velecsum         = _mm_add_ps(velecsum,velec);
291
292             fscal            = felec;
293
294             /* Calculate temporary vectorial force */
295             tx               = _mm_mul_ps(fscal,dx11);
296             ty               = _mm_mul_ps(fscal,dy11);
297             tz               = _mm_mul_ps(fscal,dz11);
298
299             /* Update vectorial force */
300             fix1             = _mm_add_ps(fix1,tx);
301             fiy1             = _mm_add_ps(fiy1,ty);
302             fiz1             = _mm_add_ps(fiz1,tz);
303
304             fjx1             = _mm_add_ps(fjx1,tx);
305             fjy1             = _mm_add_ps(fjy1,ty);
306             fjz1             = _mm_add_ps(fjz1,tz);
307
308             /**************************
309              * CALCULATE INTERACTIONS *
310              **************************/
311
312             r12              = _mm_mul_ps(rsq12,rinv12);
313
314             /* Calculate table index by multiplying r with table scale and truncate to integer */
315             rt               = _mm_mul_ps(r12,vftabscale);
316             vfitab           = _mm_cvttps_epi32(rt);
317             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
318             vfitab           = _mm_slli_epi32(vfitab,2);
319
320             /* CUBIC SPLINE TABLE ELECTROSTATICS */
321             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
322             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
323             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
324             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
325             _MM_TRANSPOSE4_PS(Y,F,G,H);
326             Heps             = _mm_mul_ps(vfeps,H);
327             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
328             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
329             velec            = _mm_mul_ps(qq12,VV);
330             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
331             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
332
333             /* Update potential sum for this i atom from the interaction with this j atom. */
334             velecsum         = _mm_add_ps(velecsum,velec);
335
336             fscal            = felec;
337
338             /* Calculate temporary vectorial force */
339             tx               = _mm_mul_ps(fscal,dx12);
340             ty               = _mm_mul_ps(fscal,dy12);
341             tz               = _mm_mul_ps(fscal,dz12);
342
343             /* Update vectorial force */
344             fix1             = _mm_add_ps(fix1,tx);
345             fiy1             = _mm_add_ps(fiy1,ty);
346             fiz1             = _mm_add_ps(fiz1,tz);
347
348             fjx2             = _mm_add_ps(fjx2,tx);
349             fjy2             = _mm_add_ps(fjy2,ty);
350             fjz2             = _mm_add_ps(fjz2,tz);
351
352             /**************************
353              * CALCULATE INTERACTIONS *
354              **************************/
355
356             r13              = _mm_mul_ps(rsq13,rinv13);
357
358             /* Calculate table index by multiplying r with table scale and truncate to integer */
359             rt               = _mm_mul_ps(r13,vftabscale);
360             vfitab           = _mm_cvttps_epi32(rt);
361             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
362             vfitab           = _mm_slli_epi32(vfitab,2);
363
364             /* CUBIC SPLINE TABLE ELECTROSTATICS */
365             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
366             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
367             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
368             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
369             _MM_TRANSPOSE4_PS(Y,F,G,H);
370             Heps             = _mm_mul_ps(vfeps,H);
371             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
372             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
373             velec            = _mm_mul_ps(qq13,VV);
374             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
375             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
376
377             /* Update potential sum for this i atom from the interaction with this j atom. */
378             velecsum         = _mm_add_ps(velecsum,velec);
379
380             fscal            = felec;
381
382             /* Calculate temporary vectorial force */
383             tx               = _mm_mul_ps(fscal,dx13);
384             ty               = _mm_mul_ps(fscal,dy13);
385             tz               = _mm_mul_ps(fscal,dz13);
386
387             /* Update vectorial force */
388             fix1             = _mm_add_ps(fix1,tx);
389             fiy1             = _mm_add_ps(fiy1,ty);
390             fiz1             = _mm_add_ps(fiz1,tz);
391
392             fjx3             = _mm_add_ps(fjx3,tx);
393             fjy3             = _mm_add_ps(fjy3,ty);
394             fjz3             = _mm_add_ps(fjz3,tz);
395
396             /**************************
397              * CALCULATE INTERACTIONS *
398              **************************/
399
400             r21              = _mm_mul_ps(rsq21,rinv21);
401
402             /* Calculate table index by multiplying r with table scale and truncate to integer */
403             rt               = _mm_mul_ps(r21,vftabscale);
404             vfitab           = _mm_cvttps_epi32(rt);
405             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
406             vfitab           = _mm_slli_epi32(vfitab,2);
407
408             /* CUBIC SPLINE TABLE ELECTROSTATICS */
409             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
410             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
411             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
412             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
413             _MM_TRANSPOSE4_PS(Y,F,G,H);
414             Heps             = _mm_mul_ps(vfeps,H);
415             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
416             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
417             velec            = _mm_mul_ps(qq21,VV);
418             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
419             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
420
421             /* Update potential sum for this i atom from the interaction with this j atom. */
422             velecsum         = _mm_add_ps(velecsum,velec);
423
424             fscal            = felec;
425
426             /* Calculate temporary vectorial force */
427             tx               = _mm_mul_ps(fscal,dx21);
428             ty               = _mm_mul_ps(fscal,dy21);
429             tz               = _mm_mul_ps(fscal,dz21);
430
431             /* Update vectorial force */
432             fix2             = _mm_add_ps(fix2,tx);
433             fiy2             = _mm_add_ps(fiy2,ty);
434             fiz2             = _mm_add_ps(fiz2,tz);
435
436             fjx1             = _mm_add_ps(fjx1,tx);
437             fjy1             = _mm_add_ps(fjy1,ty);
438             fjz1             = _mm_add_ps(fjz1,tz);
439
440             /**************************
441              * CALCULATE INTERACTIONS *
442              **************************/
443
444             r22              = _mm_mul_ps(rsq22,rinv22);
445
446             /* Calculate table index by multiplying r with table scale and truncate to integer */
447             rt               = _mm_mul_ps(r22,vftabscale);
448             vfitab           = _mm_cvttps_epi32(rt);
449             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
450             vfitab           = _mm_slli_epi32(vfitab,2);
451
452             /* CUBIC SPLINE TABLE ELECTROSTATICS */
453             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
454             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
455             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
456             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
457             _MM_TRANSPOSE4_PS(Y,F,G,H);
458             Heps             = _mm_mul_ps(vfeps,H);
459             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
460             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
461             velec            = _mm_mul_ps(qq22,VV);
462             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
463             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
464
465             /* Update potential sum for this i atom from the interaction with this j atom. */
466             velecsum         = _mm_add_ps(velecsum,velec);
467
468             fscal            = felec;
469
470             /* Calculate temporary vectorial force */
471             tx               = _mm_mul_ps(fscal,dx22);
472             ty               = _mm_mul_ps(fscal,dy22);
473             tz               = _mm_mul_ps(fscal,dz22);
474
475             /* Update vectorial force */
476             fix2             = _mm_add_ps(fix2,tx);
477             fiy2             = _mm_add_ps(fiy2,ty);
478             fiz2             = _mm_add_ps(fiz2,tz);
479
480             fjx2             = _mm_add_ps(fjx2,tx);
481             fjy2             = _mm_add_ps(fjy2,ty);
482             fjz2             = _mm_add_ps(fjz2,tz);
483
484             /**************************
485              * CALCULATE INTERACTIONS *
486              **************************/
487
488             r23              = _mm_mul_ps(rsq23,rinv23);
489
490             /* Calculate table index by multiplying r with table scale and truncate to integer */
491             rt               = _mm_mul_ps(r23,vftabscale);
492             vfitab           = _mm_cvttps_epi32(rt);
493             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
494             vfitab           = _mm_slli_epi32(vfitab,2);
495
496             /* CUBIC SPLINE TABLE ELECTROSTATICS */
497             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
498             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
499             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
500             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
501             _MM_TRANSPOSE4_PS(Y,F,G,H);
502             Heps             = _mm_mul_ps(vfeps,H);
503             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
504             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
505             velec            = _mm_mul_ps(qq23,VV);
506             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
507             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
508
509             /* Update potential sum for this i atom from the interaction with this j atom. */
510             velecsum         = _mm_add_ps(velecsum,velec);
511
512             fscal            = felec;
513
514             /* Calculate temporary vectorial force */
515             tx               = _mm_mul_ps(fscal,dx23);
516             ty               = _mm_mul_ps(fscal,dy23);
517             tz               = _mm_mul_ps(fscal,dz23);
518
519             /* Update vectorial force */
520             fix2             = _mm_add_ps(fix2,tx);
521             fiy2             = _mm_add_ps(fiy2,ty);
522             fiz2             = _mm_add_ps(fiz2,tz);
523
524             fjx3             = _mm_add_ps(fjx3,tx);
525             fjy3             = _mm_add_ps(fjy3,ty);
526             fjz3             = _mm_add_ps(fjz3,tz);
527
528             /**************************
529              * CALCULATE INTERACTIONS *
530              **************************/
531
532             r31              = _mm_mul_ps(rsq31,rinv31);
533
534             /* Calculate table index by multiplying r with table scale and truncate to integer */
535             rt               = _mm_mul_ps(r31,vftabscale);
536             vfitab           = _mm_cvttps_epi32(rt);
537             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
538             vfitab           = _mm_slli_epi32(vfitab,2);
539
540             /* CUBIC SPLINE TABLE ELECTROSTATICS */
541             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
542             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
543             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
544             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
545             _MM_TRANSPOSE4_PS(Y,F,G,H);
546             Heps             = _mm_mul_ps(vfeps,H);
547             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
548             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
549             velec            = _mm_mul_ps(qq31,VV);
550             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
551             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
552
553             /* Update potential sum for this i atom from the interaction with this j atom. */
554             velecsum         = _mm_add_ps(velecsum,velec);
555
556             fscal            = felec;
557
558             /* Calculate temporary vectorial force */
559             tx               = _mm_mul_ps(fscal,dx31);
560             ty               = _mm_mul_ps(fscal,dy31);
561             tz               = _mm_mul_ps(fscal,dz31);
562
563             /* Update vectorial force */
564             fix3             = _mm_add_ps(fix3,tx);
565             fiy3             = _mm_add_ps(fiy3,ty);
566             fiz3             = _mm_add_ps(fiz3,tz);
567
568             fjx1             = _mm_add_ps(fjx1,tx);
569             fjy1             = _mm_add_ps(fjy1,ty);
570             fjz1             = _mm_add_ps(fjz1,tz);
571
572             /**************************
573              * CALCULATE INTERACTIONS *
574              **************************/
575
576             r32              = _mm_mul_ps(rsq32,rinv32);
577
578             /* Calculate table index by multiplying r with table scale and truncate to integer */
579             rt               = _mm_mul_ps(r32,vftabscale);
580             vfitab           = _mm_cvttps_epi32(rt);
581             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
582             vfitab           = _mm_slli_epi32(vfitab,2);
583
584             /* CUBIC SPLINE TABLE ELECTROSTATICS */
585             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
586             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
587             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
588             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
589             _MM_TRANSPOSE4_PS(Y,F,G,H);
590             Heps             = _mm_mul_ps(vfeps,H);
591             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
592             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
593             velec            = _mm_mul_ps(qq32,VV);
594             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
595             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
596
597             /* Update potential sum for this i atom from the interaction with this j atom. */
598             velecsum         = _mm_add_ps(velecsum,velec);
599
600             fscal            = felec;
601
602             /* Calculate temporary vectorial force */
603             tx               = _mm_mul_ps(fscal,dx32);
604             ty               = _mm_mul_ps(fscal,dy32);
605             tz               = _mm_mul_ps(fscal,dz32);
606
607             /* Update vectorial force */
608             fix3             = _mm_add_ps(fix3,tx);
609             fiy3             = _mm_add_ps(fiy3,ty);
610             fiz3             = _mm_add_ps(fiz3,tz);
611
612             fjx2             = _mm_add_ps(fjx2,tx);
613             fjy2             = _mm_add_ps(fjy2,ty);
614             fjz2             = _mm_add_ps(fjz2,tz);
615
616             /**************************
617              * CALCULATE INTERACTIONS *
618              **************************/
619
620             r33              = _mm_mul_ps(rsq33,rinv33);
621
622             /* Calculate table index by multiplying r with table scale and truncate to integer */
623             rt               = _mm_mul_ps(r33,vftabscale);
624             vfitab           = _mm_cvttps_epi32(rt);
625             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
626             vfitab           = _mm_slli_epi32(vfitab,2);
627
628             /* CUBIC SPLINE TABLE ELECTROSTATICS */
629             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
630             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
631             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
632             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
633             _MM_TRANSPOSE4_PS(Y,F,G,H);
634             Heps             = _mm_mul_ps(vfeps,H);
635             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
636             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
637             velec            = _mm_mul_ps(qq33,VV);
638             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
639             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
640
641             /* Update potential sum for this i atom from the interaction with this j atom. */
642             velecsum         = _mm_add_ps(velecsum,velec);
643
644             fscal            = felec;
645
646             /* Calculate temporary vectorial force */
647             tx               = _mm_mul_ps(fscal,dx33);
648             ty               = _mm_mul_ps(fscal,dy33);
649             tz               = _mm_mul_ps(fscal,dz33);
650
651             /* Update vectorial force */
652             fix3             = _mm_add_ps(fix3,tx);
653             fiy3             = _mm_add_ps(fiy3,ty);
654             fiz3             = _mm_add_ps(fiz3,tz);
655
656             fjx3             = _mm_add_ps(fjx3,tx);
657             fjy3             = _mm_add_ps(fjy3,ty);
658             fjz3             = _mm_add_ps(fjz3,tz);
659
660             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
661                                                    f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
662                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
663
664             /* Inner loop uses 387 flops */
665         }
666
667         if(jidx<j_index_end)
668         {
669
670             /* Get j neighbor index, and coordinate index */
671             jnrA             = jjnr[jidx];
672             jnrB             = jjnr[jidx+1];
673             jnrC             = jjnr[jidx+2];
674             jnrD             = jjnr[jidx+3];
675
676             /* Sign of each element will be negative for non-real atoms.
677              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
678              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
679              */
680             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
681             jnrA       = (jnrA>=0) ? jnrA : 0;
682             jnrB       = (jnrB>=0) ? jnrB : 0;
683             jnrC       = (jnrC>=0) ? jnrC : 0;
684             jnrD       = (jnrD>=0) ? jnrD : 0;
685
686             j_coord_offsetA  = DIM*jnrA;
687             j_coord_offsetB  = DIM*jnrB;
688             j_coord_offsetC  = DIM*jnrC;
689             j_coord_offsetD  = DIM*jnrD;
690
691             /* load j atom coordinates */
692             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
693                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
694                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
695
696             /* Calculate displacement vector */
697             dx11             = _mm_sub_ps(ix1,jx1);
698             dy11             = _mm_sub_ps(iy1,jy1);
699             dz11             = _mm_sub_ps(iz1,jz1);
700             dx12             = _mm_sub_ps(ix1,jx2);
701             dy12             = _mm_sub_ps(iy1,jy2);
702             dz12             = _mm_sub_ps(iz1,jz2);
703             dx13             = _mm_sub_ps(ix1,jx3);
704             dy13             = _mm_sub_ps(iy1,jy3);
705             dz13             = _mm_sub_ps(iz1,jz3);
706             dx21             = _mm_sub_ps(ix2,jx1);
707             dy21             = _mm_sub_ps(iy2,jy1);
708             dz21             = _mm_sub_ps(iz2,jz1);
709             dx22             = _mm_sub_ps(ix2,jx2);
710             dy22             = _mm_sub_ps(iy2,jy2);
711             dz22             = _mm_sub_ps(iz2,jz2);
712             dx23             = _mm_sub_ps(ix2,jx3);
713             dy23             = _mm_sub_ps(iy2,jy3);
714             dz23             = _mm_sub_ps(iz2,jz3);
715             dx31             = _mm_sub_ps(ix3,jx1);
716             dy31             = _mm_sub_ps(iy3,jy1);
717             dz31             = _mm_sub_ps(iz3,jz1);
718             dx32             = _mm_sub_ps(ix3,jx2);
719             dy32             = _mm_sub_ps(iy3,jy2);
720             dz32             = _mm_sub_ps(iz3,jz2);
721             dx33             = _mm_sub_ps(ix3,jx3);
722             dy33             = _mm_sub_ps(iy3,jy3);
723             dz33             = _mm_sub_ps(iz3,jz3);
724
725             /* Calculate squared distance and things based on it */
726             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
727             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
728             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
729             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
730             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
731             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
732             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
733             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
734             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
735
736             rinv11           = gmx_mm_invsqrt_ps(rsq11);
737             rinv12           = gmx_mm_invsqrt_ps(rsq12);
738             rinv13           = gmx_mm_invsqrt_ps(rsq13);
739             rinv21           = gmx_mm_invsqrt_ps(rsq21);
740             rinv22           = gmx_mm_invsqrt_ps(rsq22);
741             rinv23           = gmx_mm_invsqrt_ps(rsq23);
742             rinv31           = gmx_mm_invsqrt_ps(rsq31);
743             rinv32           = gmx_mm_invsqrt_ps(rsq32);
744             rinv33           = gmx_mm_invsqrt_ps(rsq33);
745
746             fjx1             = _mm_setzero_ps();
747             fjy1             = _mm_setzero_ps();
748             fjz1             = _mm_setzero_ps();
749             fjx2             = _mm_setzero_ps();
750             fjy2             = _mm_setzero_ps();
751             fjz2             = _mm_setzero_ps();
752             fjx3             = _mm_setzero_ps();
753             fjy3             = _mm_setzero_ps();
754             fjz3             = _mm_setzero_ps();
755
756             /**************************
757              * CALCULATE INTERACTIONS *
758              **************************/
759
760             r11              = _mm_mul_ps(rsq11,rinv11);
761             r11              = _mm_andnot_ps(dummy_mask,r11);
762
763             /* Calculate table index by multiplying r with table scale and truncate to integer */
764             rt               = _mm_mul_ps(r11,vftabscale);
765             vfitab           = _mm_cvttps_epi32(rt);
766             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
767             vfitab           = _mm_slli_epi32(vfitab,2);
768
769             /* CUBIC SPLINE TABLE ELECTROSTATICS */
770             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
771             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
772             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
773             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
774             _MM_TRANSPOSE4_PS(Y,F,G,H);
775             Heps             = _mm_mul_ps(vfeps,H);
776             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
777             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
778             velec            = _mm_mul_ps(qq11,VV);
779             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
780             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
781
782             /* Update potential sum for this i atom from the interaction with this j atom. */
783             velec            = _mm_andnot_ps(dummy_mask,velec);
784             velecsum         = _mm_add_ps(velecsum,velec);
785
786             fscal            = felec;
787
788             fscal            = _mm_andnot_ps(dummy_mask,fscal);
789
790             /* Calculate temporary vectorial force */
791             tx               = _mm_mul_ps(fscal,dx11);
792             ty               = _mm_mul_ps(fscal,dy11);
793             tz               = _mm_mul_ps(fscal,dz11);
794
795             /* Update vectorial force */
796             fix1             = _mm_add_ps(fix1,tx);
797             fiy1             = _mm_add_ps(fiy1,ty);
798             fiz1             = _mm_add_ps(fiz1,tz);
799
800             fjx1             = _mm_add_ps(fjx1,tx);
801             fjy1             = _mm_add_ps(fjy1,ty);
802             fjz1             = _mm_add_ps(fjz1,tz);
803
804             /**************************
805              * CALCULATE INTERACTIONS *
806              **************************/
807
808             r12              = _mm_mul_ps(rsq12,rinv12);
809             r12              = _mm_andnot_ps(dummy_mask,r12);
810
811             /* Calculate table index by multiplying r with table scale and truncate to integer */
812             rt               = _mm_mul_ps(r12,vftabscale);
813             vfitab           = _mm_cvttps_epi32(rt);
814             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
815             vfitab           = _mm_slli_epi32(vfitab,2);
816
817             /* CUBIC SPLINE TABLE ELECTROSTATICS */
818             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
819             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
820             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
821             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
822             _MM_TRANSPOSE4_PS(Y,F,G,H);
823             Heps             = _mm_mul_ps(vfeps,H);
824             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
825             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
826             velec            = _mm_mul_ps(qq12,VV);
827             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
828             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
829
830             /* Update potential sum for this i atom from the interaction with this j atom. */
831             velec            = _mm_andnot_ps(dummy_mask,velec);
832             velecsum         = _mm_add_ps(velecsum,velec);
833
834             fscal            = felec;
835
836             fscal            = _mm_andnot_ps(dummy_mask,fscal);
837
838             /* Calculate temporary vectorial force */
839             tx               = _mm_mul_ps(fscal,dx12);
840             ty               = _mm_mul_ps(fscal,dy12);
841             tz               = _mm_mul_ps(fscal,dz12);
842
843             /* Update vectorial force */
844             fix1             = _mm_add_ps(fix1,tx);
845             fiy1             = _mm_add_ps(fiy1,ty);
846             fiz1             = _mm_add_ps(fiz1,tz);
847
848             fjx2             = _mm_add_ps(fjx2,tx);
849             fjy2             = _mm_add_ps(fjy2,ty);
850             fjz2             = _mm_add_ps(fjz2,tz);
851
852             /**************************
853              * CALCULATE INTERACTIONS *
854              **************************/
855
856             r13              = _mm_mul_ps(rsq13,rinv13);
857             r13              = _mm_andnot_ps(dummy_mask,r13);
858
859             /* Calculate table index by multiplying r with table scale and truncate to integer */
860             rt               = _mm_mul_ps(r13,vftabscale);
861             vfitab           = _mm_cvttps_epi32(rt);
862             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
863             vfitab           = _mm_slli_epi32(vfitab,2);
864
865             /* CUBIC SPLINE TABLE ELECTROSTATICS */
866             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
867             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
868             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
869             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
870             _MM_TRANSPOSE4_PS(Y,F,G,H);
871             Heps             = _mm_mul_ps(vfeps,H);
872             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
873             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
874             velec            = _mm_mul_ps(qq13,VV);
875             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
876             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
877
878             /* Update potential sum for this i atom from the interaction with this j atom. */
879             velec            = _mm_andnot_ps(dummy_mask,velec);
880             velecsum         = _mm_add_ps(velecsum,velec);
881
882             fscal            = felec;
883
884             fscal            = _mm_andnot_ps(dummy_mask,fscal);
885
886             /* Calculate temporary vectorial force */
887             tx               = _mm_mul_ps(fscal,dx13);
888             ty               = _mm_mul_ps(fscal,dy13);
889             tz               = _mm_mul_ps(fscal,dz13);
890
891             /* Update vectorial force */
892             fix1             = _mm_add_ps(fix1,tx);
893             fiy1             = _mm_add_ps(fiy1,ty);
894             fiz1             = _mm_add_ps(fiz1,tz);
895
896             fjx3             = _mm_add_ps(fjx3,tx);
897             fjy3             = _mm_add_ps(fjy3,ty);
898             fjz3             = _mm_add_ps(fjz3,tz);
899
900             /**************************
901              * CALCULATE INTERACTIONS *
902              **************************/
903
904             r21              = _mm_mul_ps(rsq21,rinv21);
905             r21              = _mm_andnot_ps(dummy_mask,r21);
906
907             /* Calculate table index by multiplying r with table scale and truncate to integer */
908             rt               = _mm_mul_ps(r21,vftabscale);
909             vfitab           = _mm_cvttps_epi32(rt);
910             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
911             vfitab           = _mm_slli_epi32(vfitab,2);
912
913             /* CUBIC SPLINE TABLE ELECTROSTATICS */
914             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
915             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
916             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
917             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
918             _MM_TRANSPOSE4_PS(Y,F,G,H);
919             Heps             = _mm_mul_ps(vfeps,H);
920             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
921             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
922             velec            = _mm_mul_ps(qq21,VV);
923             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
924             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
925
926             /* Update potential sum for this i atom from the interaction with this j atom. */
927             velec            = _mm_andnot_ps(dummy_mask,velec);
928             velecsum         = _mm_add_ps(velecsum,velec);
929
930             fscal            = felec;
931
932             fscal            = _mm_andnot_ps(dummy_mask,fscal);
933
934             /* Calculate temporary vectorial force */
935             tx               = _mm_mul_ps(fscal,dx21);
936             ty               = _mm_mul_ps(fscal,dy21);
937             tz               = _mm_mul_ps(fscal,dz21);
938
939             /* Update vectorial force */
940             fix2             = _mm_add_ps(fix2,tx);
941             fiy2             = _mm_add_ps(fiy2,ty);
942             fiz2             = _mm_add_ps(fiz2,tz);
943
944             fjx1             = _mm_add_ps(fjx1,tx);
945             fjy1             = _mm_add_ps(fjy1,ty);
946             fjz1             = _mm_add_ps(fjz1,tz);
947
948             /**************************
949              * CALCULATE INTERACTIONS *
950              **************************/
951
952             r22              = _mm_mul_ps(rsq22,rinv22);
953             r22              = _mm_andnot_ps(dummy_mask,r22);
954
955             /* Calculate table index by multiplying r with table scale and truncate to integer */
956             rt               = _mm_mul_ps(r22,vftabscale);
957             vfitab           = _mm_cvttps_epi32(rt);
958             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
959             vfitab           = _mm_slli_epi32(vfitab,2);
960
961             /* CUBIC SPLINE TABLE ELECTROSTATICS */
962             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
963             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
964             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
965             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
966             _MM_TRANSPOSE4_PS(Y,F,G,H);
967             Heps             = _mm_mul_ps(vfeps,H);
968             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
969             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
970             velec            = _mm_mul_ps(qq22,VV);
971             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
972             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
973
974             /* Update potential sum for this i atom from the interaction with this j atom. */
975             velec            = _mm_andnot_ps(dummy_mask,velec);
976             velecsum         = _mm_add_ps(velecsum,velec);
977
978             fscal            = felec;
979
980             fscal            = _mm_andnot_ps(dummy_mask,fscal);
981
982             /* Calculate temporary vectorial force */
983             tx               = _mm_mul_ps(fscal,dx22);
984             ty               = _mm_mul_ps(fscal,dy22);
985             tz               = _mm_mul_ps(fscal,dz22);
986
987             /* Update vectorial force */
988             fix2             = _mm_add_ps(fix2,tx);
989             fiy2             = _mm_add_ps(fiy2,ty);
990             fiz2             = _mm_add_ps(fiz2,tz);
991
992             fjx2             = _mm_add_ps(fjx2,tx);
993             fjy2             = _mm_add_ps(fjy2,ty);
994             fjz2             = _mm_add_ps(fjz2,tz);
995
996             /**************************
997              * CALCULATE INTERACTIONS *
998              **************************/
999
1000             r23              = _mm_mul_ps(rsq23,rinv23);
1001             r23              = _mm_andnot_ps(dummy_mask,r23);
1002
1003             /* Calculate table index by multiplying r with table scale and truncate to integer */
1004             rt               = _mm_mul_ps(r23,vftabscale);
1005             vfitab           = _mm_cvttps_epi32(rt);
1006             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1007             vfitab           = _mm_slli_epi32(vfitab,2);
1008
1009             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1010             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1011             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1012             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1013             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1014             _MM_TRANSPOSE4_PS(Y,F,G,H);
1015             Heps             = _mm_mul_ps(vfeps,H);
1016             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1017             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1018             velec            = _mm_mul_ps(qq23,VV);
1019             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1020             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1021
1022             /* Update potential sum for this i atom from the interaction with this j atom. */
1023             velec            = _mm_andnot_ps(dummy_mask,velec);
1024             velecsum         = _mm_add_ps(velecsum,velec);
1025
1026             fscal            = felec;
1027
1028             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1029
1030             /* Calculate temporary vectorial force */
1031             tx               = _mm_mul_ps(fscal,dx23);
1032             ty               = _mm_mul_ps(fscal,dy23);
1033             tz               = _mm_mul_ps(fscal,dz23);
1034
1035             /* Update vectorial force */
1036             fix2             = _mm_add_ps(fix2,tx);
1037             fiy2             = _mm_add_ps(fiy2,ty);
1038             fiz2             = _mm_add_ps(fiz2,tz);
1039
1040             fjx3             = _mm_add_ps(fjx3,tx);
1041             fjy3             = _mm_add_ps(fjy3,ty);
1042             fjz3             = _mm_add_ps(fjz3,tz);
1043
1044             /**************************
1045              * CALCULATE INTERACTIONS *
1046              **************************/
1047
1048             r31              = _mm_mul_ps(rsq31,rinv31);
1049             r31              = _mm_andnot_ps(dummy_mask,r31);
1050
1051             /* Calculate table index by multiplying r with table scale and truncate to integer */
1052             rt               = _mm_mul_ps(r31,vftabscale);
1053             vfitab           = _mm_cvttps_epi32(rt);
1054             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1055             vfitab           = _mm_slli_epi32(vfitab,2);
1056
1057             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1058             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1059             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1060             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1061             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1062             _MM_TRANSPOSE4_PS(Y,F,G,H);
1063             Heps             = _mm_mul_ps(vfeps,H);
1064             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1065             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1066             velec            = _mm_mul_ps(qq31,VV);
1067             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1068             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1069
1070             /* Update potential sum for this i atom from the interaction with this j atom. */
1071             velec            = _mm_andnot_ps(dummy_mask,velec);
1072             velecsum         = _mm_add_ps(velecsum,velec);
1073
1074             fscal            = felec;
1075
1076             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1077
1078             /* Calculate temporary vectorial force */
1079             tx               = _mm_mul_ps(fscal,dx31);
1080             ty               = _mm_mul_ps(fscal,dy31);
1081             tz               = _mm_mul_ps(fscal,dz31);
1082
1083             /* Update vectorial force */
1084             fix3             = _mm_add_ps(fix3,tx);
1085             fiy3             = _mm_add_ps(fiy3,ty);
1086             fiz3             = _mm_add_ps(fiz3,tz);
1087
1088             fjx1             = _mm_add_ps(fjx1,tx);
1089             fjy1             = _mm_add_ps(fjy1,ty);
1090             fjz1             = _mm_add_ps(fjz1,tz);
1091
1092             /**************************
1093              * CALCULATE INTERACTIONS *
1094              **************************/
1095
1096             r32              = _mm_mul_ps(rsq32,rinv32);
1097             r32              = _mm_andnot_ps(dummy_mask,r32);
1098
1099             /* Calculate table index by multiplying r with table scale and truncate to integer */
1100             rt               = _mm_mul_ps(r32,vftabscale);
1101             vfitab           = _mm_cvttps_epi32(rt);
1102             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1103             vfitab           = _mm_slli_epi32(vfitab,2);
1104
1105             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1106             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1107             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1108             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1109             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1110             _MM_TRANSPOSE4_PS(Y,F,G,H);
1111             Heps             = _mm_mul_ps(vfeps,H);
1112             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1113             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1114             velec            = _mm_mul_ps(qq32,VV);
1115             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1116             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1117
1118             /* Update potential sum for this i atom from the interaction with this j atom. */
1119             velec            = _mm_andnot_ps(dummy_mask,velec);
1120             velecsum         = _mm_add_ps(velecsum,velec);
1121
1122             fscal            = felec;
1123
1124             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1125
1126             /* Calculate temporary vectorial force */
1127             tx               = _mm_mul_ps(fscal,dx32);
1128             ty               = _mm_mul_ps(fscal,dy32);
1129             tz               = _mm_mul_ps(fscal,dz32);
1130
1131             /* Update vectorial force */
1132             fix3             = _mm_add_ps(fix3,tx);
1133             fiy3             = _mm_add_ps(fiy3,ty);
1134             fiz3             = _mm_add_ps(fiz3,tz);
1135
1136             fjx2             = _mm_add_ps(fjx2,tx);
1137             fjy2             = _mm_add_ps(fjy2,ty);
1138             fjz2             = _mm_add_ps(fjz2,tz);
1139
1140             /**************************
1141              * CALCULATE INTERACTIONS *
1142              **************************/
1143
1144             r33              = _mm_mul_ps(rsq33,rinv33);
1145             r33              = _mm_andnot_ps(dummy_mask,r33);
1146
1147             /* Calculate table index by multiplying r with table scale and truncate to integer */
1148             rt               = _mm_mul_ps(r33,vftabscale);
1149             vfitab           = _mm_cvttps_epi32(rt);
1150             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1151             vfitab           = _mm_slli_epi32(vfitab,2);
1152
1153             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1154             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1155             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1156             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1157             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1158             _MM_TRANSPOSE4_PS(Y,F,G,H);
1159             Heps             = _mm_mul_ps(vfeps,H);
1160             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1161             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1162             velec            = _mm_mul_ps(qq33,VV);
1163             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1164             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1165
1166             /* Update potential sum for this i atom from the interaction with this j atom. */
1167             velec            = _mm_andnot_ps(dummy_mask,velec);
1168             velecsum         = _mm_add_ps(velecsum,velec);
1169
1170             fscal            = felec;
1171
1172             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1173
1174             /* Calculate temporary vectorial force */
1175             tx               = _mm_mul_ps(fscal,dx33);
1176             ty               = _mm_mul_ps(fscal,dy33);
1177             tz               = _mm_mul_ps(fscal,dz33);
1178
1179             /* Update vectorial force */
1180             fix3             = _mm_add_ps(fix3,tx);
1181             fiy3             = _mm_add_ps(fiy3,ty);
1182             fiz3             = _mm_add_ps(fiz3,tz);
1183
1184             fjx3             = _mm_add_ps(fjx3,tx);
1185             fjy3             = _mm_add_ps(fjy3,ty);
1186             fjz3             = _mm_add_ps(fjz3,tz);
1187
1188             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
1189                                                    f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
1190                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1191
1192             /* Inner loop uses 396 flops */
1193         }
1194
1195         /* End of innermost loop */
1196
1197         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1198                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1199
1200         ggid                        = gid[iidx];
1201         /* Update potential energies */
1202         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1203
1204         /* Increment number of inner iterations */
1205         inneriter                  += j_index_end - j_index_start;
1206
1207         /* Outer loop uses 28 flops */
1208     }
1209
1210     /* Increment number of outer iterations */
1211     outeriter        += nri;
1212
1213     /* Update outer/inner flops */
1214
1215     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*396);
1216 }
1217 /*
1218  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single
1219  * Electrostatics interaction: CubicSplineTable
1220  * VdW interaction:            None
1221  * Geometry:                   Water4-Water4
1222  * Calculate force/pot:        Force
1223  */
1224 void
1225 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single
1226                     (t_nblist * gmx_restrict                nlist,
1227                      rvec * gmx_restrict                    xx,
1228                      rvec * gmx_restrict                    ff,
1229                      t_forcerec * gmx_restrict              fr,
1230                      t_mdatoms * gmx_restrict               mdatoms,
1231                      nb_kernel_data_t * gmx_restrict        kernel_data,
1232                      t_nrnb * gmx_restrict                  nrnb)
1233 {
1234     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1235      * just 0 for non-waters.
1236      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1237      * jnr indices corresponding to data put in the four positions in the SIMD register.
1238      */
1239     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1240     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1241     int              jnrA,jnrB,jnrC,jnrD;
1242     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1243     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1244     real             shX,shY,shZ,rcutoff_scalar;
1245     real             *shiftvec,*fshift,*x,*f;
1246     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1247     int              vdwioffset1;
1248     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1249     int              vdwioffset2;
1250     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1251     int              vdwioffset3;
1252     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1253     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1254     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1255     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1256     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1257     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1258     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1259     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1260     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1261     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1262     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1263     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1264     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1265     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1266     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1267     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1268     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1269     real             *charge;
1270     __m128i          vfitab;
1271     __m128i          ifour       = _mm_set1_epi32(4);
1272     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1273     real             *vftab;
1274     __m128           dummy_mask,cutoff_mask;
1275     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1276     __m128           one     = _mm_set1_ps(1.0);
1277     __m128           two     = _mm_set1_ps(2.0);
1278     x                = xx[0];
1279     f                = ff[0];
1280
1281     nri              = nlist->nri;
1282     iinr             = nlist->iinr;
1283     jindex           = nlist->jindex;
1284     jjnr             = nlist->jjnr;
1285     shiftidx         = nlist->shift;
1286     gid              = nlist->gid;
1287     shiftvec         = fr->shift_vec[0];
1288     fshift           = fr->fshift[0];
1289     facel            = _mm_set1_ps(fr->epsfac);
1290     charge           = mdatoms->chargeA;
1291
1292     vftab            = kernel_data->table_elec->data;
1293     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
1294
1295     /* Setup water-specific parameters */
1296     inr              = nlist->iinr[0];
1297     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1298     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1299     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1300
1301     jq1              = _mm_set1_ps(charge[inr+1]);
1302     jq2              = _mm_set1_ps(charge[inr+2]);
1303     jq3              = _mm_set1_ps(charge[inr+3]);
1304     qq11             = _mm_mul_ps(iq1,jq1);
1305     qq12             = _mm_mul_ps(iq1,jq2);
1306     qq13             = _mm_mul_ps(iq1,jq3);
1307     qq21             = _mm_mul_ps(iq2,jq1);
1308     qq22             = _mm_mul_ps(iq2,jq2);
1309     qq23             = _mm_mul_ps(iq2,jq3);
1310     qq31             = _mm_mul_ps(iq3,jq1);
1311     qq32             = _mm_mul_ps(iq3,jq2);
1312     qq33             = _mm_mul_ps(iq3,jq3);
1313
1314     /* Avoid stupid compiler warnings */
1315     jnrA = jnrB = jnrC = jnrD = 0;
1316     j_coord_offsetA = 0;
1317     j_coord_offsetB = 0;
1318     j_coord_offsetC = 0;
1319     j_coord_offsetD = 0;
1320
1321     outeriter        = 0;
1322     inneriter        = 0;
1323
1324     /* Start outer loop over neighborlists */
1325     for(iidx=0; iidx<nri; iidx++)
1326     {
1327         /* Load shift vector for this list */
1328         i_shift_offset   = DIM*shiftidx[iidx];
1329         shX              = shiftvec[i_shift_offset+XX];
1330         shY              = shiftvec[i_shift_offset+YY];
1331         shZ              = shiftvec[i_shift_offset+ZZ];
1332
1333         /* Load limits for loop over neighbors */
1334         j_index_start    = jindex[iidx];
1335         j_index_end      = jindex[iidx+1];
1336
1337         /* Get outer coordinate index */
1338         inr              = iinr[iidx];
1339         i_coord_offset   = DIM*inr;
1340
1341         /* Load i particle coords and add shift vector */
1342         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1343         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1344         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1345         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1346         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1347         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1348         ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1349         iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1350         iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1351
1352         fix1             = _mm_setzero_ps();
1353         fiy1             = _mm_setzero_ps();
1354         fiz1             = _mm_setzero_ps();
1355         fix2             = _mm_setzero_ps();
1356         fiy2             = _mm_setzero_ps();
1357         fiz2             = _mm_setzero_ps();
1358         fix3             = _mm_setzero_ps();
1359         fiy3             = _mm_setzero_ps();
1360         fiz3             = _mm_setzero_ps();
1361
1362         /* Start inner kernel loop */
1363         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1364         {
1365
1366             /* Get j neighbor index, and coordinate index */
1367             jnrA             = jjnr[jidx];
1368             jnrB             = jjnr[jidx+1];
1369             jnrC             = jjnr[jidx+2];
1370             jnrD             = jjnr[jidx+3];
1371
1372             j_coord_offsetA  = DIM*jnrA;
1373             j_coord_offsetB  = DIM*jnrB;
1374             j_coord_offsetC  = DIM*jnrC;
1375             j_coord_offsetD  = DIM*jnrD;
1376
1377             /* load j atom coordinates */
1378             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1379                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1380                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1381
1382             /* Calculate displacement vector */
1383             dx11             = _mm_sub_ps(ix1,jx1);
1384             dy11             = _mm_sub_ps(iy1,jy1);
1385             dz11             = _mm_sub_ps(iz1,jz1);
1386             dx12             = _mm_sub_ps(ix1,jx2);
1387             dy12             = _mm_sub_ps(iy1,jy2);
1388             dz12             = _mm_sub_ps(iz1,jz2);
1389             dx13             = _mm_sub_ps(ix1,jx3);
1390             dy13             = _mm_sub_ps(iy1,jy3);
1391             dz13             = _mm_sub_ps(iz1,jz3);
1392             dx21             = _mm_sub_ps(ix2,jx1);
1393             dy21             = _mm_sub_ps(iy2,jy1);
1394             dz21             = _mm_sub_ps(iz2,jz1);
1395             dx22             = _mm_sub_ps(ix2,jx2);
1396             dy22             = _mm_sub_ps(iy2,jy2);
1397             dz22             = _mm_sub_ps(iz2,jz2);
1398             dx23             = _mm_sub_ps(ix2,jx3);
1399             dy23             = _mm_sub_ps(iy2,jy3);
1400             dz23             = _mm_sub_ps(iz2,jz3);
1401             dx31             = _mm_sub_ps(ix3,jx1);
1402             dy31             = _mm_sub_ps(iy3,jy1);
1403             dz31             = _mm_sub_ps(iz3,jz1);
1404             dx32             = _mm_sub_ps(ix3,jx2);
1405             dy32             = _mm_sub_ps(iy3,jy2);
1406             dz32             = _mm_sub_ps(iz3,jz2);
1407             dx33             = _mm_sub_ps(ix3,jx3);
1408             dy33             = _mm_sub_ps(iy3,jy3);
1409             dz33             = _mm_sub_ps(iz3,jz3);
1410
1411             /* Calculate squared distance and things based on it */
1412             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1413             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1414             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1415             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1416             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1417             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1418             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1419             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1420             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1421
1422             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1423             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1424             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1425             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1426             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1427             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1428             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1429             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1430             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1431
1432             fjx1             = _mm_setzero_ps();
1433             fjy1             = _mm_setzero_ps();
1434             fjz1             = _mm_setzero_ps();
1435             fjx2             = _mm_setzero_ps();
1436             fjy2             = _mm_setzero_ps();
1437             fjz2             = _mm_setzero_ps();
1438             fjx3             = _mm_setzero_ps();
1439             fjy3             = _mm_setzero_ps();
1440             fjz3             = _mm_setzero_ps();
1441
1442             /**************************
1443              * CALCULATE INTERACTIONS *
1444              **************************/
1445
1446             r11              = _mm_mul_ps(rsq11,rinv11);
1447
1448             /* Calculate table index by multiplying r with table scale and truncate to integer */
1449             rt               = _mm_mul_ps(r11,vftabscale);
1450             vfitab           = _mm_cvttps_epi32(rt);
1451             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1452             vfitab           = _mm_slli_epi32(vfitab,2);
1453
1454             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1455             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1456             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1457             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1458             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1459             _MM_TRANSPOSE4_PS(Y,F,G,H);
1460             Heps             = _mm_mul_ps(vfeps,H);
1461             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1462             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1463             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1464
1465             fscal            = felec;
1466
1467             /* Calculate temporary vectorial force */
1468             tx               = _mm_mul_ps(fscal,dx11);
1469             ty               = _mm_mul_ps(fscal,dy11);
1470             tz               = _mm_mul_ps(fscal,dz11);
1471
1472             /* Update vectorial force */
1473             fix1             = _mm_add_ps(fix1,tx);
1474             fiy1             = _mm_add_ps(fiy1,ty);
1475             fiz1             = _mm_add_ps(fiz1,tz);
1476
1477             fjx1             = _mm_add_ps(fjx1,tx);
1478             fjy1             = _mm_add_ps(fjy1,ty);
1479             fjz1             = _mm_add_ps(fjz1,tz);
1480
1481             /**************************
1482              * CALCULATE INTERACTIONS *
1483              **************************/
1484
1485             r12              = _mm_mul_ps(rsq12,rinv12);
1486
1487             /* Calculate table index by multiplying r with table scale and truncate to integer */
1488             rt               = _mm_mul_ps(r12,vftabscale);
1489             vfitab           = _mm_cvttps_epi32(rt);
1490             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1491             vfitab           = _mm_slli_epi32(vfitab,2);
1492
1493             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1494             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1495             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1496             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1497             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1498             _MM_TRANSPOSE4_PS(Y,F,G,H);
1499             Heps             = _mm_mul_ps(vfeps,H);
1500             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1501             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1502             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1503
1504             fscal            = felec;
1505
1506             /* Calculate temporary vectorial force */
1507             tx               = _mm_mul_ps(fscal,dx12);
1508             ty               = _mm_mul_ps(fscal,dy12);
1509             tz               = _mm_mul_ps(fscal,dz12);
1510
1511             /* Update vectorial force */
1512             fix1             = _mm_add_ps(fix1,tx);
1513             fiy1             = _mm_add_ps(fiy1,ty);
1514             fiz1             = _mm_add_ps(fiz1,tz);
1515
1516             fjx2             = _mm_add_ps(fjx2,tx);
1517             fjy2             = _mm_add_ps(fjy2,ty);
1518             fjz2             = _mm_add_ps(fjz2,tz);
1519
1520             /**************************
1521              * CALCULATE INTERACTIONS *
1522              **************************/
1523
1524             r13              = _mm_mul_ps(rsq13,rinv13);
1525
1526             /* Calculate table index by multiplying r with table scale and truncate to integer */
1527             rt               = _mm_mul_ps(r13,vftabscale);
1528             vfitab           = _mm_cvttps_epi32(rt);
1529             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1530             vfitab           = _mm_slli_epi32(vfitab,2);
1531
1532             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1533             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1534             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1535             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1536             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1537             _MM_TRANSPOSE4_PS(Y,F,G,H);
1538             Heps             = _mm_mul_ps(vfeps,H);
1539             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1540             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1541             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1542
1543             fscal            = felec;
1544
1545             /* Calculate temporary vectorial force */
1546             tx               = _mm_mul_ps(fscal,dx13);
1547             ty               = _mm_mul_ps(fscal,dy13);
1548             tz               = _mm_mul_ps(fscal,dz13);
1549
1550             /* Update vectorial force */
1551             fix1             = _mm_add_ps(fix1,tx);
1552             fiy1             = _mm_add_ps(fiy1,ty);
1553             fiz1             = _mm_add_ps(fiz1,tz);
1554
1555             fjx3             = _mm_add_ps(fjx3,tx);
1556             fjy3             = _mm_add_ps(fjy3,ty);
1557             fjz3             = _mm_add_ps(fjz3,tz);
1558
1559             /**************************
1560              * CALCULATE INTERACTIONS *
1561              **************************/
1562
1563             r21              = _mm_mul_ps(rsq21,rinv21);
1564
1565             /* Calculate table index by multiplying r with table scale and truncate to integer */
1566             rt               = _mm_mul_ps(r21,vftabscale);
1567             vfitab           = _mm_cvttps_epi32(rt);
1568             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1569             vfitab           = _mm_slli_epi32(vfitab,2);
1570
1571             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1572             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1573             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1574             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1575             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1576             _MM_TRANSPOSE4_PS(Y,F,G,H);
1577             Heps             = _mm_mul_ps(vfeps,H);
1578             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1579             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1580             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1581
1582             fscal            = felec;
1583
1584             /* Calculate temporary vectorial force */
1585             tx               = _mm_mul_ps(fscal,dx21);
1586             ty               = _mm_mul_ps(fscal,dy21);
1587             tz               = _mm_mul_ps(fscal,dz21);
1588
1589             /* Update vectorial force */
1590             fix2             = _mm_add_ps(fix2,tx);
1591             fiy2             = _mm_add_ps(fiy2,ty);
1592             fiz2             = _mm_add_ps(fiz2,tz);
1593
1594             fjx1             = _mm_add_ps(fjx1,tx);
1595             fjy1             = _mm_add_ps(fjy1,ty);
1596             fjz1             = _mm_add_ps(fjz1,tz);
1597
1598             /**************************
1599              * CALCULATE INTERACTIONS *
1600              **************************/
1601
1602             r22              = _mm_mul_ps(rsq22,rinv22);
1603
1604             /* Calculate table index by multiplying r with table scale and truncate to integer */
1605             rt               = _mm_mul_ps(r22,vftabscale);
1606             vfitab           = _mm_cvttps_epi32(rt);
1607             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1608             vfitab           = _mm_slli_epi32(vfitab,2);
1609
1610             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1611             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1612             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1613             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1614             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1615             _MM_TRANSPOSE4_PS(Y,F,G,H);
1616             Heps             = _mm_mul_ps(vfeps,H);
1617             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1618             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1619             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1620
1621             fscal            = felec;
1622
1623             /* Calculate temporary vectorial force */
1624             tx               = _mm_mul_ps(fscal,dx22);
1625             ty               = _mm_mul_ps(fscal,dy22);
1626             tz               = _mm_mul_ps(fscal,dz22);
1627
1628             /* Update vectorial force */
1629             fix2             = _mm_add_ps(fix2,tx);
1630             fiy2             = _mm_add_ps(fiy2,ty);
1631             fiz2             = _mm_add_ps(fiz2,tz);
1632
1633             fjx2             = _mm_add_ps(fjx2,tx);
1634             fjy2             = _mm_add_ps(fjy2,ty);
1635             fjz2             = _mm_add_ps(fjz2,tz);
1636
1637             /**************************
1638              * CALCULATE INTERACTIONS *
1639              **************************/
1640
1641             r23              = _mm_mul_ps(rsq23,rinv23);
1642
1643             /* Calculate table index by multiplying r with table scale and truncate to integer */
1644             rt               = _mm_mul_ps(r23,vftabscale);
1645             vfitab           = _mm_cvttps_epi32(rt);
1646             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1647             vfitab           = _mm_slli_epi32(vfitab,2);
1648
1649             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1650             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1651             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1652             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1653             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1654             _MM_TRANSPOSE4_PS(Y,F,G,H);
1655             Heps             = _mm_mul_ps(vfeps,H);
1656             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1657             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1658             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1659
1660             fscal            = felec;
1661
1662             /* Calculate temporary vectorial force */
1663             tx               = _mm_mul_ps(fscal,dx23);
1664             ty               = _mm_mul_ps(fscal,dy23);
1665             tz               = _mm_mul_ps(fscal,dz23);
1666
1667             /* Update vectorial force */
1668             fix2             = _mm_add_ps(fix2,tx);
1669             fiy2             = _mm_add_ps(fiy2,ty);
1670             fiz2             = _mm_add_ps(fiz2,tz);
1671
1672             fjx3             = _mm_add_ps(fjx3,tx);
1673             fjy3             = _mm_add_ps(fjy3,ty);
1674             fjz3             = _mm_add_ps(fjz3,tz);
1675
1676             /**************************
1677              * CALCULATE INTERACTIONS *
1678              **************************/
1679
1680             r31              = _mm_mul_ps(rsq31,rinv31);
1681
1682             /* Calculate table index by multiplying r with table scale and truncate to integer */
1683             rt               = _mm_mul_ps(r31,vftabscale);
1684             vfitab           = _mm_cvttps_epi32(rt);
1685             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1686             vfitab           = _mm_slli_epi32(vfitab,2);
1687
1688             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1689             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1690             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1691             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1692             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1693             _MM_TRANSPOSE4_PS(Y,F,G,H);
1694             Heps             = _mm_mul_ps(vfeps,H);
1695             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1696             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1697             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1698
1699             fscal            = felec;
1700
1701             /* Calculate temporary vectorial force */
1702             tx               = _mm_mul_ps(fscal,dx31);
1703             ty               = _mm_mul_ps(fscal,dy31);
1704             tz               = _mm_mul_ps(fscal,dz31);
1705
1706             /* Update vectorial force */
1707             fix3             = _mm_add_ps(fix3,tx);
1708             fiy3             = _mm_add_ps(fiy3,ty);
1709             fiz3             = _mm_add_ps(fiz3,tz);
1710
1711             fjx1             = _mm_add_ps(fjx1,tx);
1712             fjy1             = _mm_add_ps(fjy1,ty);
1713             fjz1             = _mm_add_ps(fjz1,tz);
1714
1715             /**************************
1716              * CALCULATE INTERACTIONS *
1717              **************************/
1718
1719             r32              = _mm_mul_ps(rsq32,rinv32);
1720
1721             /* Calculate table index by multiplying r with table scale and truncate to integer */
1722             rt               = _mm_mul_ps(r32,vftabscale);
1723             vfitab           = _mm_cvttps_epi32(rt);
1724             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1725             vfitab           = _mm_slli_epi32(vfitab,2);
1726
1727             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1728             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1729             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1730             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1731             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1732             _MM_TRANSPOSE4_PS(Y,F,G,H);
1733             Heps             = _mm_mul_ps(vfeps,H);
1734             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1735             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1736             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1737
1738             fscal            = felec;
1739
1740             /* Calculate temporary vectorial force */
1741             tx               = _mm_mul_ps(fscal,dx32);
1742             ty               = _mm_mul_ps(fscal,dy32);
1743             tz               = _mm_mul_ps(fscal,dz32);
1744
1745             /* Update vectorial force */
1746             fix3             = _mm_add_ps(fix3,tx);
1747             fiy3             = _mm_add_ps(fiy3,ty);
1748             fiz3             = _mm_add_ps(fiz3,tz);
1749
1750             fjx2             = _mm_add_ps(fjx2,tx);
1751             fjy2             = _mm_add_ps(fjy2,ty);
1752             fjz2             = _mm_add_ps(fjz2,tz);
1753
1754             /**************************
1755              * CALCULATE INTERACTIONS *
1756              **************************/
1757
1758             r33              = _mm_mul_ps(rsq33,rinv33);
1759
1760             /* Calculate table index by multiplying r with table scale and truncate to integer */
1761             rt               = _mm_mul_ps(r33,vftabscale);
1762             vfitab           = _mm_cvttps_epi32(rt);
1763             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1764             vfitab           = _mm_slli_epi32(vfitab,2);
1765
1766             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1767             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1768             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1769             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1770             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1771             _MM_TRANSPOSE4_PS(Y,F,G,H);
1772             Heps             = _mm_mul_ps(vfeps,H);
1773             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1774             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1775             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1776
1777             fscal            = felec;
1778
1779             /* Calculate temporary vectorial force */
1780             tx               = _mm_mul_ps(fscal,dx33);
1781             ty               = _mm_mul_ps(fscal,dy33);
1782             tz               = _mm_mul_ps(fscal,dz33);
1783
1784             /* Update vectorial force */
1785             fix3             = _mm_add_ps(fix3,tx);
1786             fiy3             = _mm_add_ps(fiy3,ty);
1787             fiz3             = _mm_add_ps(fiz3,tz);
1788
1789             fjx3             = _mm_add_ps(fjx3,tx);
1790             fjy3             = _mm_add_ps(fjy3,ty);
1791             fjz3             = _mm_add_ps(fjz3,tz);
1792
1793             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
1794                                                    f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
1795                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1796
1797             /* Inner loop uses 351 flops */
1798         }
1799
1800         if(jidx<j_index_end)
1801         {
1802
1803             /* Get j neighbor index, and coordinate index */
1804             jnrA             = jjnr[jidx];
1805             jnrB             = jjnr[jidx+1];
1806             jnrC             = jjnr[jidx+2];
1807             jnrD             = jjnr[jidx+3];
1808
1809             /* Sign of each element will be negative for non-real atoms.
1810              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1811              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1812              */
1813             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1814             jnrA       = (jnrA>=0) ? jnrA : 0;
1815             jnrB       = (jnrB>=0) ? jnrB : 0;
1816             jnrC       = (jnrC>=0) ? jnrC : 0;
1817             jnrD       = (jnrD>=0) ? jnrD : 0;
1818
1819             j_coord_offsetA  = DIM*jnrA;
1820             j_coord_offsetB  = DIM*jnrB;
1821             j_coord_offsetC  = DIM*jnrC;
1822             j_coord_offsetD  = DIM*jnrD;
1823
1824             /* load j atom coordinates */
1825             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1826                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1827                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1828
1829             /* Calculate displacement vector */
1830             dx11             = _mm_sub_ps(ix1,jx1);
1831             dy11             = _mm_sub_ps(iy1,jy1);
1832             dz11             = _mm_sub_ps(iz1,jz1);
1833             dx12             = _mm_sub_ps(ix1,jx2);
1834             dy12             = _mm_sub_ps(iy1,jy2);
1835             dz12             = _mm_sub_ps(iz1,jz2);
1836             dx13             = _mm_sub_ps(ix1,jx3);
1837             dy13             = _mm_sub_ps(iy1,jy3);
1838             dz13             = _mm_sub_ps(iz1,jz3);
1839             dx21             = _mm_sub_ps(ix2,jx1);
1840             dy21             = _mm_sub_ps(iy2,jy1);
1841             dz21             = _mm_sub_ps(iz2,jz1);
1842             dx22             = _mm_sub_ps(ix2,jx2);
1843             dy22             = _mm_sub_ps(iy2,jy2);
1844             dz22             = _mm_sub_ps(iz2,jz2);
1845             dx23             = _mm_sub_ps(ix2,jx3);
1846             dy23             = _mm_sub_ps(iy2,jy3);
1847             dz23             = _mm_sub_ps(iz2,jz3);
1848             dx31             = _mm_sub_ps(ix3,jx1);
1849             dy31             = _mm_sub_ps(iy3,jy1);
1850             dz31             = _mm_sub_ps(iz3,jz1);
1851             dx32             = _mm_sub_ps(ix3,jx2);
1852             dy32             = _mm_sub_ps(iy3,jy2);
1853             dz32             = _mm_sub_ps(iz3,jz2);
1854             dx33             = _mm_sub_ps(ix3,jx3);
1855             dy33             = _mm_sub_ps(iy3,jy3);
1856             dz33             = _mm_sub_ps(iz3,jz3);
1857
1858             /* Calculate squared distance and things based on it */
1859             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1860             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1861             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1862             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1863             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1864             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1865             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1866             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1867             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1868
1869             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1870             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1871             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1872             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1873             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1874             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1875             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1876             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1877             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1878
1879             fjx1             = _mm_setzero_ps();
1880             fjy1             = _mm_setzero_ps();
1881             fjz1             = _mm_setzero_ps();
1882             fjx2             = _mm_setzero_ps();
1883             fjy2             = _mm_setzero_ps();
1884             fjz2             = _mm_setzero_ps();
1885             fjx3             = _mm_setzero_ps();
1886             fjy3             = _mm_setzero_ps();
1887             fjz3             = _mm_setzero_ps();
1888
1889             /**************************
1890              * CALCULATE INTERACTIONS *
1891              **************************/
1892
1893             r11              = _mm_mul_ps(rsq11,rinv11);
1894             r11              = _mm_andnot_ps(dummy_mask,r11);
1895
1896             /* Calculate table index by multiplying r with table scale and truncate to integer */
1897             rt               = _mm_mul_ps(r11,vftabscale);
1898             vfitab           = _mm_cvttps_epi32(rt);
1899             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1900             vfitab           = _mm_slli_epi32(vfitab,2);
1901
1902             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1903             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1904             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1905             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1906             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1907             _MM_TRANSPOSE4_PS(Y,F,G,H);
1908             Heps             = _mm_mul_ps(vfeps,H);
1909             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1910             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1911             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1912
1913             fscal            = felec;
1914
1915             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1916
1917             /* Calculate temporary vectorial force */
1918             tx               = _mm_mul_ps(fscal,dx11);
1919             ty               = _mm_mul_ps(fscal,dy11);
1920             tz               = _mm_mul_ps(fscal,dz11);
1921
1922             /* Update vectorial force */
1923             fix1             = _mm_add_ps(fix1,tx);
1924             fiy1             = _mm_add_ps(fiy1,ty);
1925             fiz1             = _mm_add_ps(fiz1,tz);
1926
1927             fjx1             = _mm_add_ps(fjx1,tx);
1928             fjy1             = _mm_add_ps(fjy1,ty);
1929             fjz1             = _mm_add_ps(fjz1,tz);
1930
1931             /**************************
1932              * CALCULATE INTERACTIONS *
1933              **************************/
1934
1935             r12              = _mm_mul_ps(rsq12,rinv12);
1936             r12              = _mm_andnot_ps(dummy_mask,r12);
1937
1938             /* Calculate table index by multiplying r with table scale and truncate to integer */
1939             rt               = _mm_mul_ps(r12,vftabscale);
1940             vfitab           = _mm_cvttps_epi32(rt);
1941             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1942             vfitab           = _mm_slli_epi32(vfitab,2);
1943
1944             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1945             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1946             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1947             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1948             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1949             _MM_TRANSPOSE4_PS(Y,F,G,H);
1950             Heps             = _mm_mul_ps(vfeps,H);
1951             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1952             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1953             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1954
1955             fscal            = felec;
1956
1957             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1958
1959             /* Calculate temporary vectorial force */
1960             tx               = _mm_mul_ps(fscal,dx12);
1961             ty               = _mm_mul_ps(fscal,dy12);
1962             tz               = _mm_mul_ps(fscal,dz12);
1963
1964             /* Update vectorial force */
1965             fix1             = _mm_add_ps(fix1,tx);
1966             fiy1             = _mm_add_ps(fiy1,ty);
1967             fiz1             = _mm_add_ps(fiz1,tz);
1968
1969             fjx2             = _mm_add_ps(fjx2,tx);
1970             fjy2             = _mm_add_ps(fjy2,ty);
1971             fjz2             = _mm_add_ps(fjz2,tz);
1972
1973             /**************************
1974              * CALCULATE INTERACTIONS *
1975              **************************/
1976
1977             r13              = _mm_mul_ps(rsq13,rinv13);
1978             r13              = _mm_andnot_ps(dummy_mask,r13);
1979
1980             /* Calculate table index by multiplying r with table scale and truncate to integer */
1981             rt               = _mm_mul_ps(r13,vftabscale);
1982             vfitab           = _mm_cvttps_epi32(rt);
1983             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1984             vfitab           = _mm_slli_epi32(vfitab,2);
1985
1986             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1987             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1988             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1989             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1990             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1991             _MM_TRANSPOSE4_PS(Y,F,G,H);
1992             Heps             = _mm_mul_ps(vfeps,H);
1993             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1994             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1995             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1996
1997             fscal            = felec;
1998
1999             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2000
2001             /* Calculate temporary vectorial force */
2002             tx               = _mm_mul_ps(fscal,dx13);
2003             ty               = _mm_mul_ps(fscal,dy13);
2004             tz               = _mm_mul_ps(fscal,dz13);
2005
2006             /* Update vectorial force */
2007             fix1             = _mm_add_ps(fix1,tx);
2008             fiy1             = _mm_add_ps(fiy1,ty);
2009             fiz1             = _mm_add_ps(fiz1,tz);
2010
2011             fjx3             = _mm_add_ps(fjx3,tx);
2012             fjy3             = _mm_add_ps(fjy3,ty);
2013             fjz3             = _mm_add_ps(fjz3,tz);
2014
2015             /**************************
2016              * CALCULATE INTERACTIONS *
2017              **************************/
2018
2019             r21              = _mm_mul_ps(rsq21,rinv21);
2020             r21              = _mm_andnot_ps(dummy_mask,r21);
2021
2022             /* Calculate table index by multiplying r with table scale and truncate to integer */
2023             rt               = _mm_mul_ps(r21,vftabscale);
2024             vfitab           = _mm_cvttps_epi32(rt);
2025             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2026             vfitab           = _mm_slli_epi32(vfitab,2);
2027
2028             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2029             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2030             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2031             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2032             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2033             _MM_TRANSPOSE4_PS(Y,F,G,H);
2034             Heps             = _mm_mul_ps(vfeps,H);
2035             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2036             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2037             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2038
2039             fscal            = felec;
2040
2041             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2042
2043             /* Calculate temporary vectorial force */
2044             tx               = _mm_mul_ps(fscal,dx21);
2045             ty               = _mm_mul_ps(fscal,dy21);
2046             tz               = _mm_mul_ps(fscal,dz21);
2047
2048             /* Update vectorial force */
2049             fix2             = _mm_add_ps(fix2,tx);
2050             fiy2             = _mm_add_ps(fiy2,ty);
2051             fiz2             = _mm_add_ps(fiz2,tz);
2052
2053             fjx1             = _mm_add_ps(fjx1,tx);
2054             fjy1             = _mm_add_ps(fjy1,ty);
2055             fjz1             = _mm_add_ps(fjz1,tz);
2056
2057             /**************************
2058              * CALCULATE INTERACTIONS *
2059              **************************/
2060
2061             r22              = _mm_mul_ps(rsq22,rinv22);
2062             r22              = _mm_andnot_ps(dummy_mask,r22);
2063
2064             /* Calculate table index by multiplying r with table scale and truncate to integer */
2065             rt               = _mm_mul_ps(r22,vftabscale);
2066             vfitab           = _mm_cvttps_epi32(rt);
2067             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2068             vfitab           = _mm_slli_epi32(vfitab,2);
2069
2070             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2071             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2072             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2073             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2074             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2075             _MM_TRANSPOSE4_PS(Y,F,G,H);
2076             Heps             = _mm_mul_ps(vfeps,H);
2077             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2078             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2079             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2080
2081             fscal            = felec;
2082
2083             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2084
2085             /* Calculate temporary vectorial force */
2086             tx               = _mm_mul_ps(fscal,dx22);
2087             ty               = _mm_mul_ps(fscal,dy22);
2088             tz               = _mm_mul_ps(fscal,dz22);
2089
2090             /* Update vectorial force */
2091             fix2             = _mm_add_ps(fix2,tx);
2092             fiy2             = _mm_add_ps(fiy2,ty);
2093             fiz2             = _mm_add_ps(fiz2,tz);
2094
2095             fjx2             = _mm_add_ps(fjx2,tx);
2096             fjy2             = _mm_add_ps(fjy2,ty);
2097             fjz2             = _mm_add_ps(fjz2,tz);
2098
2099             /**************************
2100              * CALCULATE INTERACTIONS *
2101              **************************/
2102
2103             r23              = _mm_mul_ps(rsq23,rinv23);
2104             r23              = _mm_andnot_ps(dummy_mask,r23);
2105
2106             /* Calculate table index by multiplying r with table scale and truncate to integer */
2107             rt               = _mm_mul_ps(r23,vftabscale);
2108             vfitab           = _mm_cvttps_epi32(rt);
2109             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2110             vfitab           = _mm_slli_epi32(vfitab,2);
2111
2112             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2113             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2114             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2115             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2116             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2117             _MM_TRANSPOSE4_PS(Y,F,G,H);
2118             Heps             = _mm_mul_ps(vfeps,H);
2119             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2120             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2121             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2122
2123             fscal            = felec;
2124
2125             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2126
2127             /* Calculate temporary vectorial force */
2128             tx               = _mm_mul_ps(fscal,dx23);
2129             ty               = _mm_mul_ps(fscal,dy23);
2130             tz               = _mm_mul_ps(fscal,dz23);
2131
2132             /* Update vectorial force */
2133             fix2             = _mm_add_ps(fix2,tx);
2134             fiy2             = _mm_add_ps(fiy2,ty);
2135             fiz2             = _mm_add_ps(fiz2,tz);
2136
2137             fjx3             = _mm_add_ps(fjx3,tx);
2138             fjy3             = _mm_add_ps(fjy3,ty);
2139             fjz3             = _mm_add_ps(fjz3,tz);
2140
2141             /**************************
2142              * CALCULATE INTERACTIONS *
2143              **************************/
2144
2145             r31              = _mm_mul_ps(rsq31,rinv31);
2146             r31              = _mm_andnot_ps(dummy_mask,r31);
2147
2148             /* Calculate table index by multiplying r with table scale and truncate to integer */
2149             rt               = _mm_mul_ps(r31,vftabscale);
2150             vfitab           = _mm_cvttps_epi32(rt);
2151             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2152             vfitab           = _mm_slli_epi32(vfitab,2);
2153
2154             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2155             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2156             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2157             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2158             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2159             _MM_TRANSPOSE4_PS(Y,F,G,H);
2160             Heps             = _mm_mul_ps(vfeps,H);
2161             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2162             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2163             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2164
2165             fscal            = felec;
2166
2167             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2168
2169             /* Calculate temporary vectorial force */
2170             tx               = _mm_mul_ps(fscal,dx31);
2171             ty               = _mm_mul_ps(fscal,dy31);
2172             tz               = _mm_mul_ps(fscal,dz31);
2173
2174             /* Update vectorial force */
2175             fix3             = _mm_add_ps(fix3,tx);
2176             fiy3             = _mm_add_ps(fiy3,ty);
2177             fiz3             = _mm_add_ps(fiz3,tz);
2178
2179             fjx1             = _mm_add_ps(fjx1,tx);
2180             fjy1             = _mm_add_ps(fjy1,ty);
2181             fjz1             = _mm_add_ps(fjz1,tz);
2182
2183             /**************************
2184              * CALCULATE INTERACTIONS *
2185              **************************/
2186
2187             r32              = _mm_mul_ps(rsq32,rinv32);
2188             r32              = _mm_andnot_ps(dummy_mask,r32);
2189
2190             /* Calculate table index by multiplying r with table scale and truncate to integer */
2191             rt               = _mm_mul_ps(r32,vftabscale);
2192             vfitab           = _mm_cvttps_epi32(rt);
2193             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2194             vfitab           = _mm_slli_epi32(vfitab,2);
2195
2196             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2197             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2198             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2199             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2200             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2201             _MM_TRANSPOSE4_PS(Y,F,G,H);
2202             Heps             = _mm_mul_ps(vfeps,H);
2203             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2204             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2205             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2206
2207             fscal            = felec;
2208
2209             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2210
2211             /* Calculate temporary vectorial force */
2212             tx               = _mm_mul_ps(fscal,dx32);
2213             ty               = _mm_mul_ps(fscal,dy32);
2214             tz               = _mm_mul_ps(fscal,dz32);
2215
2216             /* Update vectorial force */
2217             fix3             = _mm_add_ps(fix3,tx);
2218             fiy3             = _mm_add_ps(fiy3,ty);
2219             fiz3             = _mm_add_ps(fiz3,tz);
2220
2221             fjx2             = _mm_add_ps(fjx2,tx);
2222             fjy2             = _mm_add_ps(fjy2,ty);
2223             fjz2             = _mm_add_ps(fjz2,tz);
2224
2225             /**************************
2226              * CALCULATE INTERACTIONS *
2227              **************************/
2228
2229             r33              = _mm_mul_ps(rsq33,rinv33);
2230             r33              = _mm_andnot_ps(dummy_mask,r33);
2231
2232             /* Calculate table index by multiplying r with table scale and truncate to integer */
2233             rt               = _mm_mul_ps(r33,vftabscale);
2234             vfitab           = _mm_cvttps_epi32(rt);
2235             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2236             vfitab           = _mm_slli_epi32(vfitab,2);
2237
2238             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2239             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2240             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2241             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2242             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2243             _MM_TRANSPOSE4_PS(Y,F,G,H);
2244             Heps             = _mm_mul_ps(vfeps,H);
2245             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2246             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2247             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2248
2249             fscal            = felec;
2250
2251             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2252
2253             /* Calculate temporary vectorial force */
2254             tx               = _mm_mul_ps(fscal,dx33);
2255             ty               = _mm_mul_ps(fscal,dy33);
2256             tz               = _mm_mul_ps(fscal,dz33);
2257
2258             /* Update vectorial force */
2259             fix3             = _mm_add_ps(fix3,tx);
2260             fiy3             = _mm_add_ps(fiy3,ty);
2261             fiz3             = _mm_add_ps(fiz3,tz);
2262
2263             fjx3             = _mm_add_ps(fjx3,tx);
2264             fjy3             = _mm_add_ps(fjy3,ty);
2265             fjz3             = _mm_add_ps(fjz3,tz);
2266
2267             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
2268                                                    f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
2269                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2270
2271             /* Inner loop uses 360 flops */
2272         }
2273
2274         /* End of innermost loop */
2275
2276         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2277                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
2278
2279         /* Increment number of inner iterations */
2280         inneriter                  += j_index_end - j_index_start;
2281
2282         /* Outer loop uses 27 flops */
2283     }
2284
2285     /* Increment number of outer iterations */
2286     outeriter        += nri;
2287
2288     /* Update outer/inner flops */
2289
2290     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*360);
2291 }