Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecCSTab_VdwNone_GeomW4W4_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_sse4_1_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse4_1_single
51  * Electrostatics interaction: CubicSplineTable
52  * VdW interaction:            None
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse4_1_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset1;
83     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84     int              vdwioffset2;
85     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86     int              vdwioffset3;
87     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
89     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
90     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
91     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
92     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
93     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
94     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
95     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
96     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
97     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
98     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
99     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
100     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
101     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
102     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
103     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
104     real             *charge;
105     __m128i          vfitab;
106     __m128i          ifour       = _mm_set1_epi32(4);
107     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
108     real             *vftab;
109     __m128           dummy_mask,cutoff_mask;
110     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
111     __m128           one     = _mm_set1_ps(1.0);
112     __m128           two     = _mm_set1_ps(2.0);
113     x                = xx[0];
114     f                = ff[0];
115
116     nri              = nlist->nri;
117     iinr             = nlist->iinr;
118     jindex           = nlist->jindex;
119     jjnr             = nlist->jjnr;
120     shiftidx         = nlist->shift;
121     gid              = nlist->gid;
122     shiftvec         = fr->shift_vec[0];
123     fshift           = fr->fshift[0];
124     facel            = _mm_set1_ps(fr->ic->epsfac);
125     charge           = mdatoms->chargeA;
126
127     vftab            = kernel_data->table_elec->data;
128     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
129
130     /* Setup water-specific parameters */
131     inr              = nlist->iinr[0];
132     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
133     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
134     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
135
136     jq1              = _mm_set1_ps(charge[inr+1]);
137     jq2              = _mm_set1_ps(charge[inr+2]);
138     jq3              = _mm_set1_ps(charge[inr+3]);
139     qq11             = _mm_mul_ps(iq1,jq1);
140     qq12             = _mm_mul_ps(iq1,jq2);
141     qq13             = _mm_mul_ps(iq1,jq3);
142     qq21             = _mm_mul_ps(iq2,jq1);
143     qq22             = _mm_mul_ps(iq2,jq2);
144     qq23             = _mm_mul_ps(iq2,jq3);
145     qq31             = _mm_mul_ps(iq3,jq1);
146     qq32             = _mm_mul_ps(iq3,jq2);
147     qq33             = _mm_mul_ps(iq3,jq3);
148
149     /* Avoid stupid compiler warnings */
150     jnrA = jnrB = jnrC = jnrD = 0;
151     j_coord_offsetA = 0;
152     j_coord_offsetB = 0;
153     j_coord_offsetC = 0;
154     j_coord_offsetD = 0;
155
156     outeriter        = 0;
157     inneriter        = 0;
158
159     for(iidx=0;iidx<4*DIM;iidx++)
160     {
161         scratch[iidx] = 0.0;
162     }
163
164     /* Start outer loop over neighborlists */
165     for(iidx=0; iidx<nri; iidx++)
166     {
167         /* Load shift vector for this list */
168         i_shift_offset   = DIM*shiftidx[iidx];
169
170         /* Load limits for loop over neighbors */
171         j_index_start    = jindex[iidx];
172         j_index_end      = jindex[iidx+1];
173
174         /* Get outer coordinate index */
175         inr              = iinr[iidx];
176         i_coord_offset   = DIM*inr;
177
178         /* Load i particle coords and add shift vector */
179         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
180                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
181
182         fix1             = _mm_setzero_ps();
183         fiy1             = _mm_setzero_ps();
184         fiz1             = _mm_setzero_ps();
185         fix2             = _mm_setzero_ps();
186         fiy2             = _mm_setzero_ps();
187         fiz2             = _mm_setzero_ps();
188         fix3             = _mm_setzero_ps();
189         fiy3             = _mm_setzero_ps();
190         fiz3             = _mm_setzero_ps();
191
192         /* Reset potential sums */
193         velecsum         = _mm_setzero_ps();
194
195         /* Start inner kernel loop */
196         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
197         {
198
199             /* Get j neighbor index, and coordinate index */
200             jnrA             = jjnr[jidx];
201             jnrB             = jjnr[jidx+1];
202             jnrC             = jjnr[jidx+2];
203             jnrD             = jjnr[jidx+3];
204             j_coord_offsetA  = DIM*jnrA;
205             j_coord_offsetB  = DIM*jnrB;
206             j_coord_offsetC  = DIM*jnrC;
207             j_coord_offsetD  = DIM*jnrD;
208
209             /* load j atom coordinates */
210             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
211                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
212                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
213
214             /* Calculate displacement vector */
215             dx11             = _mm_sub_ps(ix1,jx1);
216             dy11             = _mm_sub_ps(iy1,jy1);
217             dz11             = _mm_sub_ps(iz1,jz1);
218             dx12             = _mm_sub_ps(ix1,jx2);
219             dy12             = _mm_sub_ps(iy1,jy2);
220             dz12             = _mm_sub_ps(iz1,jz2);
221             dx13             = _mm_sub_ps(ix1,jx3);
222             dy13             = _mm_sub_ps(iy1,jy3);
223             dz13             = _mm_sub_ps(iz1,jz3);
224             dx21             = _mm_sub_ps(ix2,jx1);
225             dy21             = _mm_sub_ps(iy2,jy1);
226             dz21             = _mm_sub_ps(iz2,jz1);
227             dx22             = _mm_sub_ps(ix2,jx2);
228             dy22             = _mm_sub_ps(iy2,jy2);
229             dz22             = _mm_sub_ps(iz2,jz2);
230             dx23             = _mm_sub_ps(ix2,jx3);
231             dy23             = _mm_sub_ps(iy2,jy3);
232             dz23             = _mm_sub_ps(iz2,jz3);
233             dx31             = _mm_sub_ps(ix3,jx1);
234             dy31             = _mm_sub_ps(iy3,jy1);
235             dz31             = _mm_sub_ps(iz3,jz1);
236             dx32             = _mm_sub_ps(ix3,jx2);
237             dy32             = _mm_sub_ps(iy3,jy2);
238             dz32             = _mm_sub_ps(iz3,jz2);
239             dx33             = _mm_sub_ps(ix3,jx3);
240             dy33             = _mm_sub_ps(iy3,jy3);
241             dz33             = _mm_sub_ps(iz3,jz3);
242
243             /* Calculate squared distance and things based on it */
244             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
245             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
246             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
247             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
248             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
249             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
250             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
251             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
252             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
253
254             rinv11           = sse41_invsqrt_f(rsq11);
255             rinv12           = sse41_invsqrt_f(rsq12);
256             rinv13           = sse41_invsqrt_f(rsq13);
257             rinv21           = sse41_invsqrt_f(rsq21);
258             rinv22           = sse41_invsqrt_f(rsq22);
259             rinv23           = sse41_invsqrt_f(rsq23);
260             rinv31           = sse41_invsqrt_f(rsq31);
261             rinv32           = sse41_invsqrt_f(rsq32);
262             rinv33           = sse41_invsqrt_f(rsq33);
263
264             fjx1             = _mm_setzero_ps();
265             fjy1             = _mm_setzero_ps();
266             fjz1             = _mm_setzero_ps();
267             fjx2             = _mm_setzero_ps();
268             fjy2             = _mm_setzero_ps();
269             fjz2             = _mm_setzero_ps();
270             fjx3             = _mm_setzero_ps();
271             fjy3             = _mm_setzero_ps();
272             fjz3             = _mm_setzero_ps();
273
274             /**************************
275              * CALCULATE INTERACTIONS *
276              **************************/
277
278             r11              = _mm_mul_ps(rsq11,rinv11);
279
280             /* Calculate table index by multiplying r with table scale and truncate to integer */
281             rt               = _mm_mul_ps(r11,vftabscale);
282             vfitab           = _mm_cvttps_epi32(rt);
283             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
284             vfitab           = _mm_slli_epi32(vfitab,2);
285
286             /* CUBIC SPLINE TABLE ELECTROSTATICS */
287             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
288             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
289             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
290             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
291             _MM_TRANSPOSE4_PS(Y,F,G,H);
292             Heps             = _mm_mul_ps(vfeps,H);
293             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
294             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
295             velec            = _mm_mul_ps(qq11,VV);
296             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
297             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
298
299             /* Update potential sum for this i atom from the interaction with this j atom. */
300             velecsum         = _mm_add_ps(velecsum,velec);
301
302             fscal            = felec;
303
304             /* Calculate temporary vectorial force */
305             tx               = _mm_mul_ps(fscal,dx11);
306             ty               = _mm_mul_ps(fscal,dy11);
307             tz               = _mm_mul_ps(fscal,dz11);
308
309             /* Update vectorial force */
310             fix1             = _mm_add_ps(fix1,tx);
311             fiy1             = _mm_add_ps(fiy1,ty);
312             fiz1             = _mm_add_ps(fiz1,tz);
313
314             fjx1             = _mm_add_ps(fjx1,tx);
315             fjy1             = _mm_add_ps(fjy1,ty);
316             fjz1             = _mm_add_ps(fjz1,tz);
317
318             /**************************
319              * CALCULATE INTERACTIONS *
320              **************************/
321
322             r12              = _mm_mul_ps(rsq12,rinv12);
323
324             /* Calculate table index by multiplying r with table scale and truncate to integer */
325             rt               = _mm_mul_ps(r12,vftabscale);
326             vfitab           = _mm_cvttps_epi32(rt);
327             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
328             vfitab           = _mm_slli_epi32(vfitab,2);
329
330             /* CUBIC SPLINE TABLE ELECTROSTATICS */
331             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
332             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
333             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
334             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
335             _MM_TRANSPOSE4_PS(Y,F,G,H);
336             Heps             = _mm_mul_ps(vfeps,H);
337             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
338             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
339             velec            = _mm_mul_ps(qq12,VV);
340             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
341             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
342
343             /* Update potential sum for this i atom from the interaction with this j atom. */
344             velecsum         = _mm_add_ps(velecsum,velec);
345
346             fscal            = felec;
347
348             /* Calculate temporary vectorial force */
349             tx               = _mm_mul_ps(fscal,dx12);
350             ty               = _mm_mul_ps(fscal,dy12);
351             tz               = _mm_mul_ps(fscal,dz12);
352
353             /* Update vectorial force */
354             fix1             = _mm_add_ps(fix1,tx);
355             fiy1             = _mm_add_ps(fiy1,ty);
356             fiz1             = _mm_add_ps(fiz1,tz);
357
358             fjx2             = _mm_add_ps(fjx2,tx);
359             fjy2             = _mm_add_ps(fjy2,ty);
360             fjz2             = _mm_add_ps(fjz2,tz);
361
362             /**************************
363              * CALCULATE INTERACTIONS *
364              **************************/
365
366             r13              = _mm_mul_ps(rsq13,rinv13);
367
368             /* Calculate table index by multiplying r with table scale and truncate to integer */
369             rt               = _mm_mul_ps(r13,vftabscale);
370             vfitab           = _mm_cvttps_epi32(rt);
371             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
372             vfitab           = _mm_slli_epi32(vfitab,2);
373
374             /* CUBIC SPLINE TABLE ELECTROSTATICS */
375             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
376             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
377             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
378             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
379             _MM_TRANSPOSE4_PS(Y,F,G,H);
380             Heps             = _mm_mul_ps(vfeps,H);
381             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
382             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
383             velec            = _mm_mul_ps(qq13,VV);
384             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
385             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
386
387             /* Update potential sum for this i atom from the interaction with this j atom. */
388             velecsum         = _mm_add_ps(velecsum,velec);
389
390             fscal            = felec;
391
392             /* Calculate temporary vectorial force */
393             tx               = _mm_mul_ps(fscal,dx13);
394             ty               = _mm_mul_ps(fscal,dy13);
395             tz               = _mm_mul_ps(fscal,dz13);
396
397             /* Update vectorial force */
398             fix1             = _mm_add_ps(fix1,tx);
399             fiy1             = _mm_add_ps(fiy1,ty);
400             fiz1             = _mm_add_ps(fiz1,tz);
401
402             fjx3             = _mm_add_ps(fjx3,tx);
403             fjy3             = _mm_add_ps(fjy3,ty);
404             fjz3             = _mm_add_ps(fjz3,tz);
405
406             /**************************
407              * CALCULATE INTERACTIONS *
408              **************************/
409
410             r21              = _mm_mul_ps(rsq21,rinv21);
411
412             /* Calculate table index by multiplying r with table scale and truncate to integer */
413             rt               = _mm_mul_ps(r21,vftabscale);
414             vfitab           = _mm_cvttps_epi32(rt);
415             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
416             vfitab           = _mm_slli_epi32(vfitab,2);
417
418             /* CUBIC SPLINE TABLE ELECTROSTATICS */
419             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
420             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
421             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
422             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
423             _MM_TRANSPOSE4_PS(Y,F,G,H);
424             Heps             = _mm_mul_ps(vfeps,H);
425             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
426             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
427             velec            = _mm_mul_ps(qq21,VV);
428             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
429             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
430
431             /* Update potential sum for this i atom from the interaction with this j atom. */
432             velecsum         = _mm_add_ps(velecsum,velec);
433
434             fscal            = felec;
435
436             /* Calculate temporary vectorial force */
437             tx               = _mm_mul_ps(fscal,dx21);
438             ty               = _mm_mul_ps(fscal,dy21);
439             tz               = _mm_mul_ps(fscal,dz21);
440
441             /* Update vectorial force */
442             fix2             = _mm_add_ps(fix2,tx);
443             fiy2             = _mm_add_ps(fiy2,ty);
444             fiz2             = _mm_add_ps(fiz2,tz);
445
446             fjx1             = _mm_add_ps(fjx1,tx);
447             fjy1             = _mm_add_ps(fjy1,ty);
448             fjz1             = _mm_add_ps(fjz1,tz);
449
450             /**************************
451              * CALCULATE INTERACTIONS *
452              **************************/
453
454             r22              = _mm_mul_ps(rsq22,rinv22);
455
456             /* Calculate table index by multiplying r with table scale and truncate to integer */
457             rt               = _mm_mul_ps(r22,vftabscale);
458             vfitab           = _mm_cvttps_epi32(rt);
459             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
460             vfitab           = _mm_slli_epi32(vfitab,2);
461
462             /* CUBIC SPLINE TABLE ELECTROSTATICS */
463             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
464             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
465             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
466             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
467             _MM_TRANSPOSE4_PS(Y,F,G,H);
468             Heps             = _mm_mul_ps(vfeps,H);
469             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
470             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
471             velec            = _mm_mul_ps(qq22,VV);
472             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
473             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
474
475             /* Update potential sum for this i atom from the interaction with this j atom. */
476             velecsum         = _mm_add_ps(velecsum,velec);
477
478             fscal            = felec;
479
480             /* Calculate temporary vectorial force */
481             tx               = _mm_mul_ps(fscal,dx22);
482             ty               = _mm_mul_ps(fscal,dy22);
483             tz               = _mm_mul_ps(fscal,dz22);
484
485             /* Update vectorial force */
486             fix2             = _mm_add_ps(fix2,tx);
487             fiy2             = _mm_add_ps(fiy2,ty);
488             fiz2             = _mm_add_ps(fiz2,tz);
489
490             fjx2             = _mm_add_ps(fjx2,tx);
491             fjy2             = _mm_add_ps(fjy2,ty);
492             fjz2             = _mm_add_ps(fjz2,tz);
493
494             /**************************
495              * CALCULATE INTERACTIONS *
496              **************************/
497
498             r23              = _mm_mul_ps(rsq23,rinv23);
499
500             /* Calculate table index by multiplying r with table scale and truncate to integer */
501             rt               = _mm_mul_ps(r23,vftabscale);
502             vfitab           = _mm_cvttps_epi32(rt);
503             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
504             vfitab           = _mm_slli_epi32(vfitab,2);
505
506             /* CUBIC SPLINE TABLE ELECTROSTATICS */
507             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
508             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
509             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
510             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
511             _MM_TRANSPOSE4_PS(Y,F,G,H);
512             Heps             = _mm_mul_ps(vfeps,H);
513             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
514             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
515             velec            = _mm_mul_ps(qq23,VV);
516             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
517             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
518
519             /* Update potential sum for this i atom from the interaction with this j atom. */
520             velecsum         = _mm_add_ps(velecsum,velec);
521
522             fscal            = felec;
523
524             /* Calculate temporary vectorial force */
525             tx               = _mm_mul_ps(fscal,dx23);
526             ty               = _mm_mul_ps(fscal,dy23);
527             tz               = _mm_mul_ps(fscal,dz23);
528
529             /* Update vectorial force */
530             fix2             = _mm_add_ps(fix2,tx);
531             fiy2             = _mm_add_ps(fiy2,ty);
532             fiz2             = _mm_add_ps(fiz2,tz);
533
534             fjx3             = _mm_add_ps(fjx3,tx);
535             fjy3             = _mm_add_ps(fjy3,ty);
536             fjz3             = _mm_add_ps(fjz3,tz);
537
538             /**************************
539              * CALCULATE INTERACTIONS *
540              **************************/
541
542             r31              = _mm_mul_ps(rsq31,rinv31);
543
544             /* Calculate table index by multiplying r with table scale and truncate to integer */
545             rt               = _mm_mul_ps(r31,vftabscale);
546             vfitab           = _mm_cvttps_epi32(rt);
547             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
548             vfitab           = _mm_slli_epi32(vfitab,2);
549
550             /* CUBIC SPLINE TABLE ELECTROSTATICS */
551             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
552             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
553             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
554             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
555             _MM_TRANSPOSE4_PS(Y,F,G,H);
556             Heps             = _mm_mul_ps(vfeps,H);
557             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
558             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
559             velec            = _mm_mul_ps(qq31,VV);
560             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
561             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
562
563             /* Update potential sum for this i atom from the interaction with this j atom. */
564             velecsum         = _mm_add_ps(velecsum,velec);
565
566             fscal            = felec;
567
568             /* Calculate temporary vectorial force */
569             tx               = _mm_mul_ps(fscal,dx31);
570             ty               = _mm_mul_ps(fscal,dy31);
571             tz               = _mm_mul_ps(fscal,dz31);
572
573             /* Update vectorial force */
574             fix3             = _mm_add_ps(fix3,tx);
575             fiy3             = _mm_add_ps(fiy3,ty);
576             fiz3             = _mm_add_ps(fiz3,tz);
577
578             fjx1             = _mm_add_ps(fjx1,tx);
579             fjy1             = _mm_add_ps(fjy1,ty);
580             fjz1             = _mm_add_ps(fjz1,tz);
581
582             /**************************
583              * CALCULATE INTERACTIONS *
584              **************************/
585
586             r32              = _mm_mul_ps(rsq32,rinv32);
587
588             /* Calculate table index by multiplying r with table scale and truncate to integer */
589             rt               = _mm_mul_ps(r32,vftabscale);
590             vfitab           = _mm_cvttps_epi32(rt);
591             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
592             vfitab           = _mm_slli_epi32(vfitab,2);
593
594             /* CUBIC SPLINE TABLE ELECTROSTATICS */
595             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
596             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
597             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
598             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
599             _MM_TRANSPOSE4_PS(Y,F,G,H);
600             Heps             = _mm_mul_ps(vfeps,H);
601             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
602             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
603             velec            = _mm_mul_ps(qq32,VV);
604             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
605             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
606
607             /* Update potential sum for this i atom from the interaction with this j atom. */
608             velecsum         = _mm_add_ps(velecsum,velec);
609
610             fscal            = felec;
611
612             /* Calculate temporary vectorial force */
613             tx               = _mm_mul_ps(fscal,dx32);
614             ty               = _mm_mul_ps(fscal,dy32);
615             tz               = _mm_mul_ps(fscal,dz32);
616
617             /* Update vectorial force */
618             fix3             = _mm_add_ps(fix3,tx);
619             fiy3             = _mm_add_ps(fiy3,ty);
620             fiz3             = _mm_add_ps(fiz3,tz);
621
622             fjx2             = _mm_add_ps(fjx2,tx);
623             fjy2             = _mm_add_ps(fjy2,ty);
624             fjz2             = _mm_add_ps(fjz2,tz);
625
626             /**************************
627              * CALCULATE INTERACTIONS *
628              **************************/
629
630             r33              = _mm_mul_ps(rsq33,rinv33);
631
632             /* Calculate table index by multiplying r with table scale and truncate to integer */
633             rt               = _mm_mul_ps(r33,vftabscale);
634             vfitab           = _mm_cvttps_epi32(rt);
635             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
636             vfitab           = _mm_slli_epi32(vfitab,2);
637
638             /* CUBIC SPLINE TABLE ELECTROSTATICS */
639             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
640             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
641             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
642             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
643             _MM_TRANSPOSE4_PS(Y,F,G,H);
644             Heps             = _mm_mul_ps(vfeps,H);
645             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
646             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
647             velec            = _mm_mul_ps(qq33,VV);
648             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
649             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
650
651             /* Update potential sum for this i atom from the interaction with this j atom. */
652             velecsum         = _mm_add_ps(velecsum,velec);
653
654             fscal            = felec;
655
656             /* Calculate temporary vectorial force */
657             tx               = _mm_mul_ps(fscal,dx33);
658             ty               = _mm_mul_ps(fscal,dy33);
659             tz               = _mm_mul_ps(fscal,dz33);
660
661             /* Update vectorial force */
662             fix3             = _mm_add_ps(fix3,tx);
663             fiy3             = _mm_add_ps(fiy3,ty);
664             fiz3             = _mm_add_ps(fiz3,tz);
665
666             fjx3             = _mm_add_ps(fjx3,tx);
667             fjy3             = _mm_add_ps(fjy3,ty);
668             fjz3             = _mm_add_ps(fjz3,tz);
669
670             fjptrA             = f+j_coord_offsetA;
671             fjptrB             = f+j_coord_offsetB;
672             fjptrC             = f+j_coord_offsetC;
673             fjptrD             = f+j_coord_offsetD;
674
675             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
676                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
677
678             /* Inner loop uses 387 flops */
679         }
680
681         if(jidx<j_index_end)
682         {
683
684             /* Get j neighbor index, and coordinate index */
685             jnrlistA         = jjnr[jidx];
686             jnrlistB         = jjnr[jidx+1];
687             jnrlistC         = jjnr[jidx+2];
688             jnrlistD         = jjnr[jidx+3];
689             /* Sign of each element will be negative for non-real atoms.
690              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
691              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
692              */
693             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
694             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
695             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
696             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
697             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
698             j_coord_offsetA  = DIM*jnrA;
699             j_coord_offsetB  = DIM*jnrB;
700             j_coord_offsetC  = DIM*jnrC;
701             j_coord_offsetD  = DIM*jnrD;
702
703             /* load j atom coordinates */
704             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
705                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
706                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
707
708             /* Calculate displacement vector */
709             dx11             = _mm_sub_ps(ix1,jx1);
710             dy11             = _mm_sub_ps(iy1,jy1);
711             dz11             = _mm_sub_ps(iz1,jz1);
712             dx12             = _mm_sub_ps(ix1,jx2);
713             dy12             = _mm_sub_ps(iy1,jy2);
714             dz12             = _mm_sub_ps(iz1,jz2);
715             dx13             = _mm_sub_ps(ix1,jx3);
716             dy13             = _mm_sub_ps(iy1,jy3);
717             dz13             = _mm_sub_ps(iz1,jz3);
718             dx21             = _mm_sub_ps(ix2,jx1);
719             dy21             = _mm_sub_ps(iy2,jy1);
720             dz21             = _mm_sub_ps(iz2,jz1);
721             dx22             = _mm_sub_ps(ix2,jx2);
722             dy22             = _mm_sub_ps(iy2,jy2);
723             dz22             = _mm_sub_ps(iz2,jz2);
724             dx23             = _mm_sub_ps(ix2,jx3);
725             dy23             = _mm_sub_ps(iy2,jy3);
726             dz23             = _mm_sub_ps(iz2,jz3);
727             dx31             = _mm_sub_ps(ix3,jx1);
728             dy31             = _mm_sub_ps(iy3,jy1);
729             dz31             = _mm_sub_ps(iz3,jz1);
730             dx32             = _mm_sub_ps(ix3,jx2);
731             dy32             = _mm_sub_ps(iy3,jy2);
732             dz32             = _mm_sub_ps(iz3,jz2);
733             dx33             = _mm_sub_ps(ix3,jx3);
734             dy33             = _mm_sub_ps(iy3,jy3);
735             dz33             = _mm_sub_ps(iz3,jz3);
736
737             /* Calculate squared distance and things based on it */
738             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
739             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
740             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
741             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
742             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
743             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
744             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
745             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
746             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
747
748             rinv11           = sse41_invsqrt_f(rsq11);
749             rinv12           = sse41_invsqrt_f(rsq12);
750             rinv13           = sse41_invsqrt_f(rsq13);
751             rinv21           = sse41_invsqrt_f(rsq21);
752             rinv22           = sse41_invsqrt_f(rsq22);
753             rinv23           = sse41_invsqrt_f(rsq23);
754             rinv31           = sse41_invsqrt_f(rsq31);
755             rinv32           = sse41_invsqrt_f(rsq32);
756             rinv33           = sse41_invsqrt_f(rsq33);
757
758             fjx1             = _mm_setzero_ps();
759             fjy1             = _mm_setzero_ps();
760             fjz1             = _mm_setzero_ps();
761             fjx2             = _mm_setzero_ps();
762             fjy2             = _mm_setzero_ps();
763             fjz2             = _mm_setzero_ps();
764             fjx3             = _mm_setzero_ps();
765             fjy3             = _mm_setzero_ps();
766             fjz3             = _mm_setzero_ps();
767
768             /**************************
769              * CALCULATE INTERACTIONS *
770              **************************/
771
772             r11              = _mm_mul_ps(rsq11,rinv11);
773             r11              = _mm_andnot_ps(dummy_mask,r11);
774
775             /* Calculate table index by multiplying r with table scale and truncate to integer */
776             rt               = _mm_mul_ps(r11,vftabscale);
777             vfitab           = _mm_cvttps_epi32(rt);
778             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
779             vfitab           = _mm_slli_epi32(vfitab,2);
780
781             /* CUBIC SPLINE TABLE ELECTROSTATICS */
782             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
783             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
784             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
785             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
786             _MM_TRANSPOSE4_PS(Y,F,G,H);
787             Heps             = _mm_mul_ps(vfeps,H);
788             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
789             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
790             velec            = _mm_mul_ps(qq11,VV);
791             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
792             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
793
794             /* Update potential sum for this i atom from the interaction with this j atom. */
795             velec            = _mm_andnot_ps(dummy_mask,velec);
796             velecsum         = _mm_add_ps(velecsum,velec);
797
798             fscal            = felec;
799
800             fscal            = _mm_andnot_ps(dummy_mask,fscal);
801
802             /* Calculate temporary vectorial force */
803             tx               = _mm_mul_ps(fscal,dx11);
804             ty               = _mm_mul_ps(fscal,dy11);
805             tz               = _mm_mul_ps(fscal,dz11);
806
807             /* Update vectorial force */
808             fix1             = _mm_add_ps(fix1,tx);
809             fiy1             = _mm_add_ps(fiy1,ty);
810             fiz1             = _mm_add_ps(fiz1,tz);
811
812             fjx1             = _mm_add_ps(fjx1,tx);
813             fjy1             = _mm_add_ps(fjy1,ty);
814             fjz1             = _mm_add_ps(fjz1,tz);
815
816             /**************************
817              * CALCULATE INTERACTIONS *
818              **************************/
819
820             r12              = _mm_mul_ps(rsq12,rinv12);
821             r12              = _mm_andnot_ps(dummy_mask,r12);
822
823             /* Calculate table index by multiplying r with table scale and truncate to integer */
824             rt               = _mm_mul_ps(r12,vftabscale);
825             vfitab           = _mm_cvttps_epi32(rt);
826             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
827             vfitab           = _mm_slli_epi32(vfitab,2);
828
829             /* CUBIC SPLINE TABLE ELECTROSTATICS */
830             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
831             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
832             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
833             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
834             _MM_TRANSPOSE4_PS(Y,F,G,H);
835             Heps             = _mm_mul_ps(vfeps,H);
836             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
837             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
838             velec            = _mm_mul_ps(qq12,VV);
839             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
840             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
841
842             /* Update potential sum for this i atom from the interaction with this j atom. */
843             velec            = _mm_andnot_ps(dummy_mask,velec);
844             velecsum         = _mm_add_ps(velecsum,velec);
845
846             fscal            = felec;
847
848             fscal            = _mm_andnot_ps(dummy_mask,fscal);
849
850             /* Calculate temporary vectorial force */
851             tx               = _mm_mul_ps(fscal,dx12);
852             ty               = _mm_mul_ps(fscal,dy12);
853             tz               = _mm_mul_ps(fscal,dz12);
854
855             /* Update vectorial force */
856             fix1             = _mm_add_ps(fix1,tx);
857             fiy1             = _mm_add_ps(fiy1,ty);
858             fiz1             = _mm_add_ps(fiz1,tz);
859
860             fjx2             = _mm_add_ps(fjx2,tx);
861             fjy2             = _mm_add_ps(fjy2,ty);
862             fjz2             = _mm_add_ps(fjz2,tz);
863
864             /**************************
865              * CALCULATE INTERACTIONS *
866              **************************/
867
868             r13              = _mm_mul_ps(rsq13,rinv13);
869             r13              = _mm_andnot_ps(dummy_mask,r13);
870
871             /* Calculate table index by multiplying r with table scale and truncate to integer */
872             rt               = _mm_mul_ps(r13,vftabscale);
873             vfitab           = _mm_cvttps_epi32(rt);
874             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
875             vfitab           = _mm_slli_epi32(vfitab,2);
876
877             /* CUBIC SPLINE TABLE ELECTROSTATICS */
878             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
879             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
880             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
881             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
882             _MM_TRANSPOSE4_PS(Y,F,G,H);
883             Heps             = _mm_mul_ps(vfeps,H);
884             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
885             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
886             velec            = _mm_mul_ps(qq13,VV);
887             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
888             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
889
890             /* Update potential sum for this i atom from the interaction with this j atom. */
891             velec            = _mm_andnot_ps(dummy_mask,velec);
892             velecsum         = _mm_add_ps(velecsum,velec);
893
894             fscal            = felec;
895
896             fscal            = _mm_andnot_ps(dummy_mask,fscal);
897
898             /* Calculate temporary vectorial force */
899             tx               = _mm_mul_ps(fscal,dx13);
900             ty               = _mm_mul_ps(fscal,dy13);
901             tz               = _mm_mul_ps(fscal,dz13);
902
903             /* Update vectorial force */
904             fix1             = _mm_add_ps(fix1,tx);
905             fiy1             = _mm_add_ps(fiy1,ty);
906             fiz1             = _mm_add_ps(fiz1,tz);
907
908             fjx3             = _mm_add_ps(fjx3,tx);
909             fjy3             = _mm_add_ps(fjy3,ty);
910             fjz3             = _mm_add_ps(fjz3,tz);
911
912             /**************************
913              * CALCULATE INTERACTIONS *
914              **************************/
915
916             r21              = _mm_mul_ps(rsq21,rinv21);
917             r21              = _mm_andnot_ps(dummy_mask,r21);
918
919             /* Calculate table index by multiplying r with table scale and truncate to integer */
920             rt               = _mm_mul_ps(r21,vftabscale);
921             vfitab           = _mm_cvttps_epi32(rt);
922             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
923             vfitab           = _mm_slli_epi32(vfitab,2);
924
925             /* CUBIC SPLINE TABLE ELECTROSTATICS */
926             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
927             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
928             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
929             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
930             _MM_TRANSPOSE4_PS(Y,F,G,H);
931             Heps             = _mm_mul_ps(vfeps,H);
932             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
933             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
934             velec            = _mm_mul_ps(qq21,VV);
935             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
936             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
937
938             /* Update potential sum for this i atom from the interaction with this j atom. */
939             velec            = _mm_andnot_ps(dummy_mask,velec);
940             velecsum         = _mm_add_ps(velecsum,velec);
941
942             fscal            = felec;
943
944             fscal            = _mm_andnot_ps(dummy_mask,fscal);
945
946             /* Calculate temporary vectorial force */
947             tx               = _mm_mul_ps(fscal,dx21);
948             ty               = _mm_mul_ps(fscal,dy21);
949             tz               = _mm_mul_ps(fscal,dz21);
950
951             /* Update vectorial force */
952             fix2             = _mm_add_ps(fix2,tx);
953             fiy2             = _mm_add_ps(fiy2,ty);
954             fiz2             = _mm_add_ps(fiz2,tz);
955
956             fjx1             = _mm_add_ps(fjx1,tx);
957             fjy1             = _mm_add_ps(fjy1,ty);
958             fjz1             = _mm_add_ps(fjz1,tz);
959
960             /**************************
961              * CALCULATE INTERACTIONS *
962              **************************/
963
964             r22              = _mm_mul_ps(rsq22,rinv22);
965             r22              = _mm_andnot_ps(dummy_mask,r22);
966
967             /* Calculate table index by multiplying r with table scale and truncate to integer */
968             rt               = _mm_mul_ps(r22,vftabscale);
969             vfitab           = _mm_cvttps_epi32(rt);
970             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
971             vfitab           = _mm_slli_epi32(vfitab,2);
972
973             /* CUBIC SPLINE TABLE ELECTROSTATICS */
974             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
975             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
976             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
977             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
978             _MM_TRANSPOSE4_PS(Y,F,G,H);
979             Heps             = _mm_mul_ps(vfeps,H);
980             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
981             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
982             velec            = _mm_mul_ps(qq22,VV);
983             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
984             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
985
986             /* Update potential sum for this i atom from the interaction with this j atom. */
987             velec            = _mm_andnot_ps(dummy_mask,velec);
988             velecsum         = _mm_add_ps(velecsum,velec);
989
990             fscal            = felec;
991
992             fscal            = _mm_andnot_ps(dummy_mask,fscal);
993
994             /* Calculate temporary vectorial force */
995             tx               = _mm_mul_ps(fscal,dx22);
996             ty               = _mm_mul_ps(fscal,dy22);
997             tz               = _mm_mul_ps(fscal,dz22);
998
999             /* Update vectorial force */
1000             fix2             = _mm_add_ps(fix2,tx);
1001             fiy2             = _mm_add_ps(fiy2,ty);
1002             fiz2             = _mm_add_ps(fiz2,tz);
1003
1004             fjx2             = _mm_add_ps(fjx2,tx);
1005             fjy2             = _mm_add_ps(fjy2,ty);
1006             fjz2             = _mm_add_ps(fjz2,tz);
1007
1008             /**************************
1009              * CALCULATE INTERACTIONS *
1010              **************************/
1011
1012             r23              = _mm_mul_ps(rsq23,rinv23);
1013             r23              = _mm_andnot_ps(dummy_mask,r23);
1014
1015             /* Calculate table index by multiplying r with table scale and truncate to integer */
1016             rt               = _mm_mul_ps(r23,vftabscale);
1017             vfitab           = _mm_cvttps_epi32(rt);
1018             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1019             vfitab           = _mm_slli_epi32(vfitab,2);
1020
1021             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1022             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1023             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1024             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1025             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1026             _MM_TRANSPOSE4_PS(Y,F,G,H);
1027             Heps             = _mm_mul_ps(vfeps,H);
1028             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1029             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1030             velec            = _mm_mul_ps(qq23,VV);
1031             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1032             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1033
1034             /* Update potential sum for this i atom from the interaction with this j atom. */
1035             velec            = _mm_andnot_ps(dummy_mask,velec);
1036             velecsum         = _mm_add_ps(velecsum,velec);
1037
1038             fscal            = felec;
1039
1040             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1041
1042             /* Calculate temporary vectorial force */
1043             tx               = _mm_mul_ps(fscal,dx23);
1044             ty               = _mm_mul_ps(fscal,dy23);
1045             tz               = _mm_mul_ps(fscal,dz23);
1046
1047             /* Update vectorial force */
1048             fix2             = _mm_add_ps(fix2,tx);
1049             fiy2             = _mm_add_ps(fiy2,ty);
1050             fiz2             = _mm_add_ps(fiz2,tz);
1051
1052             fjx3             = _mm_add_ps(fjx3,tx);
1053             fjy3             = _mm_add_ps(fjy3,ty);
1054             fjz3             = _mm_add_ps(fjz3,tz);
1055
1056             /**************************
1057              * CALCULATE INTERACTIONS *
1058              **************************/
1059
1060             r31              = _mm_mul_ps(rsq31,rinv31);
1061             r31              = _mm_andnot_ps(dummy_mask,r31);
1062
1063             /* Calculate table index by multiplying r with table scale and truncate to integer */
1064             rt               = _mm_mul_ps(r31,vftabscale);
1065             vfitab           = _mm_cvttps_epi32(rt);
1066             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1067             vfitab           = _mm_slli_epi32(vfitab,2);
1068
1069             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1070             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1071             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1072             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1073             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1074             _MM_TRANSPOSE4_PS(Y,F,G,H);
1075             Heps             = _mm_mul_ps(vfeps,H);
1076             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1077             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1078             velec            = _mm_mul_ps(qq31,VV);
1079             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1080             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1081
1082             /* Update potential sum for this i atom from the interaction with this j atom. */
1083             velec            = _mm_andnot_ps(dummy_mask,velec);
1084             velecsum         = _mm_add_ps(velecsum,velec);
1085
1086             fscal            = felec;
1087
1088             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1089
1090             /* Calculate temporary vectorial force */
1091             tx               = _mm_mul_ps(fscal,dx31);
1092             ty               = _mm_mul_ps(fscal,dy31);
1093             tz               = _mm_mul_ps(fscal,dz31);
1094
1095             /* Update vectorial force */
1096             fix3             = _mm_add_ps(fix3,tx);
1097             fiy3             = _mm_add_ps(fiy3,ty);
1098             fiz3             = _mm_add_ps(fiz3,tz);
1099
1100             fjx1             = _mm_add_ps(fjx1,tx);
1101             fjy1             = _mm_add_ps(fjy1,ty);
1102             fjz1             = _mm_add_ps(fjz1,tz);
1103
1104             /**************************
1105              * CALCULATE INTERACTIONS *
1106              **************************/
1107
1108             r32              = _mm_mul_ps(rsq32,rinv32);
1109             r32              = _mm_andnot_ps(dummy_mask,r32);
1110
1111             /* Calculate table index by multiplying r with table scale and truncate to integer */
1112             rt               = _mm_mul_ps(r32,vftabscale);
1113             vfitab           = _mm_cvttps_epi32(rt);
1114             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1115             vfitab           = _mm_slli_epi32(vfitab,2);
1116
1117             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1118             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1119             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1120             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1121             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1122             _MM_TRANSPOSE4_PS(Y,F,G,H);
1123             Heps             = _mm_mul_ps(vfeps,H);
1124             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1125             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1126             velec            = _mm_mul_ps(qq32,VV);
1127             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1128             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1129
1130             /* Update potential sum for this i atom from the interaction with this j atom. */
1131             velec            = _mm_andnot_ps(dummy_mask,velec);
1132             velecsum         = _mm_add_ps(velecsum,velec);
1133
1134             fscal            = felec;
1135
1136             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1137
1138             /* Calculate temporary vectorial force */
1139             tx               = _mm_mul_ps(fscal,dx32);
1140             ty               = _mm_mul_ps(fscal,dy32);
1141             tz               = _mm_mul_ps(fscal,dz32);
1142
1143             /* Update vectorial force */
1144             fix3             = _mm_add_ps(fix3,tx);
1145             fiy3             = _mm_add_ps(fiy3,ty);
1146             fiz3             = _mm_add_ps(fiz3,tz);
1147
1148             fjx2             = _mm_add_ps(fjx2,tx);
1149             fjy2             = _mm_add_ps(fjy2,ty);
1150             fjz2             = _mm_add_ps(fjz2,tz);
1151
1152             /**************************
1153              * CALCULATE INTERACTIONS *
1154              **************************/
1155
1156             r33              = _mm_mul_ps(rsq33,rinv33);
1157             r33              = _mm_andnot_ps(dummy_mask,r33);
1158
1159             /* Calculate table index by multiplying r with table scale and truncate to integer */
1160             rt               = _mm_mul_ps(r33,vftabscale);
1161             vfitab           = _mm_cvttps_epi32(rt);
1162             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1163             vfitab           = _mm_slli_epi32(vfitab,2);
1164
1165             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1166             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1167             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1168             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1169             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1170             _MM_TRANSPOSE4_PS(Y,F,G,H);
1171             Heps             = _mm_mul_ps(vfeps,H);
1172             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1173             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1174             velec            = _mm_mul_ps(qq33,VV);
1175             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1176             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1177
1178             /* Update potential sum for this i atom from the interaction with this j atom. */
1179             velec            = _mm_andnot_ps(dummy_mask,velec);
1180             velecsum         = _mm_add_ps(velecsum,velec);
1181
1182             fscal            = felec;
1183
1184             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1185
1186             /* Calculate temporary vectorial force */
1187             tx               = _mm_mul_ps(fscal,dx33);
1188             ty               = _mm_mul_ps(fscal,dy33);
1189             tz               = _mm_mul_ps(fscal,dz33);
1190
1191             /* Update vectorial force */
1192             fix3             = _mm_add_ps(fix3,tx);
1193             fiy3             = _mm_add_ps(fiy3,ty);
1194             fiz3             = _mm_add_ps(fiz3,tz);
1195
1196             fjx3             = _mm_add_ps(fjx3,tx);
1197             fjy3             = _mm_add_ps(fjy3,ty);
1198             fjz3             = _mm_add_ps(fjz3,tz);
1199
1200             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1201             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1202             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1203             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1204
1205             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1206                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1207
1208             /* Inner loop uses 396 flops */
1209         }
1210
1211         /* End of innermost loop */
1212
1213         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1214                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1215
1216         ggid                        = gid[iidx];
1217         /* Update potential energies */
1218         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1219
1220         /* Increment number of inner iterations */
1221         inneriter                  += j_index_end - j_index_start;
1222
1223         /* Outer loop uses 19 flops */
1224     }
1225
1226     /* Increment number of outer iterations */
1227     outeriter        += nri;
1228
1229     /* Update outer/inner flops */
1230
1231     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*396);
1232 }
1233 /*
1234  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse4_1_single
1235  * Electrostatics interaction: CubicSplineTable
1236  * VdW interaction:            None
1237  * Geometry:                   Water4-Water4
1238  * Calculate force/pot:        Force
1239  */
1240 void
1241 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse4_1_single
1242                     (t_nblist                    * gmx_restrict       nlist,
1243                      rvec                        * gmx_restrict          xx,
1244                      rvec                        * gmx_restrict          ff,
1245                      struct t_forcerec           * gmx_restrict          fr,
1246                      t_mdatoms                   * gmx_restrict     mdatoms,
1247                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1248                      t_nrnb                      * gmx_restrict        nrnb)
1249 {
1250     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1251      * just 0 for non-waters.
1252      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1253      * jnr indices corresponding to data put in the four positions in the SIMD register.
1254      */
1255     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1256     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1257     int              jnrA,jnrB,jnrC,jnrD;
1258     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1259     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1260     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1261     real             rcutoff_scalar;
1262     real             *shiftvec,*fshift,*x,*f;
1263     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1264     real             scratch[4*DIM];
1265     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1266     int              vdwioffset1;
1267     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1268     int              vdwioffset2;
1269     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1270     int              vdwioffset3;
1271     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1272     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1273     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1274     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1275     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1276     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1277     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1278     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1279     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1280     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1281     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1282     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1283     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1284     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1285     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1286     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1287     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1288     real             *charge;
1289     __m128i          vfitab;
1290     __m128i          ifour       = _mm_set1_epi32(4);
1291     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1292     real             *vftab;
1293     __m128           dummy_mask,cutoff_mask;
1294     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1295     __m128           one     = _mm_set1_ps(1.0);
1296     __m128           two     = _mm_set1_ps(2.0);
1297     x                = xx[0];
1298     f                = ff[0];
1299
1300     nri              = nlist->nri;
1301     iinr             = nlist->iinr;
1302     jindex           = nlist->jindex;
1303     jjnr             = nlist->jjnr;
1304     shiftidx         = nlist->shift;
1305     gid              = nlist->gid;
1306     shiftvec         = fr->shift_vec[0];
1307     fshift           = fr->fshift[0];
1308     facel            = _mm_set1_ps(fr->ic->epsfac);
1309     charge           = mdatoms->chargeA;
1310
1311     vftab            = kernel_data->table_elec->data;
1312     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
1313
1314     /* Setup water-specific parameters */
1315     inr              = nlist->iinr[0];
1316     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1317     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1318     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1319
1320     jq1              = _mm_set1_ps(charge[inr+1]);
1321     jq2              = _mm_set1_ps(charge[inr+2]);
1322     jq3              = _mm_set1_ps(charge[inr+3]);
1323     qq11             = _mm_mul_ps(iq1,jq1);
1324     qq12             = _mm_mul_ps(iq1,jq2);
1325     qq13             = _mm_mul_ps(iq1,jq3);
1326     qq21             = _mm_mul_ps(iq2,jq1);
1327     qq22             = _mm_mul_ps(iq2,jq2);
1328     qq23             = _mm_mul_ps(iq2,jq3);
1329     qq31             = _mm_mul_ps(iq3,jq1);
1330     qq32             = _mm_mul_ps(iq3,jq2);
1331     qq33             = _mm_mul_ps(iq3,jq3);
1332
1333     /* Avoid stupid compiler warnings */
1334     jnrA = jnrB = jnrC = jnrD = 0;
1335     j_coord_offsetA = 0;
1336     j_coord_offsetB = 0;
1337     j_coord_offsetC = 0;
1338     j_coord_offsetD = 0;
1339
1340     outeriter        = 0;
1341     inneriter        = 0;
1342
1343     for(iidx=0;iidx<4*DIM;iidx++)
1344     {
1345         scratch[iidx] = 0.0;
1346     }
1347
1348     /* Start outer loop over neighborlists */
1349     for(iidx=0; iidx<nri; iidx++)
1350     {
1351         /* Load shift vector for this list */
1352         i_shift_offset   = DIM*shiftidx[iidx];
1353
1354         /* Load limits for loop over neighbors */
1355         j_index_start    = jindex[iidx];
1356         j_index_end      = jindex[iidx+1];
1357
1358         /* Get outer coordinate index */
1359         inr              = iinr[iidx];
1360         i_coord_offset   = DIM*inr;
1361
1362         /* Load i particle coords and add shift vector */
1363         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1364                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1365
1366         fix1             = _mm_setzero_ps();
1367         fiy1             = _mm_setzero_ps();
1368         fiz1             = _mm_setzero_ps();
1369         fix2             = _mm_setzero_ps();
1370         fiy2             = _mm_setzero_ps();
1371         fiz2             = _mm_setzero_ps();
1372         fix3             = _mm_setzero_ps();
1373         fiy3             = _mm_setzero_ps();
1374         fiz3             = _mm_setzero_ps();
1375
1376         /* Start inner kernel loop */
1377         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1378         {
1379
1380             /* Get j neighbor index, and coordinate index */
1381             jnrA             = jjnr[jidx];
1382             jnrB             = jjnr[jidx+1];
1383             jnrC             = jjnr[jidx+2];
1384             jnrD             = jjnr[jidx+3];
1385             j_coord_offsetA  = DIM*jnrA;
1386             j_coord_offsetB  = DIM*jnrB;
1387             j_coord_offsetC  = DIM*jnrC;
1388             j_coord_offsetD  = DIM*jnrD;
1389
1390             /* load j atom coordinates */
1391             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1392                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1393                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1394
1395             /* Calculate displacement vector */
1396             dx11             = _mm_sub_ps(ix1,jx1);
1397             dy11             = _mm_sub_ps(iy1,jy1);
1398             dz11             = _mm_sub_ps(iz1,jz1);
1399             dx12             = _mm_sub_ps(ix1,jx2);
1400             dy12             = _mm_sub_ps(iy1,jy2);
1401             dz12             = _mm_sub_ps(iz1,jz2);
1402             dx13             = _mm_sub_ps(ix1,jx3);
1403             dy13             = _mm_sub_ps(iy1,jy3);
1404             dz13             = _mm_sub_ps(iz1,jz3);
1405             dx21             = _mm_sub_ps(ix2,jx1);
1406             dy21             = _mm_sub_ps(iy2,jy1);
1407             dz21             = _mm_sub_ps(iz2,jz1);
1408             dx22             = _mm_sub_ps(ix2,jx2);
1409             dy22             = _mm_sub_ps(iy2,jy2);
1410             dz22             = _mm_sub_ps(iz2,jz2);
1411             dx23             = _mm_sub_ps(ix2,jx3);
1412             dy23             = _mm_sub_ps(iy2,jy3);
1413             dz23             = _mm_sub_ps(iz2,jz3);
1414             dx31             = _mm_sub_ps(ix3,jx1);
1415             dy31             = _mm_sub_ps(iy3,jy1);
1416             dz31             = _mm_sub_ps(iz3,jz1);
1417             dx32             = _mm_sub_ps(ix3,jx2);
1418             dy32             = _mm_sub_ps(iy3,jy2);
1419             dz32             = _mm_sub_ps(iz3,jz2);
1420             dx33             = _mm_sub_ps(ix3,jx3);
1421             dy33             = _mm_sub_ps(iy3,jy3);
1422             dz33             = _mm_sub_ps(iz3,jz3);
1423
1424             /* Calculate squared distance and things based on it */
1425             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1426             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1427             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1428             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1429             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1430             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1431             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1432             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1433             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1434
1435             rinv11           = sse41_invsqrt_f(rsq11);
1436             rinv12           = sse41_invsqrt_f(rsq12);
1437             rinv13           = sse41_invsqrt_f(rsq13);
1438             rinv21           = sse41_invsqrt_f(rsq21);
1439             rinv22           = sse41_invsqrt_f(rsq22);
1440             rinv23           = sse41_invsqrt_f(rsq23);
1441             rinv31           = sse41_invsqrt_f(rsq31);
1442             rinv32           = sse41_invsqrt_f(rsq32);
1443             rinv33           = sse41_invsqrt_f(rsq33);
1444
1445             fjx1             = _mm_setzero_ps();
1446             fjy1             = _mm_setzero_ps();
1447             fjz1             = _mm_setzero_ps();
1448             fjx2             = _mm_setzero_ps();
1449             fjy2             = _mm_setzero_ps();
1450             fjz2             = _mm_setzero_ps();
1451             fjx3             = _mm_setzero_ps();
1452             fjy3             = _mm_setzero_ps();
1453             fjz3             = _mm_setzero_ps();
1454
1455             /**************************
1456              * CALCULATE INTERACTIONS *
1457              **************************/
1458
1459             r11              = _mm_mul_ps(rsq11,rinv11);
1460
1461             /* Calculate table index by multiplying r with table scale and truncate to integer */
1462             rt               = _mm_mul_ps(r11,vftabscale);
1463             vfitab           = _mm_cvttps_epi32(rt);
1464             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1465             vfitab           = _mm_slli_epi32(vfitab,2);
1466
1467             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1468             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1469             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1470             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1471             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1472             _MM_TRANSPOSE4_PS(Y,F,G,H);
1473             Heps             = _mm_mul_ps(vfeps,H);
1474             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1475             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1476             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1477
1478             fscal            = felec;
1479
1480             /* Calculate temporary vectorial force */
1481             tx               = _mm_mul_ps(fscal,dx11);
1482             ty               = _mm_mul_ps(fscal,dy11);
1483             tz               = _mm_mul_ps(fscal,dz11);
1484
1485             /* Update vectorial force */
1486             fix1             = _mm_add_ps(fix1,tx);
1487             fiy1             = _mm_add_ps(fiy1,ty);
1488             fiz1             = _mm_add_ps(fiz1,tz);
1489
1490             fjx1             = _mm_add_ps(fjx1,tx);
1491             fjy1             = _mm_add_ps(fjy1,ty);
1492             fjz1             = _mm_add_ps(fjz1,tz);
1493
1494             /**************************
1495              * CALCULATE INTERACTIONS *
1496              **************************/
1497
1498             r12              = _mm_mul_ps(rsq12,rinv12);
1499
1500             /* Calculate table index by multiplying r with table scale and truncate to integer */
1501             rt               = _mm_mul_ps(r12,vftabscale);
1502             vfitab           = _mm_cvttps_epi32(rt);
1503             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1504             vfitab           = _mm_slli_epi32(vfitab,2);
1505
1506             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1507             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1508             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1509             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1510             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1511             _MM_TRANSPOSE4_PS(Y,F,G,H);
1512             Heps             = _mm_mul_ps(vfeps,H);
1513             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1514             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1515             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1516
1517             fscal            = felec;
1518
1519             /* Calculate temporary vectorial force */
1520             tx               = _mm_mul_ps(fscal,dx12);
1521             ty               = _mm_mul_ps(fscal,dy12);
1522             tz               = _mm_mul_ps(fscal,dz12);
1523
1524             /* Update vectorial force */
1525             fix1             = _mm_add_ps(fix1,tx);
1526             fiy1             = _mm_add_ps(fiy1,ty);
1527             fiz1             = _mm_add_ps(fiz1,tz);
1528
1529             fjx2             = _mm_add_ps(fjx2,tx);
1530             fjy2             = _mm_add_ps(fjy2,ty);
1531             fjz2             = _mm_add_ps(fjz2,tz);
1532
1533             /**************************
1534              * CALCULATE INTERACTIONS *
1535              **************************/
1536
1537             r13              = _mm_mul_ps(rsq13,rinv13);
1538
1539             /* Calculate table index by multiplying r with table scale and truncate to integer */
1540             rt               = _mm_mul_ps(r13,vftabscale);
1541             vfitab           = _mm_cvttps_epi32(rt);
1542             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1543             vfitab           = _mm_slli_epi32(vfitab,2);
1544
1545             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1546             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1547             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1548             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1549             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1550             _MM_TRANSPOSE4_PS(Y,F,G,H);
1551             Heps             = _mm_mul_ps(vfeps,H);
1552             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1553             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1554             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1555
1556             fscal            = felec;
1557
1558             /* Calculate temporary vectorial force */
1559             tx               = _mm_mul_ps(fscal,dx13);
1560             ty               = _mm_mul_ps(fscal,dy13);
1561             tz               = _mm_mul_ps(fscal,dz13);
1562
1563             /* Update vectorial force */
1564             fix1             = _mm_add_ps(fix1,tx);
1565             fiy1             = _mm_add_ps(fiy1,ty);
1566             fiz1             = _mm_add_ps(fiz1,tz);
1567
1568             fjx3             = _mm_add_ps(fjx3,tx);
1569             fjy3             = _mm_add_ps(fjy3,ty);
1570             fjz3             = _mm_add_ps(fjz3,tz);
1571
1572             /**************************
1573              * CALCULATE INTERACTIONS *
1574              **************************/
1575
1576             r21              = _mm_mul_ps(rsq21,rinv21);
1577
1578             /* Calculate table index by multiplying r with table scale and truncate to integer */
1579             rt               = _mm_mul_ps(r21,vftabscale);
1580             vfitab           = _mm_cvttps_epi32(rt);
1581             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1582             vfitab           = _mm_slli_epi32(vfitab,2);
1583
1584             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1585             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1586             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1587             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1588             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1589             _MM_TRANSPOSE4_PS(Y,F,G,H);
1590             Heps             = _mm_mul_ps(vfeps,H);
1591             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1592             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1593             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1594
1595             fscal            = felec;
1596
1597             /* Calculate temporary vectorial force */
1598             tx               = _mm_mul_ps(fscal,dx21);
1599             ty               = _mm_mul_ps(fscal,dy21);
1600             tz               = _mm_mul_ps(fscal,dz21);
1601
1602             /* Update vectorial force */
1603             fix2             = _mm_add_ps(fix2,tx);
1604             fiy2             = _mm_add_ps(fiy2,ty);
1605             fiz2             = _mm_add_ps(fiz2,tz);
1606
1607             fjx1             = _mm_add_ps(fjx1,tx);
1608             fjy1             = _mm_add_ps(fjy1,ty);
1609             fjz1             = _mm_add_ps(fjz1,tz);
1610
1611             /**************************
1612              * CALCULATE INTERACTIONS *
1613              **************************/
1614
1615             r22              = _mm_mul_ps(rsq22,rinv22);
1616
1617             /* Calculate table index by multiplying r with table scale and truncate to integer */
1618             rt               = _mm_mul_ps(r22,vftabscale);
1619             vfitab           = _mm_cvttps_epi32(rt);
1620             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1621             vfitab           = _mm_slli_epi32(vfitab,2);
1622
1623             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1624             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1625             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1626             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1627             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1628             _MM_TRANSPOSE4_PS(Y,F,G,H);
1629             Heps             = _mm_mul_ps(vfeps,H);
1630             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1631             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1632             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1633
1634             fscal            = felec;
1635
1636             /* Calculate temporary vectorial force */
1637             tx               = _mm_mul_ps(fscal,dx22);
1638             ty               = _mm_mul_ps(fscal,dy22);
1639             tz               = _mm_mul_ps(fscal,dz22);
1640
1641             /* Update vectorial force */
1642             fix2             = _mm_add_ps(fix2,tx);
1643             fiy2             = _mm_add_ps(fiy2,ty);
1644             fiz2             = _mm_add_ps(fiz2,tz);
1645
1646             fjx2             = _mm_add_ps(fjx2,tx);
1647             fjy2             = _mm_add_ps(fjy2,ty);
1648             fjz2             = _mm_add_ps(fjz2,tz);
1649
1650             /**************************
1651              * CALCULATE INTERACTIONS *
1652              **************************/
1653
1654             r23              = _mm_mul_ps(rsq23,rinv23);
1655
1656             /* Calculate table index by multiplying r with table scale and truncate to integer */
1657             rt               = _mm_mul_ps(r23,vftabscale);
1658             vfitab           = _mm_cvttps_epi32(rt);
1659             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1660             vfitab           = _mm_slli_epi32(vfitab,2);
1661
1662             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1663             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1664             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1665             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1666             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1667             _MM_TRANSPOSE4_PS(Y,F,G,H);
1668             Heps             = _mm_mul_ps(vfeps,H);
1669             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1670             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1671             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1672
1673             fscal            = felec;
1674
1675             /* Calculate temporary vectorial force */
1676             tx               = _mm_mul_ps(fscal,dx23);
1677             ty               = _mm_mul_ps(fscal,dy23);
1678             tz               = _mm_mul_ps(fscal,dz23);
1679
1680             /* Update vectorial force */
1681             fix2             = _mm_add_ps(fix2,tx);
1682             fiy2             = _mm_add_ps(fiy2,ty);
1683             fiz2             = _mm_add_ps(fiz2,tz);
1684
1685             fjx3             = _mm_add_ps(fjx3,tx);
1686             fjy3             = _mm_add_ps(fjy3,ty);
1687             fjz3             = _mm_add_ps(fjz3,tz);
1688
1689             /**************************
1690              * CALCULATE INTERACTIONS *
1691              **************************/
1692
1693             r31              = _mm_mul_ps(rsq31,rinv31);
1694
1695             /* Calculate table index by multiplying r with table scale and truncate to integer */
1696             rt               = _mm_mul_ps(r31,vftabscale);
1697             vfitab           = _mm_cvttps_epi32(rt);
1698             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1699             vfitab           = _mm_slli_epi32(vfitab,2);
1700
1701             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1702             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1703             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1704             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1705             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1706             _MM_TRANSPOSE4_PS(Y,F,G,H);
1707             Heps             = _mm_mul_ps(vfeps,H);
1708             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1709             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1710             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1711
1712             fscal            = felec;
1713
1714             /* Calculate temporary vectorial force */
1715             tx               = _mm_mul_ps(fscal,dx31);
1716             ty               = _mm_mul_ps(fscal,dy31);
1717             tz               = _mm_mul_ps(fscal,dz31);
1718
1719             /* Update vectorial force */
1720             fix3             = _mm_add_ps(fix3,tx);
1721             fiy3             = _mm_add_ps(fiy3,ty);
1722             fiz3             = _mm_add_ps(fiz3,tz);
1723
1724             fjx1             = _mm_add_ps(fjx1,tx);
1725             fjy1             = _mm_add_ps(fjy1,ty);
1726             fjz1             = _mm_add_ps(fjz1,tz);
1727
1728             /**************************
1729              * CALCULATE INTERACTIONS *
1730              **************************/
1731
1732             r32              = _mm_mul_ps(rsq32,rinv32);
1733
1734             /* Calculate table index by multiplying r with table scale and truncate to integer */
1735             rt               = _mm_mul_ps(r32,vftabscale);
1736             vfitab           = _mm_cvttps_epi32(rt);
1737             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1738             vfitab           = _mm_slli_epi32(vfitab,2);
1739
1740             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1741             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1742             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1743             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1744             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1745             _MM_TRANSPOSE4_PS(Y,F,G,H);
1746             Heps             = _mm_mul_ps(vfeps,H);
1747             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1748             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1749             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1750
1751             fscal            = felec;
1752
1753             /* Calculate temporary vectorial force */
1754             tx               = _mm_mul_ps(fscal,dx32);
1755             ty               = _mm_mul_ps(fscal,dy32);
1756             tz               = _mm_mul_ps(fscal,dz32);
1757
1758             /* Update vectorial force */
1759             fix3             = _mm_add_ps(fix3,tx);
1760             fiy3             = _mm_add_ps(fiy3,ty);
1761             fiz3             = _mm_add_ps(fiz3,tz);
1762
1763             fjx2             = _mm_add_ps(fjx2,tx);
1764             fjy2             = _mm_add_ps(fjy2,ty);
1765             fjz2             = _mm_add_ps(fjz2,tz);
1766
1767             /**************************
1768              * CALCULATE INTERACTIONS *
1769              **************************/
1770
1771             r33              = _mm_mul_ps(rsq33,rinv33);
1772
1773             /* Calculate table index by multiplying r with table scale and truncate to integer */
1774             rt               = _mm_mul_ps(r33,vftabscale);
1775             vfitab           = _mm_cvttps_epi32(rt);
1776             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1777             vfitab           = _mm_slli_epi32(vfitab,2);
1778
1779             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1780             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1781             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1782             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1783             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1784             _MM_TRANSPOSE4_PS(Y,F,G,H);
1785             Heps             = _mm_mul_ps(vfeps,H);
1786             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1787             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1788             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1789
1790             fscal            = felec;
1791
1792             /* Calculate temporary vectorial force */
1793             tx               = _mm_mul_ps(fscal,dx33);
1794             ty               = _mm_mul_ps(fscal,dy33);
1795             tz               = _mm_mul_ps(fscal,dz33);
1796
1797             /* Update vectorial force */
1798             fix3             = _mm_add_ps(fix3,tx);
1799             fiy3             = _mm_add_ps(fiy3,ty);
1800             fiz3             = _mm_add_ps(fiz3,tz);
1801
1802             fjx3             = _mm_add_ps(fjx3,tx);
1803             fjy3             = _mm_add_ps(fjy3,ty);
1804             fjz3             = _mm_add_ps(fjz3,tz);
1805
1806             fjptrA             = f+j_coord_offsetA;
1807             fjptrB             = f+j_coord_offsetB;
1808             fjptrC             = f+j_coord_offsetC;
1809             fjptrD             = f+j_coord_offsetD;
1810
1811             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1812                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1813
1814             /* Inner loop uses 351 flops */
1815         }
1816
1817         if(jidx<j_index_end)
1818         {
1819
1820             /* Get j neighbor index, and coordinate index */
1821             jnrlistA         = jjnr[jidx];
1822             jnrlistB         = jjnr[jidx+1];
1823             jnrlistC         = jjnr[jidx+2];
1824             jnrlistD         = jjnr[jidx+3];
1825             /* Sign of each element will be negative for non-real atoms.
1826              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1827              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1828              */
1829             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1830             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1831             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1832             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1833             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1834             j_coord_offsetA  = DIM*jnrA;
1835             j_coord_offsetB  = DIM*jnrB;
1836             j_coord_offsetC  = DIM*jnrC;
1837             j_coord_offsetD  = DIM*jnrD;
1838
1839             /* load j atom coordinates */
1840             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1841                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1842                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1843
1844             /* Calculate displacement vector */
1845             dx11             = _mm_sub_ps(ix1,jx1);
1846             dy11             = _mm_sub_ps(iy1,jy1);
1847             dz11             = _mm_sub_ps(iz1,jz1);
1848             dx12             = _mm_sub_ps(ix1,jx2);
1849             dy12             = _mm_sub_ps(iy1,jy2);
1850             dz12             = _mm_sub_ps(iz1,jz2);
1851             dx13             = _mm_sub_ps(ix1,jx3);
1852             dy13             = _mm_sub_ps(iy1,jy3);
1853             dz13             = _mm_sub_ps(iz1,jz3);
1854             dx21             = _mm_sub_ps(ix2,jx1);
1855             dy21             = _mm_sub_ps(iy2,jy1);
1856             dz21             = _mm_sub_ps(iz2,jz1);
1857             dx22             = _mm_sub_ps(ix2,jx2);
1858             dy22             = _mm_sub_ps(iy2,jy2);
1859             dz22             = _mm_sub_ps(iz2,jz2);
1860             dx23             = _mm_sub_ps(ix2,jx3);
1861             dy23             = _mm_sub_ps(iy2,jy3);
1862             dz23             = _mm_sub_ps(iz2,jz3);
1863             dx31             = _mm_sub_ps(ix3,jx1);
1864             dy31             = _mm_sub_ps(iy3,jy1);
1865             dz31             = _mm_sub_ps(iz3,jz1);
1866             dx32             = _mm_sub_ps(ix3,jx2);
1867             dy32             = _mm_sub_ps(iy3,jy2);
1868             dz32             = _mm_sub_ps(iz3,jz2);
1869             dx33             = _mm_sub_ps(ix3,jx3);
1870             dy33             = _mm_sub_ps(iy3,jy3);
1871             dz33             = _mm_sub_ps(iz3,jz3);
1872
1873             /* Calculate squared distance and things based on it */
1874             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1875             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1876             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1877             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1878             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1879             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1880             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1881             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1882             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1883
1884             rinv11           = sse41_invsqrt_f(rsq11);
1885             rinv12           = sse41_invsqrt_f(rsq12);
1886             rinv13           = sse41_invsqrt_f(rsq13);
1887             rinv21           = sse41_invsqrt_f(rsq21);
1888             rinv22           = sse41_invsqrt_f(rsq22);
1889             rinv23           = sse41_invsqrt_f(rsq23);
1890             rinv31           = sse41_invsqrt_f(rsq31);
1891             rinv32           = sse41_invsqrt_f(rsq32);
1892             rinv33           = sse41_invsqrt_f(rsq33);
1893
1894             fjx1             = _mm_setzero_ps();
1895             fjy1             = _mm_setzero_ps();
1896             fjz1             = _mm_setzero_ps();
1897             fjx2             = _mm_setzero_ps();
1898             fjy2             = _mm_setzero_ps();
1899             fjz2             = _mm_setzero_ps();
1900             fjx3             = _mm_setzero_ps();
1901             fjy3             = _mm_setzero_ps();
1902             fjz3             = _mm_setzero_ps();
1903
1904             /**************************
1905              * CALCULATE INTERACTIONS *
1906              **************************/
1907
1908             r11              = _mm_mul_ps(rsq11,rinv11);
1909             r11              = _mm_andnot_ps(dummy_mask,r11);
1910
1911             /* Calculate table index by multiplying r with table scale and truncate to integer */
1912             rt               = _mm_mul_ps(r11,vftabscale);
1913             vfitab           = _mm_cvttps_epi32(rt);
1914             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1915             vfitab           = _mm_slli_epi32(vfitab,2);
1916
1917             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1918             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1919             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1920             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1921             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1922             _MM_TRANSPOSE4_PS(Y,F,G,H);
1923             Heps             = _mm_mul_ps(vfeps,H);
1924             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1925             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1926             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1927
1928             fscal            = felec;
1929
1930             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1931
1932             /* Calculate temporary vectorial force */
1933             tx               = _mm_mul_ps(fscal,dx11);
1934             ty               = _mm_mul_ps(fscal,dy11);
1935             tz               = _mm_mul_ps(fscal,dz11);
1936
1937             /* Update vectorial force */
1938             fix1             = _mm_add_ps(fix1,tx);
1939             fiy1             = _mm_add_ps(fiy1,ty);
1940             fiz1             = _mm_add_ps(fiz1,tz);
1941
1942             fjx1             = _mm_add_ps(fjx1,tx);
1943             fjy1             = _mm_add_ps(fjy1,ty);
1944             fjz1             = _mm_add_ps(fjz1,tz);
1945
1946             /**************************
1947              * CALCULATE INTERACTIONS *
1948              **************************/
1949
1950             r12              = _mm_mul_ps(rsq12,rinv12);
1951             r12              = _mm_andnot_ps(dummy_mask,r12);
1952
1953             /* Calculate table index by multiplying r with table scale and truncate to integer */
1954             rt               = _mm_mul_ps(r12,vftabscale);
1955             vfitab           = _mm_cvttps_epi32(rt);
1956             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1957             vfitab           = _mm_slli_epi32(vfitab,2);
1958
1959             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1960             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1961             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1962             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1963             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1964             _MM_TRANSPOSE4_PS(Y,F,G,H);
1965             Heps             = _mm_mul_ps(vfeps,H);
1966             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1967             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1968             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1969
1970             fscal            = felec;
1971
1972             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1973
1974             /* Calculate temporary vectorial force */
1975             tx               = _mm_mul_ps(fscal,dx12);
1976             ty               = _mm_mul_ps(fscal,dy12);
1977             tz               = _mm_mul_ps(fscal,dz12);
1978
1979             /* Update vectorial force */
1980             fix1             = _mm_add_ps(fix1,tx);
1981             fiy1             = _mm_add_ps(fiy1,ty);
1982             fiz1             = _mm_add_ps(fiz1,tz);
1983
1984             fjx2             = _mm_add_ps(fjx2,tx);
1985             fjy2             = _mm_add_ps(fjy2,ty);
1986             fjz2             = _mm_add_ps(fjz2,tz);
1987
1988             /**************************
1989              * CALCULATE INTERACTIONS *
1990              **************************/
1991
1992             r13              = _mm_mul_ps(rsq13,rinv13);
1993             r13              = _mm_andnot_ps(dummy_mask,r13);
1994
1995             /* Calculate table index by multiplying r with table scale and truncate to integer */
1996             rt               = _mm_mul_ps(r13,vftabscale);
1997             vfitab           = _mm_cvttps_epi32(rt);
1998             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1999             vfitab           = _mm_slli_epi32(vfitab,2);
2000
2001             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2002             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2003             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2004             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2005             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2006             _MM_TRANSPOSE4_PS(Y,F,G,H);
2007             Heps             = _mm_mul_ps(vfeps,H);
2008             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2009             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2010             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
2011
2012             fscal            = felec;
2013
2014             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2015
2016             /* Calculate temporary vectorial force */
2017             tx               = _mm_mul_ps(fscal,dx13);
2018             ty               = _mm_mul_ps(fscal,dy13);
2019             tz               = _mm_mul_ps(fscal,dz13);
2020
2021             /* Update vectorial force */
2022             fix1             = _mm_add_ps(fix1,tx);
2023             fiy1             = _mm_add_ps(fiy1,ty);
2024             fiz1             = _mm_add_ps(fiz1,tz);
2025
2026             fjx3             = _mm_add_ps(fjx3,tx);
2027             fjy3             = _mm_add_ps(fjy3,ty);
2028             fjz3             = _mm_add_ps(fjz3,tz);
2029
2030             /**************************
2031              * CALCULATE INTERACTIONS *
2032              **************************/
2033
2034             r21              = _mm_mul_ps(rsq21,rinv21);
2035             r21              = _mm_andnot_ps(dummy_mask,r21);
2036
2037             /* Calculate table index by multiplying r with table scale and truncate to integer */
2038             rt               = _mm_mul_ps(r21,vftabscale);
2039             vfitab           = _mm_cvttps_epi32(rt);
2040             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2041             vfitab           = _mm_slli_epi32(vfitab,2);
2042
2043             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2044             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2045             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2046             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2047             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2048             _MM_TRANSPOSE4_PS(Y,F,G,H);
2049             Heps             = _mm_mul_ps(vfeps,H);
2050             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2051             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2052             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2053
2054             fscal            = felec;
2055
2056             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2057
2058             /* Calculate temporary vectorial force */
2059             tx               = _mm_mul_ps(fscal,dx21);
2060             ty               = _mm_mul_ps(fscal,dy21);
2061             tz               = _mm_mul_ps(fscal,dz21);
2062
2063             /* Update vectorial force */
2064             fix2             = _mm_add_ps(fix2,tx);
2065             fiy2             = _mm_add_ps(fiy2,ty);
2066             fiz2             = _mm_add_ps(fiz2,tz);
2067
2068             fjx1             = _mm_add_ps(fjx1,tx);
2069             fjy1             = _mm_add_ps(fjy1,ty);
2070             fjz1             = _mm_add_ps(fjz1,tz);
2071
2072             /**************************
2073              * CALCULATE INTERACTIONS *
2074              **************************/
2075
2076             r22              = _mm_mul_ps(rsq22,rinv22);
2077             r22              = _mm_andnot_ps(dummy_mask,r22);
2078
2079             /* Calculate table index by multiplying r with table scale and truncate to integer */
2080             rt               = _mm_mul_ps(r22,vftabscale);
2081             vfitab           = _mm_cvttps_epi32(rt);
2082             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2083             vfitab           = _mm_slli_epi32(vfitab,2);
2084
2085             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2086             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2087             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2088             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2089             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2090             _MM_TRANSPOSE4_PS(Y,F,G,H);
2091             Heps             = _mm_mul_ps(vfeps,H);
2092             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2093             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2094             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2095
2096             fscal            = felec;
2097
2098             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2099
2100             /* Calculate temporary vectorial force */
2101             tx               = _mm_mul_ps(fscal,dx22);
2102             ty               = _mm_mul_ps(fscal,dy22);
2103             tz               = _mm_mul_ps(fscal,dz22);
2104
2105             /* Update vectorial force */
2106             fix2             = _mm_add_ps(fix2,tx);
2107             fiy2             = _mm_add_ps(fiy2,ty);
2108             fiz2             = _mm_add_ps(fiz2,tz);
2109
2110             fjx2             = _mm_add_ps(fjx2,tx);
2111             fjy2             = _mm_add_ps(fjy2,ty);
2112             fjz2             = _mm_add_ps(fjz2,tz);
2113
2114             /**************************
2115              * CALCULATE INTERACTIONS *
2116              **************************/
2117
2118             r23              = _mm_mul_ps(rsq23,rinv23);
2119             r23              = _mm_andnot_ps(dummy_mask,r23);
2120
2121             /* Calculate table index by multiplying r with table scale and truncate to integer */
2122             rt               = _mm_mul_ps(r23,vftabscale);
2123             vfitab           = _mm_cvttps_epi32(rt);
2124             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2125             vfitab           = _mm_slli_epi32(vfitab,2);
2126
2127             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2128             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2129             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2130             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2131             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2132             _MM_TRANSPOSE4_PS(Y,F,G,H);
2133             Heps             = _mm_mul_ps(vfeps,H);
2134             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2135             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2136             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2137
2138             fscal            = felec;
2139
2140             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2141
2142             /* Calculate temporary vectorial force */
2143             tx               = _mm_mul_ps(fscal,dx23);
2144             ty               = _mm_mul_ps(fscal,dy23);
2145             tz               = _mm_mul_ps(fscal,dz23);
2146
2147             /* Update vectorial force */
2148             fix2             = _mm_add_ps(fix2,tx);
2149             fiy2             = _mm_add_ps(fiy2,ty);
2150             fiz2             = _mm_add_ps(fiz2,tz);
2151
2152             fjx3             = _mm_add_ps(fjx3,tx);
2153             fjy3             = _mm_add_ps(fjy3,ty);
2154             fjz3             = _mm_add_ps(fjz3,tz);
2155
2156             /**************************
2157              * CALCULATE INTERACTIONS *
2158              **************************/
2159
2160             r31              = _mm_mul_ps(rsq31,rinv31);
2161             r31              = _mm_andnot_ps(dummy_mask,r31);
2162
2163             /* Calculate table index by multiplying r with table scale and truncate to integer */
2164             rt               = _mm_mul_ps(r31,vftabscale);
2165             vfitab           = _mm_cvttps_epi32(rt);
2166             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2167             vfitab           = _mm_slli_epi32(vfitab,2);
2168
2169             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2170             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2171             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2172             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2173             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2174             _MM_TRANSPOSE4_PS(Y,F,G,H);
2175             Heps             = _mm_mul_ps(vfeps,H);
2176             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2177             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2178             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2179
2180             fscal            = felec;
2181
2182             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2183
2184             /* Calculate temporary vectorial force */
2185             tx               = _mm_mul_ps(fscal,dx31);
2186             ty               = _mm_mul_ps(fscal,dy31);
2187             tz               = _mm_mul_ps(fscal,dz31);
2188
2189             /* Update vectorial force */
2190             fix3             = _mm_add_ps(fix3,tx);
2191             fiy3             = _mm_add_ps(fiy3,ty);
2192             fiz3             = _mm_add_ps(fiz3,tz);
2193
2194             fjx1             = _mm_add_ps(fjx1,tx);
2195             fjy1             = _mm_add_ps(fjy1,ty);
2196             fjz1             = _mm_add_ps(fjz1,tz);
2197
2198             /**************************
2199              * CALCULATE INTERACTIONS *
2200              **************************/
2201
2202             r32              = _mm_mul_ps(rsq32,rinv32);
2203             r32              = _mm_andnot_ps(dummy_mask,r32);
2204
2205             /* Calculate table index by multiplying r with table scale and truncate to integer */
2206             rt               = _mm_mul_ps(r32,vftabscale);
2207             vfitab           = _mm_cvttps_epi32(rt);
2208             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2209             vfitab           = _mm_slli_epi32(vfitab,2);
2210
2211             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2212             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2213             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2214             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2215             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2216             _MM_TRANSPOSE4_PS(Y,F,G,H);
2217             Heps             = _mm_mul_ps(vfeps,H);
2218             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2219             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2220             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2221
2222             fscal            = felec;
2223
2224             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2225
2226             /* Calculate temporary vectorial force */
2227             tx               = _mm_mul_ps(fscal,dx32);
2228             ty               = _mm_mul_ps(fscal,dy32);
2229             tz               = _mm_mul_ps(fscal,dz32);
2230
2231             /* Update vectorial force */
2232             fix3             = _mm_add_ps(fix3,tx);
2233             fiy3             = _mm_add_ps(fiy3,ty);
2234             fiz3             = _mm_add_ps(fiz3,tz);
2235
2236             fjx2             = _mm_add_ps(fjx2,tx);
2237             fjy2             = _mm_add_ps(fjy2,ty);
2238             fjz2             = _mm_add_ps(fjz2,tz);
2239
2240             /**************************
2241              * CALCULATE INTERACTIONS *
2242              **************************/
2243
2244             r33              = _mm_mul_ps(rsq33,rinv33);
2245             r33              = _mm_andnot_ps(dummy_mask,r33);
2246
2247             /* Calculate table index by multiplying r with table scale and truncate to integer */
2248             rt               = _mm_mul_ps(r33,vftabscale);
2249             vfitab           = _mm_cvttps_epi32(rt);
2250             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2251             vfitab           = _mm_slli_epi32(vfitab,2);
2252
2253             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2254             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2255             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2256             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2257             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2258             _MM_TRANSPOSE4_PS(Y,F,G,H);
2259             Heps             = _mm_mul_ps(vfeps,H);
2260             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2261             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2262             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2263
2264             fscal            = felec;
2265
2266             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2267
2268             /* Calculate temporary vectorial force */
2269             tx               = _mm_mul_ps(fscal,dx33);
2270             ty               = _mm_mul_ps(fscal,dy33);
2271             tz               = _mm_mul_ps(fscal,dz33);
2272
2273             /* Update vectorial force */
2274             fix3             = _mm_add_ps(fix3,tx);
2275             fiy3             = _mm_add_ps(fiy3,ty);
2276             fiz3             = _mm_add_ps(fiz3,tz);
2277
2278             fjx3             = _mm_add_ps(fjx3,tx);
2279             fjy3             = _mm_add_ps(fjy3,ty);
2280             fjz3             = _mm_add_ps(fjz3,tz);
2281
2282             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2283             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2284             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2285             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2286
2287             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2288                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2289
2290             /* Inner loop uses 360 flops */
2291         }
2292
2293         /* End of innermost loop */
2294
2295         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2296                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
2297
2298         /* Increment number of inner iterations */
2299         inneriter                  += j_index_end - j_index_start;
2300
2301         /* Outer loop uses 18 flops */
2302     }
2303
2304     /* Increment number of outer iterations */
2305     outeriter        += nri;
2306
2307     /* Update outer/inner flops */
2308
2309     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*360);
2310 }