Version bumps after new release
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecCSTab_VdwNone_GeomW4W4_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse4_1_single.h"
50 #include "kernelutil_x86_sse4_1_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse4_1_single
54  * Electrostatics interaction: CubicSplineTable
55  * VdW interaction:            None
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse4_1_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwioffset3;
90     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
96     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
97     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     __m128i          vfitab;
109     __m128i          ifour       = _mm_set1_epi32(4);
110     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
111     real             *vftab;
112     __m128           dummy_mask,cutoff_mask;
113     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
114     __m128           one     = _mm_set1_ps(1.0);
115     __m128           two     = _mm_set1_ps(2.0);
116     x                = xx[0];
117     f                = ff[0];
118
119     nri              = nlist->nri;
120     iinr             = nlist->iinr;
121     jindex           = nlist->jindex;
122     jjnr             = nlist->jjnr;
123     shiftidx         = nlist->shift;
124     gid              = nlist->gid;
125     shiftvec         = fr->shift_vec[0];
126     fshift           = fr->fshift[0];
127     facel            = _mm_set1_ps(fr->epsfac);
128     charge           = mdatoms->chargeA;
129
130     vftab            = kernel_data->table_elec->data;
131     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
132
133     /* Setup water-specific parameters */
134     inr              = nlist->iinr[0];
135     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
136     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
137     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
138
139     jq1              = _mm_set1_ps(charge[inr+1]);
140     jq2              = _mm_set1_ps(charge[inr+2]);
141     jq3              = _mm_set1_ps(charge[inr+3]);
142     qq11             = _mm_mul_ps(iq1,jq1);
143     qq12             = _mm_mul_ps(iq1,jq2);
144     qq13             = _mm_mul_ps(iq1,jq3);
145     qq21             = _mm_mul_ps(iq2,jq1);
146     qq22             = _mm_mul_ps(iq2,jq2);
147     qq23             = _mm_mul_ps(iq2,jq3);
148     qq31             = _mm_mul_ps(iq3,jq1);
149     qq32             = _mm_mul_ps(iq3,jq2);
150     qq33             = _mm_mul_ps(iq3,jq3);
151
152     /* Avoid stupid compiler warnings */
153     jnrA = jnrB = jnrC = jnrD = 0;
154     j_coord_offsetA = 0;
155     j_coord_offsetB = 0;
156     j_coord_offsetC = 0;
157     j_coord_offsetD = 0;
158
159     outeriter        = 0;
160     inneriter        = 0;
161
162     for(iidx=0;iidx<4*DIM;iidx++)
163     {
164         scratch[iidx] = 0.0;
165     }
166
167     /* Start outer loop over neighborlists */
168     for(iidx=0; iidx<nri; iidx++)
169     {
170         /* Load shift vector for this list */
171         i_shift_offset   = DIM*shiftidx[iidx];
172
173         /* Load limits for loop over neighbors */
174         j_index_start    = jindex[iidx];
175         j_index_end      = jindex[iidx+1];
176
177         /* Get outer coordinate index */
178         inr              = iinr[iidx];
179         i_coord_offset   = DIM*inr;
180
181         /* Load i particle coords and add shift vector */
182         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
183                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
184
185         fix1             = _mm_setzero_ps();
186         fiy1             = _mm_setzero_ps();
187         fiz1             = _mm_setzero_ps();
188         fix2             = _mm_setzero_ps();
189         fiy2             = _mm_setzero_ps();
190         fiz2             = _mm_setzero_ps();
191         fix3             = _mm_setzero_ps();
192         fiy3             = _mm_setzero_ps();
193         fiz3             = _mm_setzero_ps();
194
195         /* Reset potential sums */
196         velecsum         = _mm_setzero_ps();
197
198         /* Start inner kernel loop */
199         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
200         {
201
202             /* Get j neighbor index, and coordinate index */
203             jnrA             = jjnr[jidx];
204             jnrB             = jjnr[jidx+1];
205             jnrC             = jjnr[jidx+2];
206             jnrD             = jjnr[jidx+3];
207             j_coord_offsetA  = DIM*jnrA;
208             j_coord_offsetB  = DIM*jnrB;
209             j_coord_offsetC  = DIM*jnrC;
210             j_coord_offsetD  = DIM*jnrD;
211
212             /* load j atom coordinates */
213             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
214                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
215                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
216
217             /* Calculate displacement vector */
218             dx11             = _mm_sub_ps(ix1,jx1);
219             dy11             = _mm_sub_ps(iy1,jy1);
220             dz11             = _mm_sub_ps(iz1,jz1);
221             dx12             = _mm_sub_ps(ix1,jx2);
222             dy12             = _mm_sub_ps(iy1,jy2);
223             dz12             = _mm_sub_ps(iz1,jz2);
224             dx13             = _mm_sub_ps(ix1,jx3);
225             dy13             = _mm_sub_ps(iy1,jy3);
226             dz13             = _mm_sub_ps(iz1,jz3);
227             dx21             = _mm_sub_ps(ix2,jx1);
228             dy21             = _mm_sub_ps(iy2,jy1);
229             dz21             = _mm_sub_ps(iz2,jz1);
230             dx22             = _mm_sub_ps(ix2,jx2);
231             dy22             = _mm_sub_ps(iy2,jy2);
232             dz22             = _mm_sub_ps(iz2,jz2);
233             dx23             = _mm_sub_ps(ix2,jx3);
234             dy23             = _mm_sub_ps(iy2,jy3);
235             dz23             = _mm_sub_ps(iz2,jz3);
236             dx31             = _mm_sub_ps(ix3,jx1);
237             dy31             = _mm_sub_ps(iy3,jy1);
238             dz31             = _mm_sub_ps(iz3,jz1);
239             dx32             = _mm_sub_ps(ix3,jx2);
240             dy32             = _mm_sub_ps(iy3,jy2);
241             dz32             = _mm_sub_ps(iz3,jz2);
242             dx33             = _mm_sub_ps(ix3,jx3);
243             dy33             = _mm_sub_ps(iy3,jy3);
244             dz33             = _mm_sub_ps(iz3,jz3);
245
246             /* Calculate squared distance and things based on it */
247             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
248             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
249             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
250             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
251             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
252             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
253             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
254             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
255             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
256
257             rinv11           = gmx_mm_invsqrt_ps(rsq11);
258             rinv12           = gmx_mm_invsqrt_ps(rsq12);
259             rinv13           = gmx_mm_invsqrt_ps(rsq13);
260             rinv21           = gmx_mm_invsqrt_ps(rsq21);
261             rinv22           = gmx_mm_invsqrt_ps(rsq22);
262             rinv23           = gmx_mm_invsqrt_ps(rsq23);
263             rinv31           = gmx_mm_invsqrt_ps(rsq31);
264             rinv32           = gmx_mm_invsqrt_ps(rsq32);
265             rinv33           = gmx_mm_invsqrt_ps(rsq33);
266
267             fjx1             = _mm_setzero_ps();
268             fjy1             = _mm_setzero_ps();
269             fjz1             = _mm_setzero_ps();
270             fjx2             = _mm_setzero_ps();
271             fjy2             = _mm_setzero_ps();
272             fjz2             = _mm_setzero_ps();
273             fjx3             = _mm_setzero_ps();
274             fjy3             = _mm_setzero_ps();
275             fjz3             = _mm_setzero_ps();
276
277             /**************************
278              * CALCULATE INTERACTIONS *
279              **************************/
280
281             r11              = _mm_mul_ps(rsq11,rinv11);
282
283             /* Calculate table index by multiplying r with table scale and truncate to integer */
284             rt               = _mm_mul_ps(r11,vftabscale);
285             vfitab           = _mm_cvttps_epi32(rt);
286             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
287             vfitab           = _mm_slli_epi32(vfitab,2);
288
289             /* CUBIC SPLINE TABLE ELECTROSTATICS */
290             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
291             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
292             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
293             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
294             _MM_TRANSPOSE4_PS(Y,F,G,H);
295             Heps             = _mm_mul_ps(vfeps,H);
296             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
297             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
298             velec            = _mm_mul_ps(qq11,VV);
299             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
300             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
301
302             /* Update potential sum for this i atom from the interaction with this j atom. */
303             velecsum         = _mm_add_ps(velecsum,velec);
304
305             fscal            = felec;
306
307             /* Calculate temporary vectorial force */
308             tx               = _mm_mul_ps(fscal,dx11);
309             ty               = _mm_mul_ps(fscal,dy11);
310             tz               = _mm_mul_ps(fscal,dz11);
311
312             /* Update vectorial force */
313             fix1             = _mm_add_ps(fix1,tx);
314             fiy1             = _mm_add_ps(fiy1,ty);
315             fiz1             = _mm_add_ps(fiz1,tz);
316
317             fjx1             = _mm_add_ps(fjx1,tx);
318             fjy1             = _mm_add_ps(fjy1,ty);
319             fjz1             = _mm_add_ps(fjz1,tz);
320
321             /**************************
322              * CALCULATE INTERACTIONS *
323              **************************/
324
325             r12              = _mm_mul_ps(rsq12,rinv12);
326
327             /* Calculate table index by multiplying r with table scale and truncate to integer */
328             rt               = _mm_mul_ps(r12,vftabscale);
329             vfitab           = _mm_cvttps_epi32(rt);
330             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
331             vfitab           = _mm_slli_epi32(vfitab,2);
332
333             /* CUBIC SPLINE TABLE ELECTROSTATICS */
334             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
335             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
336             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
337             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
338             _MM_TRANSPOSE4_PS(Y,F,G,H);
339             Heps             = _mm_mul_ps(vfeps,H);
340             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
341             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
342             velec            = _mm_mul_ps(qq12,VV);
343             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
344             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
345
346             /* Update potential sum for this i atom from the interaction with this j atom. */
347             velecsum         = _mm_add_ps(velecsum,velec);
348
349             fscal            = felec;
350
351             /* Calculate temporary vectorial force */
352             tx               = _mm_mul_ps(fscal,dx12);
353             ty               = _mm_mul_ps(fscal,dy12);
354             tz               = _mm_mul_ps(fscal,dz12);
355
356             /* Update vectorial force */
357             fix1             = _mm_add_ps(fix1,tx);
358             fiy1             = _mm_add_ps(fiy1,ty);
359             fiz1             = _mm_add_ps(fiz1,tz);
360
361             fjx2             = _mm_add_ps(fjx2,tx);
362             fjy2             = _mm_add_ps(fjy2,ty);
363             fjz2             = _mm_add_ps(fjz2,tz);
364
365             /**************************
366              * CALCULATE INTERACTIONS *
367              **************************/
368
369             r13              = _mm_mul_ps(rsq13,rinv13);
370
371             /* Calculate table index by multiplying r with table scale and truncate to integer */
372             rt               = _mm_mul_ps(r13,vftabscale);
373             vfitab           = _mm_cvttps_epi32(rt);
374             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
375             vfitab           = _mm_slli_epi32(vfitab,2);
376
377             /* CUBIC SPLINE TABLE ELECTROSTATICS */
378             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
379             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
380             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
381             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
382             _MM_TRANSPOSE4_PS(Y,F,G,H);
383             Heps             = _mm_mul_ps(vfeps,H);
384             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
385             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
386             velec            = _mm_mul_ps(qq13,VV);
387             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
388             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
389
390             /* Update potential sum for this i atom from the interaction with this j atom. */
391             velecsum         = _mm_add_ps(velecsum,velec);
392
393             fscal            = felec;
394
395             /* Calculate temporary vectorial force */
396             tx               = _mm_mul_ps(fscal,dx13);
397             ty               = _mm_mul_ps(fscal,dy13);
398             tz               = _mm_mul_ps(fscal,dz13);
399
400             /* Update vectorial force */
401             fix1             = _mm_add_ps(fix1,tx);
402             fiy1             = _mm_add_ps(fiy1,ty);
403             fiz1             = _mm_add_ps(fiz1,tz);
404
405             fjx3             = _mm_add_ps(fjx3,tx);
406             fjy3             = _mm_add_ps(fjy3,ty);
407             fjz3             = _mm_add_ps(fjz3,tz);
408
409             /**************************
410              * CALCULATE INTERACTIONS *
411              **************************/
412
413             r21              = _mm_mul_ps(rsq21,rinv21);
414
415             /* Calculate table index by multiplying r with table scale and truncate to integer */
416             rt               = _mm_mul_ps(r21,vftabscale);
417             vfitab           = _mm_cvttps_epi32(rt);
418             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
419             vfitab           = _mm_slli_epi32(vfitab,2);
420
421             /* CUBIC SPLINE TABLE ELECTROSTATICS */
422             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
423             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
424             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
425             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
426             _MM_TRANSPOSE4_PS(Y,F,G,H);
427             Heps             = _mm_mul_ps(vfeps,H);
428             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
429             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
430             velec            = _mm_mul_ps(qq21,VV);
431             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
432             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
433
434             /* Update potential sum for this i atom from the interaction with this j atom. */
435             velecsum         = _mm_add_ps(velecsum,velec);
436
437             fscal            = felec;
438
439             /* Calculate temporary vectorial force */
440             tx               = _mm_mul_ps(fscal,dx21);
441             ty               = _mm_mul_ps(fscal,dy21);
442             tz               = _mm_mul_ps(fscal,dz21);
443
444             /* Update vectorial force */
445             fix2             = _mm_add_ps(fix2,tx);
446             fiy2             = _mm_add_ps(fiy2,ty);
447             fiz2             = _mm_add_ps(fiz2,tz);
448
449             fjx1             = _mm_add_ps(fjx1,tx);
450             fjy1             = _mm_add_ps(fjy1,ty);
451             fjz1             = _mm_add_ps(fjz1,tz);
452
453             /**************************
454              * CALCULATE INTERACTIONS *
455              **************************/
456
457             r22              = _mm_mul_ps(rsq22,rinv22);
458
459             /* Calculate table index by multiplying r with table scale and truncate to integer */
460             rt               = _mm_mul_ps(r22,vftabscale);
461             vfitab           = _mm_cvttps_epi32(rt);
462             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
463             vfitab           = _mm_slli_epi32(vfitab,2);
464
465             /* CUBIC SPLINE TABLE ELECTROSTATICS */
466             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
467             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
468             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
469             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
470             _MM_TRANSPOSE4_PS(Y,F,G,H);
471             Heps             = _mm_mul_ps(vfeps,H);
472             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
473             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
474             velec            = _mm_mul_ps(qq22,VV);
475             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
476             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
477
478             /* Update potential sum for this i atom from the interaction with this j atom. */
479             velecsum         = _mm_add_ps(velecsum,velec);
480
481             fscal            = felec;
482
483             /* Calculate temporary vectorial force */
484             tx               = _mm_mul_ps(fscal,dx22);
485             ty               = _mm_mul_ps(fscal,dy22);
486             tz               = _mm_mul_ps(fscal,dz22);
487
488             /* Update vectorial force */
489             fix2             = _mm_add_ps(fix2,tx);
490             fiy2             = _mm_add_ps(fiy2,ty);
491             fiz2             = _mm_add_ps(fiz2,tz);
492
493             fjx2             = _mm_add_ps(fjx2,tx);
494             fjy2             = _mm_add_ps(fjy2,ty);
495             fjz2             = _mm_add_ps(fjz2,tz);
496
497             /**************************
498              * CALCULATE INTERACTIONS *
499              **************************/
500
501             r23              = _mm_mul_ps(rsq23,rinv23);
502
503             /* Calculate table index by multiplying r with table scale and truncate to integer */
504             rt               = _mm_mul_ps(r23,vftabscale);
505             vfitab           = _mm_cvttps_epi32(rt);
506             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
507             vfitab           = _mm_slli_epi32(vfitab,2);
508
509             /* CUBIC SPLINE TABLE ELECTROSTATICS */
510             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
511             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
512             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
513             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
514             _MM_TRANSPOSE4_PS(Y,F,G,H);
515             Heps             = _mm_mul_ps(vfeps,H);
516             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
517             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
518             velec            = _mm_mul_ps(qq23,VV);
519             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
520             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
521
522             /* Update potential sum for this i atom from the interaction with this j atom. */
523             velecsum         = _mm_add_ps(velecsum,velec);
524
525             fscal            = felec;
526
527             /* Calculate temporary vectorial force */
528             tx               = _mm_mul_ps(fscal,dx23);
529             ty               = _mm_mul_ps(fscal,dy23);
530             tz               = _mm_mul_ps(fscal,dz23);
531
532             /* Update vectorial force */
533             fix2             = _mm_add_ps(fix2,tx);
534             fiy2             = _mm_add_ps(fiy2,ty);
535             fiz2             = _mm_add_ps(fiz2,tz);
536
537             fjx3             = _mm_add_ps(fjx3,tx);
538             fjy3             = _mm_add_ps(fjy3,ty);
539             fjz3             = _mm_add_ps(fjz3,tz);
540
541             /**************************
542              * CALCULATE INTERACTIONS *
543              **************************/
544
545             r31              = _mm_mul_ps(rsq31,rinv31);
546
547             /* Calculate table index by multiplying r with table scale and truncate to integer */
548             rt               = _mm_mul_ps(r31,vftabscale);
549             vfitab           = _mm_cvttps_epi32(rt);
550             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
551             vfitab           = _mm_slli_epi32(vfitab,2);
552
553             /* CUBIC SPLINE TABLE ELECTROSTATICS */
554             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
555             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
556             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
557             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
558             _MM_TRANSPOSE4_PS(Y,F,G,H);
559             Heps             = _mm_mul_ps(vfeps,H);
560             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
561             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
562             velec            = _mm_mul_ps(qq31,VV);
563             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
564             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
565
566             /* Update potential sum for this i atom from the interaction with this j atom. */
567             velecsum         = _mm_add_ps(velecsum,velec);
568
569             fscal            = felec;
570
571             /* Calculate temporary vectorial force */
572             tx               = _mm_mul_ps(fscal,dx31);
573             ty               = _mm_mul_ps(fscal,dy31);
574             tz               = _mm_mul_ps(fscal,dz31);
575
576             /* Update vectorial force */
577             fix3             = _mm_add_ps(fix3,tx);
578             fiy3             = _mm_add_ps(fiy3,ty);
579             fiz3             = _mm_add_ps(fiz3,tz);
580
581             fjx1             = _mm_add_ps(fjx1,tx);
582             fjy1             = _mm_add_ps(fjy1,ty);
583             fjz1             = _mm_add_ps(fjz1,tz);
584
585             /**************************
586              * CALCULATE INTERACTIONS *
587              **************************/
588
589             r32              = _mm_mul_ps(rsq32,rinv32);
590
591             /* Calculate table index by multiplying r with table scale and truncate to integer */
592             rt               = _mm_mul_ps(r32,vftabscale);
593             vfitab           = _mm_cvttps_epi32(rt);
594             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
595             vfitab           = _mm_slli_epi32(vfitab,2);
596
597             /* CUBIC SPLINE TABLE ELECTROSTATICS */
598             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
599             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
600             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
601             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
602             _MM_TRANSPOSE4_PS(Y,F,G,H);
603             Heps             = _mm_mul_ps(vfeps,H);
604             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
605             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
606             velec            = _mm_mul_ps(qq32,VV);
607             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
608             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
609
610             /* Update potential sum for this i atom from the interaction with this j atom. */
611             velecsum         = _mm_add_ps(velecsum,velec);
612
613             fscal            = felec;
614
615             /* Calculate temporary vectorial force */
616             tx               = _mm_mul_ps(fscal,dx32);
617             ty               = _mm_mul_ps(fscal,dy32);
618             tz               = _mm_mul_ps(fscal,dz32);
619
620             /* Update vectorial force */
621             fix3             = _mm_add_ps(fix3,tx);
622             fiy3             = _mm_add_ps(fiy3,ty);
623             fiz3             = _mm_add_ps(fiz3,tz);
624
625             fjx2             = _mm_add_ps(fjx2,tx);
626             fjy2             = _mm_add_ps(fjy2,ty);
627             fjz2             = _mm_add_ps(fjz2,tz);
628
629             /**************************
630              * CALCULATE INTERACTIONS *
631              **************************/
632
633             r33              = _mm_mul_ps(rsq33,rinv33);
634
635             /* Calculate table index by multiplying r with table scale and truncate to integer */
636             rt               = _mm_mul_ps(r33,vftabscale);
637             vfitab           = _mm_cvttps_epi32(rt);
638             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
639             vfitab           = _mm_slli_epi32(vfitab,2);
640
641             /* CUBIC SPLINE TABLE ELECTROSTATICS */
642             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
643             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
644             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
645             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
646             _MM_TRANSPOSE4_PS(Y,F,G,H);
647             Heps             = _mm_mul_ps(vfeps,H);
648             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
649             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
650             velec            = _mm_mul_ps(qq33,VV);
651             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
652             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
653
654             /* Update potential sum for this i atom from the interaction with this j atom. */
655             velecsum         = _mm_add_ps(velecsum,velec);
656
657             fscal            = felec;
658
659             /* Calculate temporary vectorial force */
660             tx               = _mm_mul_ps(fscal,dx33);
661             ty               = _mm_mul_ps(fscal,dy33);
662             tz               = _mm_mul_ps(fscal,dz33);
663
664             /* Update vectorial force */
665             fix3             = _mm_add_ps(fix3,tx);
666             fiy3             = _mm_add_ps(fiy3,ty);
667             fiz3             = _mm_add_ps(fiz3,tz);
668
669             fjx3             = _mm_add_ps(fjx3,tx);
670             fjy3             = _mm_add_ps(fjy3,ty);
671             fjz3             = _mm_add_ps(fjz3,tz);
672
673             fjptrA             = f+j_coord_offsetA;
674             fjptrB             = f+j_coord_offsetB;
675             fjptrC             = f+j_coord_offsetC;
676             fjptrD             = f+j_coord_offsetD;
677
678             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
679                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
680
681             /* Inner loop uses 387 flops */
682         }
683
684         if(jidx<j_index_end)
685         {
686
687             /* Get j neighbor index, and coordinate index */
688             jnrlistA         = jjnr[jidx];
689             jnrlistB         = jjnr[jidx+1];
690             jnrlistC         = jjnr[jidx+2];
691             jnrlistD         = jjnr[jidx+3];
692             /* Sign of each element will be negative for non-real atoms.
693              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
694              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
695              */
696             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
697             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
698             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
699             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
700             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
701             j_coord_offsetA  = DIM*jnrA;
702             j_coord_offsetB  = DIM*jnrB;
703             j_coord_offsetC  = DIM*jnrC;
704             j_coord_offsetD  = DIM*jnrD;
705
706             /* load j atom coordinates */
707             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
708                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
709                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
710
711             /* Calculate displacement vector */
712             dx11             = _mm_sub_ps(ix1,jx1);
713             dy11             = _mm_sub_ps(iy1,jy1);
714             dz11             = _mm_sub_ps(iz1,jz1);
715             dx12             = _mm_sub_ps(ix1,jx2);
716             dy12             = _mm_sub_ps(iy1,jy2);
717             dz12             = _mm_sub_ps(iz1,jz2);
718             dx13             = _mm_sub_ps(ix1,jx3);
719             dy13             = _mm_sub_ps(iy1,jy3);
720             dz13             = _mm_sub_ps(iz1,jz3);
721             dx21             = _mm_sub_ps(ix2,jx1);
722             dy21             = _mm_sub_ps(iy2,jy1);
723             dz21             = _mm_sub_ps(iz2,jz1);
724             dx22             = _mm_sub_ps(ix2,jx2);
725             dy22             = _mm_sub_ps(iy2,jy2);
726             dz22             = _mm_sub_ps(iz2,jz2);
727             dx23             = _mm_sub_ps(ix2,jx3);
728             dy23             = _mm_sub_ps(iy2,jy3);
729             dz23             = _mm_sub_ps(iz2,jz3);
730             dx31             = _mm_sub_ps(ix3,jx1);
731             dy31             = _mm_sub_ps(iy3,jy1);
732             dz31             = _mm_sub_ps(iz3,jz1);
733             dx32             = _mm_sub_ps(ix3,jx2);
734             dy32             = _mm_sub_ps(iy3,jy2);
735             dz32             = _mm_sub_ps(iz3,jz2);
736             dx33             = _mm_sub_ps(ix3,jx3);
737             dy33             = _mm_sub_ps(iy3,jy3);
738             dz33             = _mm_sub_ps(iz3,jz3);
739
740             /* Calculate squared distance and things based on it */
741             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
742             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
743             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
744             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
745             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
746             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
747             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
748             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
749             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
750
751             rinv11           = gmx_mm_invsqrt_ps(rsq11);
752             rinv12           = gmx_mm_invsqrt_ps(rsq12);
753             rinv13           = gmx_mm_invsqrt_ps(rsq13);
754             rinv21           = gmx_mm_invsqrt_ps(rsq21);
755             rinv22           = gmx_mm_invsqrt_ps(rsq22);
756             rinv23           = gmx_mm_invsqrt_ps(rsq23);
757             rinv31           = gmx_mm_invsqrt_ps(rsq31);
758             rinv32           = gmx_mm_invsqrt_ps(rsq32);
759             rinv33           = gmx_mm_invsqrt_ps(rsq33);
760
761             fjx1             = _mm_setzero_ps();
762             fjy1             = _mm_setzero_ps();
763             fjz1             = _mm_setzero_ps();
764             fjx2             = _mm_setzero_ps();
765             fjy2             = _mm_setzero_ps();
766             fjz2             = _mm_setzero_ps();
767             fjx3             = _mm_setzero_ps();
768             fjy3             = _mm_setzero_ps();
769             fjz3             = _mm_setzero_ps();
770
771             /**************************
772              * CALCULATE INTERACTIONS *
773              **************************/
774
775             r11              = _mm_mul_ps(rsq11,rinv11);
776             r11              = _mm_andnot_ps(dummy_mask,r11);
777
778             /* Calculate table index by multiplying r with table scale and truncate to integer */
779             rt               = _mm_mul_ps(r11,vftabscale);
780             vfitab           = _mm_cvttps_epi32(rt);
781             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
782             vfitab           = _mm_slli_epi32(vfitab,2);
783
784             /* CUBIC SPLINE TABLE ELECTROSTATICS */
785             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
786             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
787             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
788             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
789             _MM_TRANSPOSE4_PS(Y,F,G,H);
790             Heps             = _mm_mul_ps(vfeps,H);
791             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
792             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
793             velec            = _mm_mul_ps(qq11,VV);
794             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
795             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
796
797             /* Update potential sum for this i atom from the interaction with this j atom. */
798             velec            = _mm_andnot_ps(dummy_mask,velec);
799             velecsum         = _mm_add_ps(velecsum,velec);
800
801             fscal            = felec;
802
803             fscal            = _mm_andnot_ps(dummy_mask,fscal);
804
805             /* Calculate temporary vectorial force */
806             tx               = _mm_mul_ps(fscal,dx11);
807             ty               = _mm_mul_ps(fscal,dy11);
808             tz               = _mm_mul_ps(fscal,dz11);
809
810             /* Update vectorial force */
811             fix1             = _mm_add_ps(fix1,tx);
812             fiy1             = _mm_add_ps(fiy1,ty);
813             fiz1             = _mm_add_ps(fiz1,tz);
814
815             fjx1             = _mm_add_ps(fjx1,tx);
816             fjy1             = _mm_add_ps(fjy1,ty);
817             fjz1             = _mm_add_ps(fjz1,tz);
818
819             /**************************
820              * CALCULATE INTERACTIONS *
821              **************************/
822
823             r12              = _mm_mul_ps(rsq12,rinv12);
824             r12              = _mm_andnot_ps(dummy_mask,r12);
825
826             /* Calculate table index by multiplying r with table scale and truncate to integer */
827             rt               = _mm_mul_ps(r12,vftabscale);
828             vfitab           = _mm_cvttps_epi32(rt);
829             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
830             vfitab           = _mm_slli_epi32(vfitab,2);
831
832             /* CUBIC SPLINE TABLE ELECTROSTATICS */
833             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
834             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
835             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
836             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
837             _MM_TRANSPOSE4_PS(Y,F,G,H);
838             Heps             = _mm_mul_ps(vfeps,H);
839             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
840             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
841             velec            = _mm_mul_ps(qq12,VV);
842             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
843             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
844
845             /* Update potential sum for this i atom from the interaction with this j atom. */
846             velec            = _mm_andnot_ps(dummy_mask,velec);
847             velecsum         = _mm_add_ps(velecsum,velec);
848
849             fscal            = felec;
850
851             fscal            = _mm_andnot_ps(dummy_mask,fscal);
852
853             /* Calculate temporary vectorial force */
854             tx               = _mm_mul_ps(fscal,dx12);
855             ty               = _mm_mul_ps(fscal,dy12);
856             tz               = _mm_mul_ps(fscal,dz12);
857
858             /* Update vectorial force */
859             fix1             = _mm_add_ps(fix1,tx);
860             fiy1             = _mm_add_ps(fiy1,ty);
861             fiz1             = _mm_add_ps(fiz1,tz);
862
863             fjx2             = _mm_add_ps(fjx2,tx);
864             fjy2             = _mm_add_ps(fjy2,ty);
865             fjz2             = _mm_add_ps(fjz2,tz);
866
867             /**************************
868              * CALCULATE INTERACTIONS *
869              **************************/
870
871             r13              = _mm_mul_ps(rsq13,rinv13);
872             r13              = _mm_andnot_ps(dummy_mask,r13);
873
874             /* Calculate table index by multiplying r with table scale and truncate to integer */
875             rt               = _mm_mul_ps(r13,vftabscale);
876             vfitab           = _mm_cvttps_epi32(rt);
877             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
878             vfitab           = _mm_slli_epi32(vfitab,2);
879
880             /* CUBIC SPLINE TABLE ELECTROSTATICS */
881             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
882             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
883             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
884             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
885             _MM_TRANSPOSE4_PS(Y,F,G,H);
886             Heps             = _mm_mul_ps(vfeps,H);
887             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
888             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
889             velec            = _mm_mul_ps(qq13,VV);
890             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
891             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
892
893             /* Update potential sum for this i atom from the interaction with this j atom. */
894             velec            = _mm_andnot_ps(dummy_mask,velec);
895             velecsum         = _mm_add_ps(velecsum,velec);
896
897             fscal            = felec;
898
899             fscal            = _mm_andnot_ps(dummy_mask,fscal);
900
901             /* Calculate temporary vectorial force */
902             tx               = _mm_mul_ps(fscal,dx13);
903             ty               = _mm_mul_ps(fscal,dy13);
904             tz               = _mm_mul_ps(fscal,dz13);
905
906             /* Update vectorial force */
907             fix1             = _mm_add_ps(fix1,tx);
908             fiy1             = _mm_add_ps(fiy1,ty);
909             fiz1             = _mm_add_ps(fiz1,tz);
910
911             fjx3             = _mm_add_ps(fjx3,tx);
912             fjy3             = _mm_add_ps(fjy3,ty);
913             fjz3             = _mm_add_ps(fjz3,tz);
914
915             /**************************
916              * CALCULATE INTERACTIONS *
917              **************************/
918
919             r21              = _mm_mul_ps(rsq21,rinv21);
920             r21              = _mm_andnot_ps(dummy_mask,r21);
921
922             /* Calculate table index by multiplying r with table scale and truncate to integer */
923             rt               = _mm_mul_ps(r21,vftabscale);
924             vfitab           = _mm_cvttps_epi32(rt);
925             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
926             vfitab           = _mm_slli_epi32(vfitab,2);
927
928             /* CUBIC SPLINE TABLE ELECTROSTATICS */
929             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
930             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
931             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
932             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
933             _MM_TRANSPOSE4_PS(Y,F,G,H);
934             Heps             = _mm_mul_ps(vfeps,H);
935             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
936             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
937             velec            = _mm_mul_ps(qq21,VV);
938             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
939             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
940
941             /* Update potential sum for this i atom from the interaction with this j atom. */
942             velec            = _mm_andnot_ps(dummy_mask,velec);
943             velecsum         = _mm_add_ps(velecsum,velec);
944
945             fscal            = felec;
946
947             fscal            = _mm_andnot_ps(dummy_mask,fscal);
948
949             /* Calculate temporary vectorial force */
950             tx               = _mm_mul_ps(fscal,dx21);
951             ty               = _mm_mul_ps(fscal,dy21);
952             tz               = _mm_mul_ps(fscal,dz21);
953
954             /* Update vectorial force */
955             fix2             = _mm_add_ps(fix2,tx);
956             fiy2             = _mm_add_ps(fiy2,ty);
957             fiz2             = _mm_add_ps(fiz2,tz);
958
959             fjx1             = _mm_add_ps(fjx1,tx);
960             fjy1             = _mm_add_ps(fjy1,ty);
961             fjz1             = _mm_add_ps(fjz1,tz);
962
963             /**************************
964              * CALCULATE INTERACTIONS *
965              **************************/
966
967             r22              = _mm_mul_ps(rsq22,rinv22);
968             r22              = _mm_andnot_ps(dummy_mask,r22);
969
970             /* Calculate table index by multiplying r with table scale and truncate to integer */
971             rt               = _mm_mul_ps(r22,vftabscale);
972             vfitab           = _mm_cvttps_epi32(rt);
973             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
974             vfitab           = _mm_slli_epi32(vfitab,2);
975
976             /* CUBIC SPLINE TABLE ELECTROSTATICS */
977             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
978             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
979             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
980             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
981             _MM_TRANSPOSE4_PS(Y,F,G,H);
982             Heps             = _mm_mul_ps(vfeps,H);
983             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
984             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
985             velec            = _mm_mul_ps(qq22,VV);
986             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
987             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
988
989             /* Update potential sum for this i atom from the interaction with this j atom. */
990             velec            = _mm_andnot_ps(dummy_mask,velec);
991             velecsum         = _mm_add_ps(velecsum,velec);
992
993             fscal            = felec;
994
995             fscal            = _mm_andnot_ps(dummy_mask,fscal);
996
997             /* Calculate temporary vectorial force */
998             tx               = _mm_mul_ps(fscal,dx22);
999             ty               = _mm_mul_ps(fscal,dy22);
1000             tz               = _mm_mul_ps(fscal,dz22);
1001
1002             /* Update vectorial force */
1003             fix2             = _mm_add_ps(fix2,tx);
1004             fiy2             = _mm_add_ps(fiy2,ty);
1005             fiz2             = _mm_add_ps(fiz2,tz);
1006
1007             fjx2             = _mm_add_ps(fjx2,tx);
1008             fjy2             = _mm_add_ps(fjy2,ty);
1009             fjz2             = _mm_add_ps(fjz2,tz);
1010
1011             /**************************
1012              * CALCULATE INTERACTIONS *
1013              **************************/
1014
1015             r23              = _mm_mul_ps(rsq23,rinv23);
1016             r23              = _mm_andnot_ps(dummy_mask,r23);
1017
1018             /* Calculate table index by multiplying r with table scale and truncate to integer */
1019             rt               = _mm_mul_ps(r23,vftabscale);
1020             vfitab           = _mm_cvttps_epi32(rt);
1021             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1022             vfitab           = _mm_slli_epi32(vfitab,2);
1023
1024             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1025             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1026             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1027             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1028             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1029             _MM_TRANSPOSE4_PS(Y,F,G,H);
1030             Heps             = _mm_mul_ps(vfeps,H);
1031             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1032             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1033             velec            = _mm_mul_ps(qq23,VV);
1034             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1035             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1036
1037             /* Update potential sum for this i atom from the interaction with this j atom. */
1038             velec            = _mm_andnot_ps(dummy_mask,velec);
1039             velecsum         = _mm_add_ps(velecsum,velec);
1040
1041             fscal            = felec;
1042
1043             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1044
1045             /* Calculate temporary vectorial force */
1046             tx               = _mm_mul_ps(fscal,dx23);
1047             ty               = _mm_mul_ps(fscal,dy23);
1048             tz               = _mm_mul_ps(fscal,dz23);
1049
1050             /* Update vectorial force */
1051             fix2             = _mm_add_ps(fix2,tx);
1052             fiy2             = _mm_add_ps(fiy2,ty);
1053             fiz2             = _mm_add_ps(fiz2,tz);
1054
1055             fjx3             = _mm_add_ps(fjx3,tx);
1056             fjy3             = _mm_add_ps(fjy3,ty);
1057             fjz3             = _mm_add_ps(fjz3,tz);
1058
1059             /**************************
1060              * CALCULATE INTERACTIONS *
1061              **************************/
1062
1063             r31              = _mm_mul_ps(rsq31,rinv31);
1064             r31              = _mm_andnot_ps(dummy_mask,r31);
1065
1066             /* Calculate table index by multiplying r with table scale and truncate to integer */
1067             rt               = _mm_mul_ps(r31,vftabscale);
1068             vfitab           = _mm_cvttps_epi32(rt);
1069             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1070             vfitab           = _mm_slli_epi32(vfitab,2);
1071
1072             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1073             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1074             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1075             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1076             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1077             _MM_TRANSPOSE4_PS(Y,F,G,H);
1078             Heps             = _mm_mul_ps(vfeps,H);
1079             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1080             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1081             velec            = _mm_mul_ps(qq31,VV);
1082             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1083             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1084
1085             /* Update potential sum for this i atom from the interaction with this j atom. */
1086             velec            = _mm_andnot_ps(dummy_mask,velec);
1087             velecsum         = _mm_add_ps(velecsum,velec);
1088
1089             fscal            = felec;
1090
1091             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1092
1093             /* Calculate temporary vectorial force */
1094             tx               = _mm_mul_ps(fscal,dx31);
1095             ty               = _mm_mul_ps(fscal,dy31);
1096             tz               = _mm_mul_ps(fscal,dz31);
1097
1098             /* Update vectorial force */
1099             fix3             = _mm_add_ps(fix3,tx);
1100             fiy3             = _mm_add_ps(fiy3,ty);
1101             fiz3             = _mm_add_ps(fiz3,tz);
1102
1103             fjx1             = _mm_add_ps(fjx1,tx);
1104             fjy1             = _mm_add_ps(fjy1,ty);
1105             fjz1             = _mm_add_ps(fjz1,tz);
1106
1107             /**************************
1108              * CALCULATE INTERACTIONS *
1109              **************************/
1110
1111             r32              = _mm_mul_ps(rsq32,rinv32);
1112             r32              = _mm_andnot_ps(dummy_mask,r32);
1113
1114             /* Calculate table index by multiplying r with table scale and truncate to integer */
1115             rt               = _mm_mul_ps(r32,vftabscale);
1116             vfitab           = _mm_cvttps_epi32(rt);
1117             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1118             vfitab           = _mm_slli_epi32(vfitab,2);
1119
1120             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1121             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1122             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1123             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1124             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1125             _MM_TRANSPOSE4_PS(Y,F,G,H);
1126             Heps             = _mm_mul_ps(vfeps,H);
1127             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1128             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1129             velec            = _mm_mul_ps(qq32,VV);
1130             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1131             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1132
1133             /* Update potential sum for this i atom from the interaction with this j atom. */
1134             velec            = _mm_andnot_ps(dummy_mask,velec);
1135             velecsum         = _mm_add_ps(velecsum,velec);
1136
1137             fscal            = felec;
1138
1139             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1140
1141             /* Calculate temporary vectorial force */
1142             tx               = _mm_mul_ps(fscal,dx32);
1143             ty               = _mm_mul_ps(fscal,dy32);
1144             tz               = _mm_mul_ps(fscal,dz32);
1145
1146             /* Update vectorial force */
1147             fix3             = _mm_add_ps(fix3,tx);
1148             fiy3             = _mm_add_ps(fiy3,ty);
1149             fiz3             = _mm_add_ps(fiz3,tz);
1150
1151             fjx2             = _mm_add_ps(fjx2,tx);
1152             fjy2             = _mm_add_ps(fjy2,ty);
1153             fjz2             = _mm_add_ps(fjz2,tz);
1154
1155             /**************************
1156              * CALCULATE INTERACTIONS *
1157              **************************/
1158
1159             r33              = _mm_mul_ps(rsq33,rinv33);
1160             r33              = _mm_andnot_ps(dummy_mask,r33);
1161
1162             /* Calculate table index by multiplying r with table scale and truncate to integer */
1163             rt               = _mm_mul_ps(r33,vftabscale);
1164             vfitab           = _mm_cvttps_epi32(rt);
1165             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1166             vfitab           = _mm_slli_epi32(vfitab,2);
1167
1168             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1169             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1170             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1171             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1172             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1173             _MM_TRANSPOSE4_PS(Y,F,G,H);
1174             Heps             = _mm_mul_ps(vfeps,H);
1175             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1176             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1177             velec            = _mm_mul_ps(qq33,VV);
1178             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1179             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1180
1181             /* Update potential sum for this i atom from the interaction with this j atom. */
1182             velec            = _mm_andnot_ps(dummy_mask,velec);
1183             velecsum         = _mm_add_ps(velecsum,velec);
1184
1185             fscal            = felec;
1186
1187             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1188
1189             /* Calculate temporary vectorial force */
1190             tx               = _mm_mul_ps(fscal,dx33);
1191             ty               = _mm_mul_ps(fscal,dy33);
1192             tz               = _mm_mul_ps(fscal,dz33);
1193
1194             /* Update vectorial force */
1195             fix3             = _mm_add_ps(fix3,tx);
1196             fiy3             = _mm_add_ps(fiy3,ty);
1197             fiz3             = _mm_add_ps(fiz3,tz);
1198
1199             fjx3             = _mm_add_ps(fjx3,tx);
1200             fjy3             = _mm_add_ps(fjy3,ty);
1201             fjz3             = _mm_add_ps(fjz3,tz);
1202
1203             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1204             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1205             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1206             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1207
1208             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1209                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1210
1211             /* Inner loop uses 396 flops */
1212         }
1213
1214         /* End of innermost loop */
1215
1216         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1217                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1218
1219         ggid                        = gid[iidx];
1220         /* Update potential energies */
1221         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1222
1223         /* Increment number of inner iterations */
1224         inneriter                  += j_index_end - j_index_start;
1225
1226         /* Outer loop uses 19 flops */
1227     }
1228
1229     /* Increment number of outer iterations */
1230     outeriter        += nri;
1231
1232     /* Update outer/inner flops */
1233
1234     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*396);
1235 }
1236 /*
1237  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse4_1_single
1238  * Electrostatics interaction: CubicSplineTable
1239  * VdW interaction:            None
1240  * Geometry:                   Water4-Water4
1241  * Calculate force/pot:        Force
1242  */
1243 void
1244 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse4_1_single
1245                     (t_nblist                    * gmx_restrict       nlist,
1246                      rvec                        * gmx_restrict          xx,
1247                      rvec                        * gmx_restrict          ff,
1248                      t_forcerec                  * gmx_restrict          fr,
1249                      t_mdatoms                   * gmx_restrict     mdatoms,
1250                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1251                      t_nrnb                      * gmx_restrict        nrnb)
1252 {
1253     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1254      * just 0 for non-waters.
1255      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1256      * jnr indices corresponding to data put in the four positions in the SIMD register.
1257      */
1258     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1259     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1260     int              jnrA,jnrB,jnrC,jnrD;
1261     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1262     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1263     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1264     real             rcutoff_scalar;
1265     real             *shiftvec,*fshift,*x,*f;
1266     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1267     real             scratch[4*DIM];
1268     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1269     int              vdwioffset1;
1270     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1271     int              vdwioffset2;
1272     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1273     int              vdwioffset3;
1274     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1275     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1276     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1277     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1278     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1279     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1280     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1281     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1282     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1283     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1284     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1285     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1286     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1287     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1288     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1289     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1290     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1291     real             *charge;
1292     __m128i          vfitab;
1293     __m128i          ifour       = _mm_set1_epi32(4);
1294     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1295     real             *vftab;
1296     __m128           dummy_mask,cutoff_mask;
1297     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1298     __m128           one     = _mm_set1_ps(1.0);
1299     __m128           two     = _mm_set1_ps(2.0);
1300     x                = xx[0];
1301     f                = ff[0];
1302
1303     nri              = nlist->nri;
1304     iinr             = nlist->iinr;
1305     jindex           = nlist->jindex;
1306     jjnr             = nlist->jjnr;
1307     shiftidx         = nlist->shift;
1308     gid              = nlist->gid;
1309     shiftvec         = fr->shift_vec[0];
1310     fshift           = fr->fshift[0];
1311     facel            = _mm_set1_ps(fr->epsfac);
1312     charge           = mdatoms->chargeA;
1313
1314     vftab            = kernel_data->table_elec->data;
1315     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
1316
1317     /* Setup water-specific parameters */
1318     inr              = nlist->iinr[0];
1319     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1320     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1321     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1322
1323     jq1              = _mm_set1_ps(charge[inr+1]);
1324     jq2              = _mm_set1_ps(charge[inr+2]);
1325     jq3              = _mm_set1_ps(charge[inr+3]);
1326     qq11             = _mm_mul_ps(iq1,jq1);
1327     qq12             = _mm_mul_ps(iq1,jq2);
1328     qq13             = _mm_mul_ps(iq1,jq3);
1329     qq21             = _mm_mul_ps(iq2,jq1);
1330     qq22             = _mm_mul_ps(iq2,jq2);
1331     qq23             = _mm_mul_ps(iq2,jq3);
1332     qq31             = _mm_mul_ps(iq3,jq1);
1333     qq32             = _mm_mul_ps(iq3,jq2);
1334     qq33             = _mm_mul_ps(iq3,jq3);
1335
1336     /* Avoid stupid compiler warnings */
1337     jnrA = jnrB = jnrC = jnrD = 0;
1338     j_coord_offsetA = 0;
1339     j_coord_offsetB = 0;
1340     j_coord_offsetC = 0;
1341     j_coord_offsetD = 0;
1342
1343     outeriter        = 0;
1344     inneriter        = 0;
1345
1346     for(iidx=0;iidx<4*DIM;iidx++)
1347     {
1348         scratch[iidx] = 0.0;
1349     }
1350
1351     /* Start outer loop over neighborlists */
1352     for(iidx=0; iidx<nri; iidx++)
1353     {
1354         /* Load shift vector for this list */
1355         i_shift_offset   = DIM*shiftidx[iidx];
1356
1357         /* Load limits for loop over neighbors */
1358         j_index_start    = jindex[iidx];
1359         j_index_end      = jindex[iidx+1];
1360
1361         /* Get outer coordinate index */
1362         inr              = iinr[iidx];
1363         i_coord_offset   = DIM*inr;
1364
1365         /* Load i particle coords and add shift vector */
1366         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1367                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1368
1369         fix1             = _mm_setzero_ps();
1370         fiy1             = _mm_setzero_ps();
1371         fiz1             = _mm_setzero_ps();
1372         fix2             = _mm_setzero_ps();
1373         fiy2             = _mm_setzero_ps();
1374         fiz2             = _mm_setzero_ps();
1375         fix3             = _mm_setzero_ps();
1376         fiy3             = _mm_setzero_ps();
1377         fiz3             = _mm_setzero_ps();
1378
1379         /* Start inner kernel loop */
1380         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1381         {
1382
1383             /* Get j neighbor index, and coordinate index */
1384             jnrA             = jjnr[jidx];
1385             jnrB             = jjnr[jidx+1];
1386             jnrC             = jjnr[jidx+2];
1387             jnrD             = jjnr[jidx+3];
1388             j_coord_offsetA  = DIM*jnrA;
1389             j_coord_offsetB  = DIM*jnrB;
1390             j_coord_offsetC  = DIM*jnrC;
1391             j_coord_offsetD  = DIM*jnrD;
1392
1393             /* load j atom coordinates */
1394             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1395                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1396                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1397
1398             /* Calculate displacement vector */
1399             dx11             = _mm_sub_ps(ix1,jx1);
1400             dy11             = _mm_sub_ps(iy1,jy1);
1401             dz11             = _mm_sub_ps(iz1,jz1);
1402             dx12             = _mm_sub_ps(ix1,jx2);
1403             dy12             = _mm_sub_ps(iy1,jy2);
1404             dz12             = _mm_sub_ps(iz1,jz2);
1405             dx13             = _mm_sub_ps(ix1,jx3);
1406             dy13             = _mm_sub_ps(iy1,jy3);
1407             dz13             = _mm_sub_ps(iz1,jz3);
1408             dx21             = _mm_sub_ps(ix2,jx1);
1409             dy21             = _mm_sub_ps(iy2,jy1);
1410             dz21             = _mm_sub_ps(iz2,jz1);
1411             dx22             = _mm_sub_ps(ix2,jx2);
1412             dy22             = _mm_sub_ps(iy2,jy2);
1413             dz22             = _mm_sub_ps(iz2,jz2);
1414             dx23             = _mm_sub_ps(ix2,jx3);
1415             dy23             = _mm_sub_ps(iy2,jy3);
1416             dz23             = _mm_sub_ps(iz2,jz3);
1417             dx31             = _mm_sub_ps(ix3,jx1);
1418             dy31             = _mm_sub_ps(iy3,jy1);
1419             dz31             = _mm_sub_ps(iz3,jz1);
1420             dx32             = _mm_sub_ps(ix3,jx2);
1421             dy32             = _mm_sub_ps(iy3,jy2);
1422             dz32             = _mm_sub_ps(iz3,jz2);
1423             dx33             = _mm_sub_ps(ix3,jx3);
1424             dy33             = _mm_sub_ps(iy3,jy3);
1425             dz33             = _mm_sub_ps(iz3,jz3);
1426
1427             /* Calculate squared distance and things based on it */
1428             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1429             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1430             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1431             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1432             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1433             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1434             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1435             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1436             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1437
1438             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1439             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1440             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1441             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1442             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1443             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1444             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1445             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1446             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1447
1448             fjx1             = _mm_setzero_ps();
1449             fjy1             = _mm_setzero_ps();
1450             fjz1             = _mm_setzero_ps();
1451             fjx2             = _mm_setzero_ps();
1452             fjy2             = _mm_setzero_ps();
1453             fjz2             = _mm_setzero_ps();
1454             fjx3             = _mm_setzero_ps();
1455             fjy3             = _mm_setzero_ps();
1456             fjz3             = _mm_setzero_ps();
1457
1458             /**************************
1459              * CALCULATE INTERACTIONS *
1460              **************************/
1461
1462             r11              = _mm_mul_ps(rsq11,rinv11);
1463
1464             /* Calculate table index by multiplying r with table scale and truncate to integer */
1465             rt               = _mm_mul_ps(r11,vftabscale);
1466             vfitab           = _mm_cvttps_epi32(rt);
1467             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1468             vfitab           = _mm_slli_epi32(vfitab,2);
1469
1470             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1471             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1472             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1473             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1474             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1475             _MM_TRANSPOSE4_PS(Y,F,G,H);
1476             Heps             = _mm_mul_ps(vfeps,H);
1477             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1478             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1479             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1480
1481             fscal            = felec;
1482
1483             /* Calculate temporary vectorial force */
1484             tx               = _mm_mul_ps(fscal,dx11);
1485             ty               = _mm_mul_ps(fscal,dy11);
1486             tz               = _mm_mul_ps(fscal,dz11);
1487
1488             /* Update vectorial force */
1489             fix1             = _mm_add_ps(fix1,tx);
1490             fiy1             = _mm_add_ps(fiy1,ty);
1491             fiz1             = _mm_add_ps(fiz1,tz);
1492
1493             fjx1             = _mm_add_ps(fjx1,tx);
1494             fjy1             = _mm_add_ps(fjy1,ty);
1495             fjz1             = _mm_add_ps(fjz1,tz);
1496
1497             /**************************
1498              * CALCULATE INTERACTIONS *
1499              **************************/
1500
1501             r12              = _mm_mul_ps(rsq12,rinv12);
1502
1503             /* Calculate table index by multiplying r with table scale and truncate to integer */
1504             rt               = _mm_mul_ps(r12,vftabscale);
1505             vfitab           = _mm_cvttps_epi32(rt);
1506             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1507             vfitab           = _mm_slli_epi32(vfitab,2);
1508
1509             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1510             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1511             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1512             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1513             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1514             _MM_TRANSPOSE4_PS(Y,F,G,H);
1515             Heps             = _mm_mul_ps(vfeps,H);
1516             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1517             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1518             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1519
1520             fscal            = felec;
1521
1522             /* Calculate temporary vectorial force */
1523             tx               = _mm_mul_ps(fscal,dx12);
1524             ty               = _mm_mul_ps(fscal,dy12);
1525             tz               = _mm_mul_ps(fscal,dz12);
1526
1527             /* Update vectorial force */
1528             fix1             = _mm_add_ps(fix1,tx);
1529             fiy1             = _mm_add_ps(fiy1,ty);
1530             fiz1             = _mm_add_ps(fiz1,tz);
1531
1532             fjx2             = _mm_add_ps(fjx2,tx);
1533             fjy2             = _mm_add_ps(fjy2,ty);
1534             fjz2             = _mm_add_ps(fjz2,tz);
1535
1536             /**************************
1537              * CALCULATE INTERACTIONS *
1538              **************************/
1539
1540             r13              = _mm_mul_ps(rsq13,rinv13);
1541
1542             /* Calculate table index by multiplying r with table scale and truncate to integer */
1543             rt               = _mm_mul_ps(r13,vftabscale);
1544             vfitab           = _mm_cvttps_epi32(rt);
1545             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1546             vfitab           = _mm_slli_epi32(vfitab,2);
1547
1548             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1549             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1550             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1551             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1552             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1553             _MM_TRANSPOSE4_PS(Y,F,G,H);
1554             Heps             = _mm_mul_ps(vfeps,H);
1555             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1556             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1557             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1558
1559             fscal            = felec;
1560
1561             /* Calculate temporary vectorial force */
1562             tx               = _mm_mul_ps(fscal,dx13);
1563             ty               = _mm_mul_ps(fscal,dy13);
1564             tz               = _mm_mul_ps(fscal,dz13);
1565
1566             /* Update vectorial force */
1567             fix1             = _mm_add_ps(fix1,tx);
1568             fiy1             = _mm_add_ps(fiy1,ty);
1569             fiz1             = _mm_add_ps(fiz1,tz);
1570
1571             fjx3             = _mm_add_ps(fjx3,tx);
1572             fjy3             = _mm_add_ps(fjy3,ty);
1573             fjz3             = _mm_add_ps(fjz3,tz);
1574
1575             /**************************
1576              * CALCULATE INTERACTIONS *
1577              **************************/
1578
1579             r21              = _mm_mul_ps(rsq21,rinv21);
1580
1581             /* Calculate table index by multiplying r with table scale and truncate to integer */
1582             rt               = _mm_mul_ps(r21,vftabscale);
1583             vfitab           = _mm_cvttps_epi32(rt);
1584             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1585             vfitab           = _mm_slli_epi32(vfitab,2);
1586
1587             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1588             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1589             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1590             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1591             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1592             _MM_TRANSPOSE4_PS(Y,F,G,H);
1593             Heps             = _mm_mul_ps(vfeps,H);
1594             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1595             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1596             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1597
1598             fscal            = felec;
1599
1600             /* Calculate temporary vectorial force */
1601             tx               = _mm_mul_ps(fscal,dx21);
1602             ty               = _mm_mul_ps(fscal,dy21);
1603             tz               = _mm_mul_ps(fscal,dz21);
1604
1605             /* Update vectorial force */
1606             fix2             = _mm_add_ps(fix2,tx);
1607             fiy2             = _mm_add_ps(fiy2,ty);
1608             fiz2             = _mm_add_ps(fiz2,tz);
1609
1610             fjx1             = _mm_add_ps(fjx1,tx);
1611             fjy1             = _mm_add_ps(fjy1,ty);
1612             fjz1             = _mm_add_ps(fjz1,tz);
1613
1614             /**************************
1615              * CALCULATE INTERACTIONS *
1616              **************************/
1617
1618             r22              = _mm_mul_ps(rsq22,rinv22);
1619
1620             /* Calculate table index by multiplying r with table scale and truncate to integer */
1621             rt               = _mm_mul_ps(r22,vftabscale);
1622             vfitab           = _mm_cvttps_epi32(rt);
1623             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1624             vfitab           = _mm_slli_epi32(vfitab,2);
1625
1626             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1627             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1628             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1629             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1630             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1631             _MM_TRANSPOSE4_PS(Y,F,G,H);
1632             Heps             = _mm_mul_ps(vfeps,H);
1633             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1634             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1635             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1636
1637             fscal            = felec;
1638
1639             /* Calculate temporary vectorial force */
1640             tx               = _mm_mul_ps(fscal,dx22);
1641             ty               = _mm_mul_ps(fscal,dy22);
1642             tz               = _mm_mul_ps(fscal,dz22);
1643
1644             /* Update vectorial force */
1645             fix2             = _mm_add_ps(fix2,tx);
1646             fiy2             = _mm_add_ps(fiy2,ty);
1647             fiz2             = _mm_add_ps(fiz2,tz);
1648
1649             fjx2             = _mm_add_ps(fjx2,tx);
1650             fjy2             = _mm_add_ps(fjy2,ty);
1651             fjz2             = _mm_add_ps(fjz2,tz);
1652
1653             /**************************
1654              * CALCULATE INTERACTIONS *
1655              **************************/
1656
1657             r23              = _mm_mul_ps(rsq23,rinv23);
1658
1659             /* Calculate table index by multiplying r with table scale and truncate to integer */
1660             rt               = _mm_mul_ps(r23,vftabscale);
1661             vfitab           = _mm_cvttps_epi32(rt);
1662             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1663             vfitab           = _mm_slli_epi32(vfitab,2);
1664
1665             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1666             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1667             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1668             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1669             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1670             _MM_TRANSPOSE4_PS(Y,F,G,H);
1671             Heps             = _mm_mul_ps(vfeps,H);
1672             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1673             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1674             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1675
1676             fscal            = felec;
1677
1678             /* Calculate temporary vectorial force */
1679             tx               = _mm_mul_ps(fscal,dx23);
1680             ty               = _mm_mul_ps(fscal,dy23);
1681             tz               = _mm_mul_ps(fscal,dz23);
1682
1683             /* Update vectorial force */
1684             fix2             = _mm_add_ps(fix2,tx);
1685             fiy2             = _mm_add_ps(fiy2,ty);
1686             fiz2             = _mm_add_ps(fiz2,tz);
1687
1688             fjx3             = _mm_add_ps(fjx3,tx);
1689             fjy3             = _mm_add_ps(fjy3,ty);
1690             fjz3             = _mm_add_ps(fjz3,tz);
1691
1692             /**************************
1693              * CALCULATE INTERACTIONS *
1694              **************************/
1695
1696             r31              = _mm_mul_ps(rsq31,rinv31);
1697
1698             /* Calculate table index by multiplying r with table scale and truncate to integer */
1699             rt               = _mm_mul_ps(r31,vftabscale);
1700             vfitab           = _mm_cvttps_epi32(rt);
1701             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1702             vfitab           = _mm_slli_epi32(vfitab,2);
1703
1704             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1705             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1706             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1707             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1708             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1709             _MM_TRANSPOSE4_PS(Y,F,G,H);
1710             Heps             = _mm_mul_ps(vfeps,H);
1711             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1712             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1713             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1714
1715             fscal            = felec;
1716
1717             /* Calculate temporary vectorial force */
1718             tx               = _mm_mul_ps(fscal,dx31);
1719             ty               = _mm_mul_ps(fscal,dy31);
1720             tz               = _mm_mul_ps(fscal,dz31);
1721
1722             /* Update vectorial force */
1723             fix3             = _mm_add_ps(fix3,tx);
1724             fiy3             = _mm_add_ps(fiy3,ty);
1725             fiz3             = _mm_add_ps(fiz3,tz);
1726
1727             fjx1             = _mm_add_ps(fjx1,tx);
1728             fjy1             = _mm_add_ps(fjy1,ty);
1729             fjz1             = _mm_add_ps(fjz1,tz);
1730
1731             /**************************
1732              * CALCULATE INTERACTIONS *
1733              **************************/
1734
1735             r32              = _mm_mul_ps(rsq32,rinv32);
1736
1737             /* Calculate table index by multiplying r with table scale and truncate to integer */
1738             rt               = _mm_mul_ps(r32,vftabscale);
1739             vfitab           = _mm_cvttps_epi32(rt);
1740             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1741             vfitab           = _mm_slli_epi32(vfitab,2);
1742
1743             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1744             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1745             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1746             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1747             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1748             _MM_TRANSPOSE4_PS(Y,F,G,H);
1749             Heps             = _mm_mul_ps(vfeps,H);
1750             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1751             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1752             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1753
1754             fscal            = felec;
1755
1756             /* Calculate temporary vectorial force */
1757             tx               = _mm_mul_ps(fscal,dx32);
1758             ty               = _mm_mul_ps(fscal,dy32);
1759             tz               = _mm_mul_ps(fscal,dz32);
1760
1761             /* Update vectorial force */
1762             fix3             = _mm_add_ps(fix3,tx);
1763             fiy3             = _mm_add_ps(fiy3,ty);
1764             fiz3             = _mm_add_ps(fiz3,tz);
1765
1766             fjx2             = _mm_add_ps(fjx2,tx);
1767             fjy2             = _mm_add_ps(fjy2,ty);
1768             fjz2             = _mm_add_ps(fjz2,tz);
1769
1770             /**************************
1771              * CALCULATE INTERACTIONS *
1772              **************************/
1773
1774             r33              = _mm_mul_ps(rsq33,rinv33);
1775
1776             /* Calculate table index by multiplying r with table scale and truncate to integer */
1777             rt               = _mm_mul_ps(r33,vftabscale);
1778             vfitab           = _mm_cvttps_epi32(rt);
1779             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1780             vfitab           = _mm_slli_epi32(vfitab,2);
1781
1782             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1783             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1784             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1785             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1786             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1787             _MM_TRANSPOSE4_PS(Y,F,G,H);
1788             Heps             = _mm_mul_ps(vfeps,H);
1789             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1790             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1791             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1792
1793             fscal            = felec;
1794
1795             /* Calculate temporary vectorial force */
1796             tx               = _mm_mul_ps(fscal,dx33);
1797             ty               = _mm_mul_ps(fscal,dy33);
1798             tz               = _mm_mul_ps(fscal,dz33);
1799
1800             /* Update vectorial force */
1801             fix3             = _mm_add_ps(fix3,tx);
1802             fiy3             = _mm_add_ps(fiy3,ty);
1803             fiz3             = _mm_add_ps(fiz3,tz);
1804
1805             fjx3             = _mm_add_ps(fjx3,tx);
1806             fjy3             = _mm_add_ps(fjy3,ty);
1807             fjz3             = _mm_add_ps(fjz3,tz);
1808
1809             fjptrA             = f+j_coord_offsetA;
1810             fjptrB             = f+j_coord_offsetB;
1811             fjptrC             = f+j_coord_offsetC;
1812             fjptrD             = f+j_coord_offsetD;
1813
1814             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1815                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1816
1817             /* Inner loop uses 351 flops */
1818         }
1819
1820         if(jidx<j_index_end)
1821         {
1822
1823             /* Get j neighbor index, and coordinate index */
1824             jnrlistA         = jjnr[jidx];
1825             jnrlistB         = jjnr[jidx+1];
1826             jnrlistC         = jjnr[jidx+2];
1827             jnrlistD         = jjnr[jidx+3];
1828             /* Sign of each element will be negative for non-real atoms.
1829              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1830              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1831              */
1832             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1833             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1834             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1835             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1836             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1837             j_coord_offsetA  = DIM*jnrA;
1838             j_coord_offsetB  = DIM*jnrB;
1839             j_coord_offsetC  = DIM*jnrC;
1840             j_coord_offsetD  = DIM*jnrD;
1841
1842             /* load j atom coordinates */
1843             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1844                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1845                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1846
1847             /* Calculate displacement vector */
1848             dx11             = _mm_sub_ps(ix1,jx1);
1849             dy11             = _mm_sub_ps(iy1,jy1);
1850             dz11             = _mm_sub_ps(iz1,jz1);
1851             dx12             = _mm_sub_ps(ix1,jx2);
1852             dy12             = _mm_sub_ps(iy1,jy2);
1853             dz12             = _mm_sub_ps(iz1,jz2);
1854             dx13             = _mm_sub_ps(ix1,jx3);
1855             dy13             = _mm_sub_ps(iy1,jy3);
1856             dz13             = _mm_sub_ps(iz1,jz3);
1857             dx21             = _mm_sub_ps(ix2,jx1);
1858             dy21             = _mm_sub_ps(iy2,jy1);
1859             dz21             = _mm_sub_ps(iz2,jz1);
1860             dx22             = _mm_sub_ps(ix2,jx2);
1861             dy22             = _mm_sub_ps(iy2,jy2);
1862             dz22             = _mm_sub_ps(iz2,jz2);
1863             dx23             = _mm_sub_ps(ix2,jx3);
1864             dy23             = _mm_sub_ps(iy2,jy3);
1865             dz23             = _mm_sub_ps(iz2,jz3);
1866             dx31             = _mm_sub_ps(ix3,jx1);
1867             dy31             = _mm_sub_ps(iy3,jy1);
1868             dz31             = _mm_sub_ps(iz3,jz1);
1869             dx32             = _mm_sub_ps(ix3,jx2);
1870             dy32             = _mm_sub_ps(iy3,jy2);
1871             dz32             = _mm_sub_ps(iz3,jz2);
1872             dx33             = _mm_sub_ps(ix3,jx3);
1873             dy33             = _mm_sub_ps(iy3,jy3);
1874             dz33             = _mm_sub_ps(iz3,jz3);
1875
1876             /* Calculate squared distance and things based on it */
1877             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1878             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1879             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1880             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1881             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1882             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1883             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1884             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1885             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1886
1887             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1888             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1889             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1890             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1891             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1892             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1893             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1894             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1895             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1896
1897             fjx1             = _mm_setzero_ps();
1898             fjy1             = _mm_setzero_ps();
1899             fjz1             = _mm_setzero_ps();
1900             fjx2             = _mm_setzero_ps();
1901             fjy2             = _mm_setzero_ps();
1902             fjz2             = _mm_setzero_ps();
1903             fjx3             = _mm_setzero_ps();
1904             fjy3             = _mm_setzero_ps();
1905             fjz3             = _mm_setzero_ps();
1906
1907             /**************************
1908              * CALCULATE INTERACTIONS *
1909              **************************/
1910
1911             r11              = _mm_mul_ps(rsq11,rinv11);
1912             r11              = _mm_andnot_ps(dummy_mask,r11);
1913
1914             /* Calculate table index by multiplying r with table scale and truncate to integer */
1915             rt               = _mm_mul_ps(r11,vftabscale);
1916             vfitab           = _mm_cvttps_epi32(rt);
1917             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1918             vfitab           = _mm_slli_epi32(vfitab,2);
1919
1920             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1921             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1922             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1923             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1924             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1925             _MM_TRANSPOSE4_PS(Y,F,G,H);
1926             Heps             = _mm_mul_ps(vfeps,H);
1927             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1928             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1929             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1930
1931             fscal            = felec;
1932
1933             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1934
1935             /* Calculate temporary vectorial force */
1936             tx               = _mm_mul_ps(fscal,dx11);
1937             ty               = _mm_mul_ps(fscal,dy11);
1938             tz               = _mm_mul_ps(fscal,dz11);
1939
1940             /* Update vectorial force */
1941             fix1             = _mm_add_ps(fix1,tx);
1942             fiy1             = _mm_add_ps(fiy1,ty);
1943             fiz1             = _mm_add_ps(fiz1,tz);
1944
1945             fjx1             = _mm_add_ps(fjx1,tx);
1946             fjy1             = _mm_add_ps(fjy1,ty);
1947             fjz1             = _mm_add_ps(fjz1,tz);
1948
1949             /**************************
1950              * CALCULATE INTERACTIONS *
1951              **************************/
1952
1953             r12              = _mm_mul_ps(rsq12,rinv12);
1954             r12              = _mm_andnot_ps(dummy_mask,r12);
1955
1956             /* Calculate table index by multiplying r with table scale and truncate to integer */
1957             rt               = _mm_mul_ps(r12,vftabscale);
1958             vfitab           = _mm_cvttps_epi32(rt);
1959             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1960             vfitab           = _mm_slli_epi32(vfitab,2);
1961
1962             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1963             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1964             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1965             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1966             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1967             _MM_TRANSPOSE4_PS(Y,F,G,H);
1968             Heps             = _mm_mul_ps(vfeps,H);
1969             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1970             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1971             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1972
1973             fscal            = felec;
1974
1975             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1976
1977             /* Calculate temporary vectorial force */
1978             tx               = _mm_mul_ps(fscal,dx12);
1979             ty               = _mm_mul_ps(fscal,dy12);
1980             tz               = _mm_mul_ps(fscal,dz12);
1981
1982             /* Update vectorial force */
1983             fix1             = _mm_add_ps(fix1,tx);
1984             fiy1             = _mm_add_ps(fiy1,ty);
1985             fiz1             = _mm_add_ps(fiz1,tz);
1986
1987             fjx2             = _mm_add_ps(fjx2,tx);
1988             fjy2             = _mm_add_ps(fjy2,ty);
1989             fjz2             = _mm_add_ps(fjz2,tz);
1990
1991             /**************************
1992              * CALCULATE INTERACTIONS *
1993              **************************/
1994
1995             r13              = _mm_mul_ps(rsq13,rinv13);
1996             r13              = _mm_andnot_ps(dummy_mask,r13);
1997
1998             /* Calculate table index by multiplying r with table scale and truncate to integer */
1999             rt               = _mm_mul_ps(r13,vftabscale);
2000             vfitab           = _mm_cvttps_epi32(rt);
2001             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2002             vfitab           = _mm_slli_epi32(vfitab,2);
2003
2004             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2005             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2006             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2007             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2008             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2009             _MM_TRANSPOSE4_PS(Y,F,G,H);
2010             Heps             = _mm_mul_ps(vfeps,H);
2011             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2012             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2013             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
2014
2015             fscal            = felec;
2016
2017             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2018
2019             /* Calculate temporary vectorial force */
2020             tx               = _mm_mul_ps(fscal,dx13);
2021             ty               = _mm_mul_ps(fscal,dy13);
2022             tz               = _mm_mul_ps(fscal,dz13);
2023
2024             /* Update vectorial force */
2025             fix1             = _mm_add_ps(fix1,tx);
2026             fiy1             = _mm_add_ps(fiy1,ty);
2027             fiz1             = _mm_add_ps(fiz1,tz);
2028
2029             fjx3             = _mm_add_ps(fjx3,tx);
2030             fjy3             = _mm_add_ps(fjy3,ty);
2031             fjz3             = _mm_add_ps(fjz3,tz);
2032
2033             /**************************
2034              * CALCULATE INTERACTIONS *
2035              **************************/
2036
2037             r21              = _mm_mul_ps(rsq21,rinv21);
2038             r21              = _mm_andnot_ps(dummy_mask,r21);
2039
2040             /* Calculate table index by multiplying r with table scale and truncate to integer */
2041             rt               = _mm_mul_ps(r21,vftabscale);
2042             vfitab           = _mm_cvttps_epi32(rt);
2043             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2044             vfitab           = _mm_slli_epi32(vfitab,2);
2045
2046             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2047             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2048             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2049             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2050             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2051             _MM_TRANSPOSE4_PS(Y,F,G,H);
2052             Heps             = _mm_mul_ps(vfeps,H);
2053             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2054             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2055             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2056
2057             fscal            = felec;
2058
2059             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2060
2061             /* Calculate temporary vectorial force */
2062             tx               = _mm_mul_ps(fscal,dx21);
2063             ty               = _mm_mul_ps(fscal,dy21);
2064             tz               = _mm_mul_ps(fscal,dz21);
2065
2066             /* Update vectorial force */
2067             fix2             = _mm_add_ps(fix2,tx);
2068             fiy2             = _mm_add_ps(fiy2,ty);
2069             fiz2             = _mm_add_ps(fiz2,tz);
2070
2071             fjx1             = _mm_add_ps(fjx1,tx);
2072             fjy1             = _mm_add_ps(fjy1,ty);
2073             fjz1             = _mm_add_ps(fjz1,tz);
2074
2075             /**************************
2076              * CALCULATE INTERACTIONS *
2077              **************************/
2078
2079             r22              = _mm_mul_ps(rsq22,rinv22);
2080             r22              = _mm_andnot_ps(dummy_mask,r22);
2081
2082             /* Calculate table index by multiplying r with table scale and truncate to integer */
2083             rt               = _mm_mul_ps(r22,vftabscale);
2084             vfitab           = _mm_cvttps_epi32(rt);
2085             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2086             vfitab           = _mm_slli_epi32(vfitab,2);
2087
2088             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2089             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2090             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2091             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2092             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2093             _MM_TRANSPOSE4_PS(Y,F,G,H);
2094             Heps             = _mm_mul_ps(vfeps,H);
2095             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2096             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2097             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2098
2099             fscal            = felec;
2100
2101             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2102
2103             /* Calculate temporary vectorial force */
2104             tx               = _mm_mul_ps(fscal,dx22);
2105             ty               = _mm_mul_ps(fscal,dy22);
2106             tz               = _mm_mul_ps(fscal,dz22);
2107
2108             /* Update vectorial force */
2109             fix2             = _mm_add_ps(fix2,tx);
2110             fiy2             = _mm_add_ps(fiy2,ty);
2111             fiz2             = _mm_add_ps(fiz2,tz);
2112
2113             fjx2             = _mm_add_ps(fjx2,tx);
2114             fjy2             = _mm_add_ps(fjy2,ty);
2115             fjz2             = _mm_add_ps(fjz2,tz);
2116
2117             /**************************
2118              * CALCULATE INTERACTIONS *
2119              **************************/
2120
2121             r23              = _mm_mul_ps(rsq23,rinv23);
2122             r23              = _mm_andnot_ps(dummy_mask,r23);
2123
2124             /* Calculate table index by multiplying r with table scale and truncate to integer */
2125             rt               = _mm_mul_ps(r23,vftabscale);
2126             vfitab           = _mm_cvttps_epi32(rt);
2127             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2128             vfitab           = _mm_slli_epi32(vfitab,2);
2129
2130             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2131             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2132             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2133             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2134             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2135             _MM_TRANSPOSE4_PS(Y,F,G,H);
2136             Heps             = _mm_mul_ps(vfeps,H);
2137             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2138             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2139             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2140
2141             fscal            = felec;
2142
2143             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2144
2145             /* Calculate temporary vectorial force */
2146             tx               = _mm_mul_ps(fscal,dx23);
2147             ty               = _mm_mul_ps(fscal,dy23);
2148             tz               = _mm_mul_ps(fscal,dz23);
2149
2150             /* Update vectorial force */
2151             fix2             = _mm_add_ps(fix2,tx);
2152             fiy2             = _mm_add_ps(fiy2,ty);
2153             fiz2             = _mm_add_ps(fiz2,tz);
2154
2155             fjx3             = _mm_add_ps(fjx3,tx);
2156             fjy3             = _mm_add_ps(fjy3,ty);
2157             fjz3             = _mm_add_ps(fjz3,tz);
2158
2159             /**************************
2160              * CALCULATE INTERACTIONS *
2161              **************************/
2162
2163             r31              = _mm_mul_ps(rsq31,rinv31);
2164             r31              = _mm_andnot_ps(dummy_mask,r31);
2165
2166             /* Calculate table index by multiplying r with table scale and truncate to integer */
2167             rt               = _mm_mul_ps(r31,vftabscale);
2168             vfitab           = _mm_cvttps_epi32(rt);
2169             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2170             vfitab           = _mm_slli_epi32(vfitab,2);
2171
2172             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2173             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2174             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2175             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2176             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2177             _MM_TRANSPOSE4_PS(Y,F,G,H);
2178             Heps             = _mm_mul_ps(vfeps,H);
2179             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2180             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2181             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2182
2183             fscal            = felec;
2184
2185             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2186
2187             /* Calculate temporary vectorial force */
2188             tx               = _mm_mul_ps(fscal,dx31);
2189             ty               = _mm_mul_ps(fscal,dy31);
2190             tz               = _mm_mul_ps(fscal,dz31);
2191
2192             /* Update vectorial force */
2193             fix3             = _mm_add_ps(fix3,tx);
2194             fiy3             = _mm_add_ps(fiy3,ty);
2195             fiz3             = _mm_add_ps(fiz3,tz);
2196
2197             fjx1             = _mm_add_ps(fjx1,tx);
2198             fjy1             = _mm_add_ps(fjy1,ty);
2199             fjz1             = _mm_add_ps(fjz1,tz);
2200
2201             /**************************
2202              * CALCULATE INTERACTIONS *
2203              **************************/
2204
2205             r32              = _mm_mul_ps(rsq32,rinv32);
2206             r32              = _mm_andnot_ps(dummy_mask,r32);
2207
2208             /* Calculate table index by multiplying r with table scale and truncate to integer */
2209             rt               = _mm_mul_ps(r32,vftabscale);
2210             vfitab           = _mm_cvttps_epi32(rt);
2211             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2212             vfitab           = _mm_slli_epi32(vfitab,2);
2213
2214             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2215             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2216             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2217             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2218             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2219             _MM_TRANSPOSE4_PS(Y,F,G,H);
2220             Heps             = _mm_mul_ps(vfeps,H);
2221             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2222             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2223             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2224
2225             fscal            = felec;
2226
2227             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2228
2229             /* Calculate temporary vectorial force */
2230             tx               = _mm_mul_ps(fscal,dx32);
2231             ty               = _mm_mul_ps(fscal,dy32);
2232             tz               = _mm_mul_ps(fscal,dz32);
2233
2234             /* Update vectorial force */
2235             fix3             = _mm_add_ps(fix3,tx);
2236             fiy3             = _mm_add_ps(fiy3,ty);
2237             fiz3             = _mm_add_ps(fiz3,tz);
2238
2239             fjx2             = _mm_add_ps(fjx2,tx);
2240             fjy2             = _mm_add_ps(fjy2,ty);
2241             fjz2             = _mm_add_ps(fjz2,tz);
2242
2243             /**************************
2244              * CALCULATE INTERACTIONS *
2245              **************************/
2246
2247             r33              = _mm_mul_ps(rsq33,rinv33);
2248             r33              = _mm_andnot_ps(dummy_mask,r33);
2249
2250             /* Calculate table index by multiplying r with table scale and truncate to integer */
2251             rt               = _mm_mul_ps(r33,vftabscale);
2252             vfitab           = _mm_cvttps_epi32(rt);
2253             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2254             vfitab           = _mm_slli_epi32(vfitab,2);
2255
2256             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2257             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2258             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2259             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2260             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2261             _MM_TRANSPOSE4_PS(Y,F,G,H);
2262             Heps             = _mm_mul_ps(vfeps,H);
2263             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2264             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2265             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2266
2267             fscal            = felec;
2268
2269             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2270
2271             /* Calculate temporary vectorial force */
2272             tx               = _mm_mul_ps(fscal,dx33);
2273             ty               = _mm_mul_ps(fscal,dy33);
2274             tz               = _mm_mul_ps(fscal,dz33);
2275
2276             /* Update vectorial force */
2277             fix3             = _mm_add_ps(fix3,tx);
2278             fiy3             = _mm_add_ps(fiy3,ty);
2279             fiz3             = _mm_add_ps(fiz3,tz);
2280
2281             fjx3             = _mm_add_ps(fjx3,tx);
2282             fjy3             = _mm_add_ps(fjy3,ty);
2283             fjz3             = _mm_add_ps(fjz3,tz);
2284
2285             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2286             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2287             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2288             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2289
2290             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2291                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2292
2293             /* Inner loop uses 360 flops */
2294         }
2295
2296         /* End of innermost loop */
2297
2298         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2299                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
2300
2301         /* Increment number of inner iterations */
2302         inneriter                  += j_index_end - j_index_start;
2303
2304         /* Outer loop uses 18 flops */
2305     }
2306
2307     /* Increment number of outer iterations */
2308     outeriter        += nri;
2309
2310     /* Update outer/inner flops */
2311
2312     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*360);
2313 }