3477a68e1646edb075fd1fe5a76730aed3b61ccd
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse4_1_single.h"
50 #include "kernelutil_x86_sse4_1_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse4_1_single
54  * Electrostatics interaction: CubicSplineTable
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse4_1_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwioffset3;
92     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
100     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
101     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
102     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
105     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
108     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
109     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
110     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
111     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
112     real             *charge;
113     int              nvdwtype;
114     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115     int              *vdwtype;
116     real             *vdwparam;
117     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
118     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
119     __m128i          vfitab;
120     __m128i          ifour       = _mm_set1_epi32(4);
121     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
122     real             *vftab;
123     __m128           dummy_mask,cutoff_mask;
124     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
125     __m128           one     = _mm_set1_ps(1.0);
126     __m128           two     = _mm_set1_ps(2.0);
127     x                = xx[0];
128     f                = ff[0];
129
130     nri              = nlist->nri;
131     iinr             = nlist->iinr;
132     jindex           = nlist->jindex;
133     jjnr             = nlist->jjnr;
134     shiftidx         = nlist->shift;
135     gid              = nlist->gid;
136     shiftvec         = fr->shift_vec[0];
137     fshift           = fr->fshift[0];
138     facel            = _mm_set1_ps(fr->epsfac);
139     charge           = mdatoms->chargeA;
140     nvdwtype         = fr->ntype;
141     vdwparam         = fr->nbfp;
142     vdwtype          = mdatoms->typeA;
143
144     vftab            = kernel_data->table_elec->data;
145     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
146
147     /* Setup water-specific parameters */
148     inr              = nlist->iinr[0];
149     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
150     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
151     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
152     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
153
154     jq1              = _mm_set1_ps(charge[inr+1]);
155     jq2              = _mm_set1_ps(charge[inr+2]);
156     jq3              = _mm_set1_ps(charge[inr+3]);
157     vdwjidx0A        = 2*vdwtype[inr+0];
158     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
159     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
160     qq11             = _mm_mul_ps(iq1,jq1);
161     qq12             = _mm_mul_ps(iq1,jq2);
162     qq13             = _mm_mul_ps(iq1,jq3);
163     qq21             = _mm_mul_ps(iq2,jq1);
164     qq22             = _mm_mul_ps(iq2,jq2);
165     qq23             = _mm_mul_ps(iq2,jq3);
166     qq31             = _mm_mul_ps(iq3,jq1);
167     qq32             = _mm_mul_ps(iq3,jq2);
168     qq33             = _mm_mul_ps(iq3,jq3);
169
170     /* Avoid stupid compiler warnings */
171     jnrA = jnrB = jnrC = jnrD = 0;
172     j_coord_offsetA = 0;
173     j_coord_offsetB = 0;
174     j_coord_offsetC = 0;
175     j_coord_offsetD = 0;
176
177     outeriter        = 0;
178     inneriter        = 0;
179
180     for(iidx=0;iidx<4*DIM;iidx++)
181     {
182         scratch[iidx] = 0.0;
183     }
184
185     /* Start outer loop over neighborlists */
186     for(iidx=0; iidx<nri; iidx++)
187     {
188         /* Load shift vector for this list */
189         i_shift_offset   = DIM*shiftidx[iidx];
190
191         /* Load limits for loop over neighbors */
192         j_index_start    = jindex[iidx];
193         j_index_end      = jindex[iidx+1];
194
195         /* Get outer coordinate index */
196         inr              = iinr[iidx];
197         i_coord_offset   = DIM*inr;
198
199         /* Load i particle coords and add shift vector */
200         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
201                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
202
203         fix0             = _mm_setzero_ps();
204         fiy0             = _mm_setzero_ps();
205         fiz0             = _mm_setzero_ps();
206         fix1             = _mm_setzero_ps();
207         fiy1             = _mm_setzero_ps();
208         fiz1             = _mm_setzero_ps();
209         fix2             = _mm_setzero_ps();
210         fiy2             = _mm_setzero_ps();
211         fiz2             = _mm_setzero_ps();
212         fix3             = _mm_setzero_ps();
213         fiy3             = _mm_setzero_ps();
214         fiz3             = _mm_setzero_ps();
215
216         /* Reset potential sums */
217         velecsum         = _mm_setzero_ps();
218         vvdwsum          = _mm_setzero_ps();
219
220         /* Start inner kernel loop */
221         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
222         {
223
224             /* Get j neighbor index, and coordinate index */
225             jnrA             = jjnr[jidx];
226             jnrB             = jjnr[jidx+1];
227             jnrC             = jjnr[jidx+2];
228             jnrD             = jjnr[jidx+3];
229             j_coord_offsetA  = DIM*jnrA;
230             j_coord_offsetB  = DIM*jnrB;
231             j_coord_offsetC  = DIM*jnrC;
232             j_coord_offsetD  = DIM*jnrD;
233
234             /* load j atom coordinates */
235             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
236                                               x+j_coord_offsetC,x+j_coord_offsetD,
237                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
238                                               &jy2,&jz2,&jx3,&jy3,&jz3);
239
240             /* Calculate displacement vector */
241             dx00             = _mm_sub_ps(ix0,jx0);
242             dy00             = _mm_sub_ps(iy0,jy0);
243             dz00             = _mm_sub_ps(iz0,jz0);
244             dx11             = _mm_sub_ps(ix1,jx1);
245             dy11             = _mm_sub_ps(iy1,jy1);
246             dz11             = _mm_sub_ps(iz1,jz1);
247             dx12             = _mm_sub_ps(ix1,jx2);
248             dy12             = _mm_sub_ps(iy1,jy2);
249             dz12             = _mm_sub_ps(iz1,jz2);
250             dx13             = _mm_sub_ps(ix1,jx3);
251             dy13             = _mm_sub_ps(iy1,jy3);
252             dz13             = _mm_sub_ps(iz1,jz3);
253             dx21             = _mm_sub_ps(ix2,jx1);
254             dy21             = _mm_sub_ps(iy2,jy1);
255             dz21             = _mm_sub_ps(iz2,jz1);
256             dx22             = _mm_sub_ps(ix2,jx2);
257             dy22             = _mm_sub_ps(iy2,jy2);
258             dz22             = _mm_sub_ps(iz2,jz2);
259             dx23             = _mm_sub_ps(ix2,jx3);
260             dy23             = _mm_sub_ps(iy2,jy3);
261             dz23             = _mm_sub_ps(iz2,jz3);
262             dx31             = _mm_sub_ps(ix3,jx1);
263             dy31             = _mm_sub_ps(iy3,jy1);
264             dz31             = _mm_sub_ps(iz3,jz1);
265             dx32             = _mm_sub_ps(ix3,jx2);
266             dy32             = _mm_sub_ps(iy3,jy2);
267             dz32             = _mm_sub_ps(iz3,jz2);
268             dx33             = _mm_sub_ps(ix3,jx3);
269             dy33             = _mm_sub_ps(iy3,jy3);
270             dz33             = _mm_sub_ps(iz3,jz3);
271
272             /* Calculate squared distance and things based on it */
273             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
274             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
275             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
276             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
277             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
278             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
279             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
280             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
281             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
282             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
283
284             rinv11           = gmx_mm_invsqrt_ps(rsq11);
285             rinv12           = gmx_mm_invsqrt_ps(rsq12);
286             rinv13           = gmx_mm_invsqrt_ps(rsq13);
287             rinv21           = gmx_mm_invsqrt_ps(rsq21);
288             rinv22           = gmx_mm_invsqrt_ps(rsq22);
289             rinv23           = gmx_mm_invsqrt_ps(rsq23);
290             rinv31           = gmx_mm_invsqrt_ps(rsq31);
291             rinv32           = gmx_mm_invsqrt_ps(rsq32);
292             rinv33           = gmx_mm_invsqrt_ps(rsq33);
293
294             rinvsq00         = gmx_mm_inv_ps(rsq00);
295
296             fjx0             = _mm_setzero_ps();
297             fjy0             = _mm_setzero_ps();
298             fjz0             = _mm_setzero_ps();
299             fjx1             = _mm_setzero_ps();
300             fjy1             = _mm_setzero_ps();
301             fjz1             = _mm_setzero_ps();
302             fjx2             = _mm_setzero_ps();
303             fjy2             = _mm_setzero_ps();
304             fjz2             = _mm_setzero_ps();
305             fjx3             = _mm_setzero_ps();
306             fjy3             = _mm_setzero_ps();
307             fjz3             = _mm_setzero_ps();
308
309             /**************************
310              * CALCULATE INTERACTIONS *
311              **************************/
312
313             /* LENNARD-JONES DISPERSION/REPULSION */
314
315             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
316             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
317             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
318             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
319             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
320
321             /* Update potential sum for this i atom from the interaction with this j atom. */
322             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
323
324             fscal            = fvdw;
325
326             /* Calculate temporary vectorial force */
327             tx               = _mm_mul_ps(fscal,dx00);
328             ty               = _mm_mul_ps(fscal,dy00);
329             tz               = _mm_mul_ps(fscal,dz00);
330
331             /* Update vectorial force */
332             fix0             = _mm_add_ps(fix0,tx);
333             fiy0             = _mm_add_ps(fiy0,ty);
334             fiz0             = _mm_add_ps(fiz0,tz);
335
336             fjx0             = _mm_add_ps(fjx0,tx);
337             fjy0             = _mm_add_ps(fjy0,ty);
338             fjz0             = _mm_add_ps(fjz0,tz);
339
340             /**************************
341              * CALCULATE INTERACTIONS *
342              **************************/
343
344             r11              = _mm_mul_ps(rsq11,rinv11);
345
346             /* Calculate table index by multiplying r with table scale and truncate to integer */
347             rt               = _mm_mul_ps(r11,vftabscale);
348             vfitab           = _mm_cvttps_epi32(rt);
349             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
350             vfitab           = _mm_slli_epi32(vfitab,2);
351
352             /* CUBIC SPLINE TABLE ELECTROSTATICS */
353             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
354             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
355             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
356             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
357             _MM_TRANSPOSE4_PS(Y,F,G,H);
358             Heps             = _mm_mul_ps(vfeps,H);
359             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
360             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
361             velec            = _mm_mul_ps(qq11,VV);
362             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
363             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
364
365             /* Update potential sum for this i atom from the interaction with this j atom. */
366             velecsum         = _mm_add_ps(velecsum,velec);
367
368             fscal            = felec;
369
370             /* Calculate temporary vectorial force */
371             tx               = _mm_mul_ps(fscal,dx11);
372             ty               = _mm_mul_ps(fscal,dy11);
373             tz               = _mm_mul_ps(fscal,dz11);
374
375             /* Update vectorial force */
376             fix1             = _mm_add_ps(fix1,tx);
377             fiy1             = _mm_add_ps(fiy1,ty);
378             fiz1             = _mm_add_ps(fiz1,tz);
379
380             fjx1             = _mm_add_ps(fjx1,tx);
381             fjy1             = _mm_add_ps(fjy1,ty);
382             fjz1             = _mm_add_ps(fjz1,tz);
383
384             /**************************
385              * CALCULATE INTERACTIONS *
386              **************************/
387
388             r12              = _mm_mul_ps(rsq12,rinv12);
389
390             /* Calculate table index by multiplying r with table scale and truncate to integer */
391             rt               = _mm_mul_ps(r12,vftabscale);
392             vfitab           = _mm_cvttps_epi32(rt);
393             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
394             vfitab           = _mm_slli_epi32(vfitab,2);
395
396             /* CUBIC SPLINE TABLE ELECTROSTATICS */
397             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
398             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
399             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
400             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
401             _MM_TRANSPOSE4_PS(Y,F,G,H);
402             Heps             = _mm_mul_ps(vfeps,H);
403             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
404             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
405             velec            = _mm_mul_ps(qq12,VV);
406             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
407             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
408
409             /* Update potential sum for this i atom from the interaction with this j atom. */
410             velecsum         = _mm_add_ps(velecsum,velec);
411
412             fscal            = felec;
413
414             /* Calculate temporary vectorial force */
415             tx               = _mm_mul_ps(fscal,dx12);
416             ty               = _mm_mul_ps(fscal,dy12);
417             tz               = _mm_mul_ps(fscal,dz12);
418
419             /* Update vectorial force */
420             fix1             = _mm_add_ps(fix1,tx);
421             fiy1             = _mm_add_ps(fiy1,ty);
422             fiz1             = _mm_add_ps(fiz1,tz);
423
424             fjx2             = _mm_add_ps(fjx2,tx);
425             fjy2             = _mm_add_ps(fjy2,ty);
426             fjz2             = _mm_add_ps(fjz2,tz);
427
428             /**************************
429              * CALCULATE INTERACTIONS *
430              **************************/
431
432             r13              = _mm_mul_ps(rsq13,rinv13);
433
434             /* Calculate table index by multiplying r with table scale and truncate to integer */
435             rt               = _mm_mul_ps(r13,vftabscale);
436             vfitab           = _mm_cvttps_epi32(rt);
437             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
438             vfitab           = _mm_slli_epi32(vfitab,2);
439
440             /* CUBIC SPLINE TABLE ELECTROSTATICS */
441             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
442             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
443             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
444             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
445             _MM_TRANSPOSE4_PS(Y,F,G,H);
446             Heps             = _mm_mul_ps(vfeps,H);
447             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
448             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
449             velec            = _mm_mul_ps(qq13,VV);
450             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
451             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
452
453             /* Update potential sum for this i atom from the interaction with this j atom. */
454             velecsum         = _mm_add_ps(velecsum,velec);
455
456             fscal            = felec;
457
458             /* Calculate temporary vectorial force */
459             tx               = _mm_mul_ps(fscal,dx13);
460             ty               = _mm_mul_ps(fscal,dy13);
461             tz               = _mm_mul_ps(fscal,dz13);
462
463             /* Update vectorial force */
464             fix1             = _mm_add_ps(fix1,tx);
465             fiy1             = _mm_add_ps(fiy1,ty);
466             fiz1             = _mm_add_ps(fiz1,tz);
467
468             fjx3             = _mm_add_ps(fjx3,tx);
469             fjy3             = _mm_add_ps(fjy3,ty);
470             fjz3             = _mm_add_ps(fjz3,tz);
471
472             /**************************
473              * CALCULATE INTERACTIONS *
474              **************************/
475
476             r21              = _mm_mul_ps(rsq21,rinv21);
477
478             /* Calculate table index by multiplying r with table scale and truncate to integer */
479             rt               = _mm_mul_ps(r21,vftabscale);
480             vfitab           = _mm_cvttps_epi32(rt);
481             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
482             vfitab           = _mm_slli_epi32(vfitab,2);
483
484             /* CUBIC SPLINE TABLE ELECTROSTATICS */
485             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
486             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
487             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
488             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
489             _MM_TRANSPOSE4_PS(Y,F,G,H);
490             Heps             = _mm_mul_ps(vfeps,H);
491             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
492             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
493             velec            = _mm_mul_ps(qq21,VV);
494             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
495             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
496
497             /* Update potential sum for this i atom from the interaction with this j atom. */
498             velecsum         = _mm_add_ps(velecsum,velec);
499
500             fscal            = felec;
501
502             /* Calculate temporary vectorial force */
503             tx               = _mm_mul_ps(fscal,dx21);
504             ty               = _mm_mul_ps(fscal,dy21);
505             tz               = _mm_mul_ps(fscal,dz21);
506
507             /* Update vectorial force */
508             fix2             = _mm_add_ps(fix2,tx);
509             fiy2             = _mm_add_ps(fiy2,ty);
510             fiz2             = _mm_add_ps(fiz2,tz);
511
512             fjx1             = _mm_add_ps(fjx1,tx);
513             fjy1             = _mm_add_ps(fjy1,ty);
514             fjz1             = _mm_add_ps(fjz1,tz);
515
516             /**************************
517              * CALCULATE INTERACTIONS *
518              **************************/
519
520             r22              = _mm_mul_ps(rsq22,rinv22);
521
522             /* Calculate table index by multiplying r with table scale and truncate to integer */
523             rt               = _mm_mul_ps(r22,vftabscale);
524             vfitab           = _mm_cvttps_epi32(rt);
525             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
526             vfitab           = _mm_slli_epi32(vfitab,2);
527
528             /* CUBIC SPLINE TABLE ELECTROSTATICS */
529             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
530             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
531             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
532             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
533             _MM_TRANSPOSE4_PS(Y,F,G,H);
534             Heps             = _mm_mul_ps(vfeps,H);
535             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
536             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
537             velec            = _mm_mul_ps(qq22,VV);
538             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
539             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
540
541             /* Update potential sum for this i atom from the interaction with this j atom. */
542             velecsum         = _mm_add_ps(velecsum,velec);
543
544             fscal            = felec;
545
546             /* Calculate temporary vectorial force */
547             tx               = _mm_mul_ps(fscal,dx22);
548             ty               = _mm_mul_ps(fscal,dy22);
549             tz               = _mm_mul_ps(fscal,dz22);
550
551             /* Update vectorial force */
552             fix2             = _mm_add_ps(fix2,tx);
553             fiy2             = _mm_add_ps(fiy2,ty);
554             fiz2             = _mm_add_ps(fiz2,tz);
555
556             fjx2             = _mm_add_ps(fjx2,tx);
557             fjy2             = _mm_add_ps(fjy2,ty);
558             fjz2             = _mm_add_ps(fjz2,tz);
559
560             /**************************
561              * CALCULATE INTERACTIONS *
562              **************************/
563
564             r23              = _mm_mul_ps(rsq23,rinv23);
565
566             /* Calculate table index by multiplying r with table scale and truncate to integer */
567             rt               = _mm_mul_ps(r23,vftabscale);
568             vfitab           = _mm_cvttps_epi32(rt);
569             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
570             vfitab           = _mm_slli_epi32(vfitab,2);
571
572             /* CUBIC SPLINE TABLE ELECTROSTATICS */
573             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
574             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
575             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
576             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
577             _MM_TRANSPOSE4_PS(Y,F,G,H);
578             Heps             = _mm_mul_ps(vfeps,H);
579             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
580             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
581             velec            = _mm_mul_ps(qq23,VV);
582             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
583             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
584
585             /* Update potential sum for this i atom from the interaction with this j atom. */
586             velecsum         = _mm_add_ps(velecsum,velec);
587
588             fscal            = felec;
589
590             /* Calculate temporary vectorial force */
591             tx               = _mm_mul_ps(fscal,dx23);
592             ty               = _mm_mul_ps(fscal,dy23);
593             tz               = _mm_mul_ps(fscal,dz23);
594
595             /* Update vectorial force */
596             fix2             = _mm_add_ps(fix2,tx);
597             fiy2             = _mm_add_ps(fiy2,ty);
598             fiz2             = _mm_add_ps(fiz2,tz);
599
600             fjx3             = _mm_add_ps(fjx3,tx);
601             fjy3             = _mm_add_ps(fjy3,ty);
602             fjz3             = _mm_add_ps(fjz3,tz);
603
604             /**************************
605              * CALCULATE INTERACTIONS *
606              **************************/
607
608             r31              = _mm_mul_ps(rsq31,rinv31);
609
610             /* Calculate table index by multiplying r with table scale and truncate to integer */
611             rt               = _mm_mul_ps(r31,vftabscale);
612             vfitab           = _mm_cvttps_epi32(rt);
613             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
614             vfitab           = _mm_slli_epi32(vfitab,2);
615
616             /* CUBIC SPLINE TABLE ELECTROSTATICS */
617             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
618             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
619             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
620             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
621             _MM_TRANSPOSE4_PS(Y,F,G,H);
622             Heps             = _mm_mul_ps(vfeps,H);
623             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
624             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
625             velec            = _mm_mul_ps(qq31,VV);
626             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
627             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
628
629             /* Update potential sum for this i atom from the interaction with this j atom. */
630             velecsum         = _mm_add_ps(velecsum,velec);
631
632             fscal            = felec;
633
634             /* Calculate temporary vectorial force */
635             tx               = _mm_mul_ps(fscal,dx31);
636             ty               = _mm_mul_ps(fscal,dy31);
637             tz               = _mm_mul_ps(fscal,dz31);
638
639             /* Update vectorial force */
640             fix3             = _mm_add_ps(fix3,tx);
641             fiy3             = _mm_add_ps(fiy3,ty);
642             fiz3             = _mm_add_ps(fiz3,tz);
643
644             fjx1             = _mm_add_ps(fjx1,tx);
645             fjy1             = _mm_add_ps(fjy1,ty);
646             fjz1             = _mm_add_ps(fjz1,tz);
647
648             /**************************
649              * CALCULATE INTERACTIONS *
650              **************************/
651
652             r32              = _mm_mul_ps(rsq32,rinv32);
653
654             /* Calculate table index by multiplying r with table scale and truncate to integer */
655             rt               = _mm_mul_ps(r32,vftabscale);
656             vfitab           = _mm_cvttps_epi32(rt);
657             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
658             vfitab           = _mm_slli_epi32(vfitab,2);
659
660             /* CUBIC SPLINE TABLE ELECTROSTATICS */
661             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
662             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
663             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
664             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
665             _MM_TRANSPOSE4_PS(Y,F,G,H);
666             Heps             = _mm_mul_ps(vfeps,H);
667             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
668             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
669             velec            = _mm_mul_ps(qq32,VV);
670             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
671             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
672
673             /* Update potential sum for this i atom from the interaction with this j atom. */
674             velecsum         = _mm_add_ps(velecsum,velec);
675
676             fscal            = felec;
677
678             /* Calculate temporary vectorial force */
679             tx               = _mm_mul_ps(fscal,dx32);
680             ty               = _mm_mul_ps(fscal,dy32);
681             tz               = _mm_mul_ps(fscal,dz32);
682
683             /* Update vectorial force */
684             fix3             = _mm_add_ps(fix3,tx);
685             fiy3             = _mm_add_ps(fiy3,ty);
686             fiz3             = _mm_add_ps(fiz3,tz);
687
688             fjx2             = _mm_add_ps(fjx2,tx);
689             fjy2             = _mm_add_ps(fjy2,ty);
690             fjz2             = _mm_add_ps(fjz2,tz);
691
692             /**************************
693              * CALCULATE INTERACTIONS *
694              **************************/
695
696             r33              = _mm_mul_ps(rsq33,rinv33);
697
698             /* Calculate table index by multiplying r with table scale and truncate to integer */
699             rt               = _mm_mul_ps(r33,vftabscale);
700             vfitab           = _mm_cvttps_epi32(rt);
701             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
702             vfitab           = _mm_slli_epi32(vfitab,2);
703
704             /* CUBIC SPLINE TABLE ELECTROSTATICS */
705             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
706             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
707             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
708             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
709             _MM_TRANSPOSE4_PS(Y,F,G,H);
710             Heps             = _mm_mul_ps(vfeps,H);
711             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
712             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
713             velec            = _mm_mul_ps(qq33,VV);
714             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
715             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
716
717             /* Update potential sum for this i atom from the interaction with this j atom. */
718             velecsum         = _mm_add_ps(velecsum,velec);
719
720             fscal            = felec;
721
722             /* Calculate temporary vectorial force */
723             tx               = _mm_mul_ps(fscal,dx33);
724             ty               = _mm_mul_ps(fscal,dy33);
725             tz               = _mm_mul_ps(fscal,dz33);
726
727             /* Update vectorial force */
728             fix3             = _mm_add_ps(fix3,tx);
729             fiy3             = _mm_add_ps(fiy3,ty);
730             fiz3             = _mm_add_ps(fiz3,tz);
731
732             fjx3             = _mm_add_ps(fjx3,tx);
733             fjy3             = _mm_add_ps(fjy3,ty);
734             fjz3             = _mm_add_ps(fjz3,tz);
735
736             fjptrA             = f+j_coord_offsetA;
737             fjptrB             = f+j_coord_offsetB;
738             fjptrC             = f+j_coord_offsetC;
739             fjptrD             = f+j_coord_offsetD;
740
741             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
742                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
743                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
744
745             /* Inner loop uses 422 flops */
746         }
747
748         if(jidx<j_index_end)
749         {
750
751             /* Get j neighbor index, and coordinate index */
752             jnrlistA         = jjnr[jidx];
753             jnrlistB         = jjnr[jidx+1];
754             jnrlistC         = jjnr[jidx+2];
755             jnrlistD         = jjnr[jidx+3];
756             /* Sign of each element will be negative for non-real atoms.
757              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
758              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
759              */
760             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
761             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
762             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
763             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
764             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
765             j_coord_offsetA  = DIM*jnrA;
766             j_coord_offsetB  = DIM*jnrB;
767             j_coord_offsetC  = DIM*jnrC;
768             j_coord_offsetD  = DIM*jnrD;
769
770             /* load j atom coordinates */
771             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
772                                               x+j_coord_offsetC,x+j_coord_offsetD,
773                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
774                                               &jy2,&jz2,&jx3,&jy3,&jz3);
775
776             /* Calculate displacement vector */
777             dx00             = _mm_sub_ps(ix0,jx0);
778             dy00             = _mm_sub_ps(iy0,jy0);
779             dz00             = _mm_sub_ps(iz0,jz0);
780             dx11             = _mm_sub_ps(ix1,jx1);
781             dy11             = _mm_sub_ps(iy1,jy1);
782             dz11             = _mm_sub_ps(iz1,jz1);
783             dx12             = _mm_sub_ps(ix1,jx2);
784             dy12             = _mm_sub_ps(iy1,jy2);
785             dz12             = _mm_sub_ps(iz1,jz2);
786             dx13             = _mm_sub_ps(ix1,jx3);
787             dy13             = _mm_sub_ps(iy1,jy3);
788             dz13             = _mm_sub_ps(iz1,jz3);
789             dx21             = _mm_sub_ps(ix2,jx1);
790             dy21             = _mm_sub_ps(iy2,jy1);
791             dz21             = _mm_sub_ps(iz2,jz1);
792             dx22             = _mm_sub_ps(ix2,jx2);
793             dy22             = _mm_sub_ps(iy2,jy2);
794             dz22             = _mm_sub_ps(iz2,jz2);
795             dx23             = _mm_sub_ps(ix2,jx3);
796             dy23             = _mm_sub_ps(iy2,jy3);
797             dz23             = _mm_sub_ps(iz2,jz3);
798             dx31             = _mm_sub_ps(ix3,jx1);
799             dy31             = _mm_sub_ps(iy3,jy1);
800             dz31             = _mm_sub_ps(iz3,jz1);
801             dx32             = _mm_sub_ps(ix3,jx2);
802             dy32             = _mm_sub_ps(iy3,jy2);
803             dz32             = _mm_sub_ps(iz3,jz2);
804             dx33             = _mm_sub_ps(ix3,jx3);
805             dy33             = _mm_sub_ps(iy3,jy3);
806             dz33             = _mm_sub_ps(iz3,jz3);
807
808             /* Calculate squared distance and things based on it */
809             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
810             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
811             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
812             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
813             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
814             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
815             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
816             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
817             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
818             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
819
820             rinv11           = gmx_mm_invsqrt_ps(rsq11);
821             rinv12           = gmx_mm_invsqrt_ps(rsq12);
822             rinv13           = gmx_mm_invsqrt_ps(rsq13);
823             rinv21           = gmx_mm_invsqrt_ps(rsq21);
824             rinv22           = gmx_mm_invsqrt_ps(rsq22);
825             rinv23           = gmx_mm_invsqrt_ps(rsq23);
826             rinv31           = gmx_mm_invsqrt_ps(rsq31);
827             rinv32           = gmx_mm_invsqrt_ps(rsq32);
828             rinv33           = gmx_mm_invsqrt_ps(rsq33);
829
830             rinvsq00         = gmx_mm_inv_ps(rsq00);
831
832             fjx0             = _mm_setzero_ps();
833             fjy0             = _mm_setzero_ps();
834             fjz0             = _mm_setzero_ps();
835             fjx1             = _mm_setzero_ps();
836             fjy1             = _mm_setzero_ps();
837             fjz1             = _mm_setzero_ps();
838             fjx2             = _mm_setzero_ps();
839             fjy2             = _mm_setzero_ps();
840             fjz2             = _mm_setzero_ps();
841             fjx3             = _mm_setzero_ps();
842             fjy3             = _mm_setzero_ps();
843             fjz3             = _mm_setzero_ps();
844
845             /**************************
846              * CALCULATE INTERACTIONS *
847              **************************/
848
849             /* LENNARD-JONES DISPERSION/REPULSION */
850
851             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
852             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
853             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
854             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
855             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
856
857             /* Update potential sum for this i atom from the interaction with this j atom. */
858             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
859             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
860
861             fscal            = fvdw;
862
863             fscal            = _mm_andnot_ps(dummy_mask,fscal);
864
865             /* Calculate temporary vectorial force */
866             tx               = _mm_mul_ps(fscal,dx00);
867             ty               = _mm_mul_ps(fscal,dy00);
868             tz               = _mm_mul_ps(fscal,dz00);
869
870             /* Update vectorial force */
871             fix0             = _mm_add_ps(fix0,tx);
872             fiy0             = _mm_add_ps(fiy0,ty);
873             fiz0             = _mm_add_ps(fiz0,tz);
874
875             fjx0             = _mm_add_ps(fjx0,tx);
876             fjy0             = _mm_add_ps(fjy0,ty);
877             fjz0             = _mm_add_ps(fjz0,tz);
878
879             /**************************
880              * CALCULATE INTERACTIONS *
881              **************************/
882
883             r11              = _mm_mul_ps(rsq11,rinv11);
884             r11              = _mm_andnot_ps(dummy_mask,r11);
885
886             /* Calculate table index by multiplying r with table scale and truncate to integer */
887             rt               = _mm_mul_ps(r11,vftabscale);
888             vfitab           = _mm_cvttps_epi32(rt);
889             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
890             vfitab           = _mm_slli_epi32(vfitab,2);
891
892             /* CUBIC SPLINE TABLE ELECTROSTATICS */
893             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
894             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
895             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
896             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
897             _MM_TRANSPOSE4_PS(Y,F,G,H);
898             Heps             = _mm_mul_ps(vfeps,H);
899             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
900             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
901             velec            = _mm_mul_ps(qq11,VV);
902             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
903             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
904
905             /* Update potential sum for this i atom from the interaction with this j atom. */
906             velec            = _mm_andnot_ps(dummy_mask,velec);
907             velecsum         = _mm_add_ps(velecsum,velec);
908
909             fscal            = felec;
910
911             fscal            = _mm_andnot_ps(dummy_mask,fscal);
912
913             /* Calculate temporary vectorial force */
914             tx               = _mm_mul_ps(fscal,dx11);
915             ty               = _mm_mul_ps(fscal,dy11);
916             tz               = _mm_mul_ps(fscal,dz11);
917
918             /* Update vectorial force */
919             fix1             = _mm_add_ps(fix1,tx);
920             fiy1             = _mm_add_ps(fiy1,ty);
921             fiz1             = _mm_add_ps(fiz1,tz);
922
923             fjx1             = _mm_add_ps(fjx1,tx);
924             fjy1             = _mm_add_ps(fjy1,ty);
925             fjz1             = _mm_add_ps(fjz1,tz);
926
927             /**************************
928              * CALCULATE INTERACTIONS *
929              **************************/
930
931             r12              = _mm_mul_ps(rsq12,rinv12);
932             r12              = _mm_andnot_ps(dummy_mask,r12);
933
934             /* Calculate table index by multiplying r with table scale and truncate to integer */
935             rt               = _mm_mul_ps(r12,vftabscale);
936             vfitab           = _mm_cvttps_epi32(rt);
937             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
938             vfitab           = _mm_slli_epi32(vfitab,2);
939
940             /* CUBIC SPLINE TABLE ELECTROSTATICS */
941             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
942             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
943             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
944             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
945             _MM_TRANSPOSE4_PS(Y,F,G,H);
946             Heps             = _mm_mul_ps(vfeps,H);
947             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
948             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
949             velec            = _mm_mul_ps(qq12,VV);
950             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
951             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
952
953             /* Update potential sum for this i atom from the interaction with this j atom. */
954             velec            = _mm_andnot_ps(dummy_mask,velec);
955             velecsum         = _mm_add_ps(velecsum,velec);
956
957             fscal            = felec;
958
959             fscal            = _mm_andnot_ps(dummy_mask,fscal);
960
961             /* Calculate temporary vectorial force */
962             tx               = _mm_mul_ps(fscal,dx12);
963             ty               = _mm_mul_ps(fscal,dy12);
964             tz               = _mm_mul_ps(fscal,dz12);
965
966             /* Update vectorial force */
967             fix1             = _mm_add_ps(fix1,tx);
968             fiy1             = _mm_add_ps(fiy1,ty);
969             fiz1             = _mm_add_ps(fiz1,tz);
970
971             fjx2             = _mm_add_ps(fjx2,tx);
972             fjy2             = _mm_add_ps(fjy2,ty);
973             fjz2             = _mm_add_ps(fjz2,tz);
974
975             /**************************
976              * CALCULATE INTERACTIONS *
977              **************************/
978
979             r13              = _mm_mul_ps(rsq13,rinv13);
980             r13              = _mm_andnot_ps(dummy_mask,r13);
981
982             /* Calculate table index by multiplying r with table scale and truncate to integer */
983             rt               = _mm_mul_ps(r13,vftabscale);
984             vfitab           = _mm_cvttps_epi32(rt);
985             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
986             vfitab           = _mm_slli_epi32(vfitab,2);
987
988             /* CUBIC SPLINE TABLE ELECTROSTATICS */
989             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
990             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
991             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
992             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
993             _MM_TRANSPOSE4_PS(Y,F,G,H);
994             Heps             = _mm_mul_ps(vfeps,H);
995             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
996             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
997             velec            = _mm_mul_ps(qq13,VV);
998             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
999             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1000
1001             /* Update potential sum for this i atom from the interaction with this j atom. */
1002             velec            = _mm_andnot_ps(dummy_mask,velec);
1003             velecsum         = _mm_add_ps(velecsum,velec);
1004
1005             fscal            = felec;
1006
1007             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1008
1009             /* Calculate temporary vectorial force */
1010             tx               = _mm_mul_ps(fscal,dx13);
1011             ty               = _mm_mul_ps(fscal,dy13);
1012             tz               = _mm_mul_ps(fscal,dz13);
1013
1014             /* Update vectorial force */
1015             fix1             = _mm_add_ps(fix1,tx);
1016             fiy1             = _mm_add_ps(fiy1,ty);
1017             fiz1             = _mm_add_ps(fiz1,tz);
1018
1019             fjx3             = _mm_add_ps(fjx3,tx);
1020             fjy3             = _mm_add_ps(fjy3,ty);
1021             fjz3             = _mm_add_ps(fjz3,tz);
1022
1023             /**************************
1024              * CALCULATE INTERACTIONS *
1025              **************************/
1026
1027             r21              = _mm_mul_ps(rsq21,rinv21);
1028             r21              = _mm_andnot_ps(dummy_mask,r21);
1029
1030             /* Calculate table index by multiplying r with table scale and truncate to integer */
1031             rt               = _mm_mul_ps(r21,vftabscale);
1032             vfitab           = _mm_cvttps_epi32(rt);
1033             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1034             vfitab           = _mm_slli_epi32(vfitab,2);
1035
1036             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1037             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1038             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1039             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1040             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1041             _MM_TRANSPOSE4_PS(Y,F,G,H);
1042             Heps             = _mm_mul_ps(vfeps,H);
1043             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1044             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1045             velec            = _mm_mul_ps(qq21,VV);
1046             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1047             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1048
1049             /* Update potential sum for this i atom from the interaction with this j atom. */
1050             velec            = _mm_andnot_ps(dummy_mask,velec);
1051             velecsum         = _mm_add_ps(velecsum,velec);
1052
1053             fscal            = felec;
1054
1055             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1056
1057             /* Calculate temporary vectorial force */
1058             tx               = _mm_mul_ps(fscal,dx21);
1059             ty               = _mm_mul_ps(fscal,dy21);
1060             tz               = _mm_mul_ps(fscal,dz21);
1061
1062             /* Update vectorial force */
1063             fix2             = _mm_add_ps(fix2,tx);
1064             fiy2             = _mm_add_ps(fiy2,ty);
1065             fiz2             = _mm_add_ps(fiz2,tz);
1066
1067             fjx1             = _mm_add_ps(fjx1,tx);
1068             fjy1             = _mm_add_ps(fjy1,ty);
1069             fjz1             = _mm_add_ps(fjz1,tz);
1070
1071             /**************************
1072              * CALCULATE INTERACTIONS *
1073              **************************/
1074
1075             r22              = _mm_mul_ps(rsq22,rinv22);
1076             r22              = _mm_andnot_ps(dummy_mask,r22);
1077
1078             /* Calculate table index by multiplying r with table scale and truncate to integer */
1079             rt               = _mm_mul_ps(r22,vftabscale);
1080             vfitab           = _mm_cvttps_epi32(rt);
1081             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1082             vfitab           = _mm_slli_epi32(vfitab,2);
1083
1084             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1085             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1086             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1087             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1088             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1089             _MM_TRANSPOSE4_PS(Y,F,G,H);
1090             Heps             = _mm_mul_ps(vfeps,H);
1091             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1092             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1093             velec            = _mm_mul_ps(qq22,VV);
1094             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1095             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1096
1097             /* Update potential sum for this i atom from the interaction with this j atom. */
1098             velec            = _mm_andnot_ps(dummy_mask,velec);
1099             velecsum         = _mm_add_ps(velecsum,velec);
1100
1101             fscal            = felec;
1102
1103             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1104
1105             /* Calculate temporary vectorial force */
1106             tx               = _mm_mul_ps(fscal,dx22);
1107             ty               = _mm_mul_ps(fscal,dy22);
1108             tz               = _mm_mul_ps(fscal,dz22);
1109
1110             /* Update vectorial force */
1111             fix2             = _mm_add_ps(fix2,tx);
1112             fiy2             = _mm_add_ps(fiy2,ty);
1113             fiz2             = _mm_add_ps(fiz2,tz);
1114
1115             fjx2             = _mm_add_ps(fjx2,tx);
1116             fjy2             = _mm_add_ps(fjy2,ty);
1117             fjz2             = _mm_add_ps(fjz2,tz);
1118
1119             /**************************
1120              * CALCULATE INTERACTIONS *
1121              **************************/
1122
1123             r23              = _mm_mul_ps(rsq23,rinv23);
1124             r23              = _mm_andnot_ps(dummy_mask,r23);
1125
1126             /* Calculate table index by multiplying r with table scale and truncate to integer */
1127             rt               = _mm_mul_ps(r23,vftabscale);
1128             vfitab           = _mm_cvttps_epi32(rt);
1129             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1130             vfitab           = _mm_slli_epi32(vfitab,2);
1131
1132             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1133             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1134             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1135             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1136             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1137             _MM_TRANSPOSE4_PS(Y,F,G,H);
1138             Heps             = _mm_mul_ps(vfeps,H);
1139             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1140             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1141             velec            = _mm_mul_ps(qq23,VV);
1142             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1143             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1144
1145             /* Update potential sum for this i atom from the interaction with this j atom. */
1146             velec            = _mm_andnot_ps(dummy_mask,velec);
1147             velecsum         = _mm_add_ps(velecsum,velec);
1148
1149             fscal            = felec;
1150
1151             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1152
1153             /* Calculate temporary vectorial force */
1154             tx               = _mm_mul_ps(fscal,dx23);
1155             ty               = _mm_mul_ps(fscal,dy23);
1156             tz               = _mm_mul_ps(fscal,dz23);
1157
1158             /* Update vectorial force */
1159             fix2             = _mm_add_ps(fix2,tx);
1160             fiy2             = _mm_add_ps(fiy2,ty);
1161             fiz2             = _mm_add_ps(fiz2,tz);
1162
1163             fjx3             = _mm_add_ps(fjx3,tx);
1164             fjy3             = _mm_add_ps(fjy3,ty);
1165             fjz3             = _mm_add_ps(fjz3,tz);
1166
1167             /**************************
1168              * CALCULATE INTERACTIONS *
1169              **************************/
1170
1171             r31              = _mm_mul_ps(rsq31,rinv31);
1172             r31              = _mm_andnot_ps(dummy_mask,r31);
1173
1174             /* Calculate table index by multiplying r with table scale and truncate to integer */
1175             rt               = _mm_mul_ps(r31,vftabscale);
1176             vfitab           = _mm_cvttps_epi32(rt);
1177             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1178             vfitab           = _mm_slli_epi32(vfitab,2);
1179
1180             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1181             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1182             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1183             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1184             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1185             _MM_TRANSPOSE4_PS(Y,F,G,H);
1186             Heps             = _mm_mul_ps(vfeps,H);
1187             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1188             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1189             velec            = _mm_mul_ps(qq31,VV);
1190             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1191             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1192
1193             /* Update potential sum for this i atom from the interaction with this j atom. */
1194             velec            = _mm_andnot_ps(dummy_mask,velec);
1195             velecsum         = _mm_add_ps(velecsum,velec);
1196
1197             fscal            = felec;
1198
1199             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1200
1201             /* Calculate temporary vectorial force */
1202             tx               = _mm_mul_ps(fscal,dx31);
1203             ty               = _mm_mul_ps(fscal,dy31);
1204             tz               = _mm_mul_ps(fscal,dz31);
1205
1206             /* Update vectorial force */
1207             fix3             = _mm_add_ps(fix3,tx);
1208             fiy3             = _mm_add_ps(fiy3,ty);
1209             fiz3             = _mm_add_ps(fiz3,tz);
1210
1211             fjx1             = _mm_add_ps(fjx1,tx);
1212             fjy1             = _mm_add_ps(fjy1,ty);
1213             fjz1             = _mm_add_ps(fjz1,tz);
1214
1215             /**************************
1216              * CALCULATE INTERACTIONS *
1217              **************************/
1218
1219             r32              = _mm_mul_ps(rsq32,rinv32);
1220             r32              = _mm_andnot_ps(dummy_mask,r32);
1221
1222             /* Calculate table index by multiplying r with table scale and truncate to integer */
1223             rt               = _mm_mul_ps(r32,vftabscale);
1224             vfitab           = _mm_cvttps_epi32(rt);
1225             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1226             vfitab           = _mm_slli_epi32(vfitab,2);
1227
1228             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1229             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1230             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1231             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1232             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1233             _MM_TRANSPOSE4_PS(Y,F,G,H);
1234             Heps             = _mm_mul_ps(vfeps,H);
1235             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1236             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1237             velec            = _mm_mul_ps(qq32,VV);
1238             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1239             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1240
1241             /* Update potential sum for this i atom from the interaction with this j atom. */
1242             velec            = _mm_andnot_ps(dummy_mask,velec);
1243             velecsum         = _mm_add_ps(velecsum,velec);
1244
1245             fscal            = felec;
1246
1247             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1248
1249             /* Calculate temporary vectorial force */
1250             tx               = _mm_mul_ps(fscal,dx32);
1251             ty               = _mm_mul_ps(fscal,dy32);
1252             tz               = _mm_mul_ps(fscal,dz32);
1253
1254             /* Update vectorial force */
1255             fix3             = _mm_add_ps(fix3,tx);
1256             fiy3             = _mm_add_ps(fiy3,ty);
1257             fiz3             = _mm_add_ps(fiz3,tz);
1258
1259             fjx2             = _mm_add_ps(fjx2,tx);
1260             fjy2             = _mm_add_ps(fjy2,ty);
1261             fjz2             = _mm_add_ps(fjz2,tz);
1262
1263             /**************************
1264              * CALCULATE INTERACTIONS *
1265              **************************/
1266
1267             r33              = _mm_mul_ps(rsq33,rinv33);
1268             r33              = _mm_andnot_ps(dummy_mask,r33);
1269
1270             /* Calculate table index by multiplying r with table scale and truncate to integer */
1271             rt               = _mm_mul_ps(r33,vftabscale);
1272             vfitab           = _mm_cvttps_epi32(rt);
1273             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1274             vfitab           = _mm_slli_epi32(vfitab,2);
1275
1276             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1277             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1278             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1279             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1280             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1281             _MM_TRANSPOSE4_PS(Y,F,G,H);
1282             Heps             = _mm_mul_ps(vfeps,H);
1283             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1284             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1285             velec            = _mm_mul_ps(qq33,VV);
1286             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1287             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1288
1289             /* Update potential sum for this i atom from the interaction with this j atom. */
1290             velec            = _mm_andnot_ps(dummy_mask,velec);
1291             velecsum         = _mm_add_ps(velecsum,velec);
1292
1293             fscal            = felec;
1294
1295             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1296
1297             /* Calculate temporary vectorial force */
1298             tx               = _mm_mul_ps(fscal,dx33);
1299             ty               = _mm_mul_ps(fscal,dy33);
1300             tz               = _mm_mul_ps(fscal,dz33);
1301
1302             /* Update vectorial force */
1303             fix3             = _mm_add_ps(fix3,tx);
1304             fiy3             = _mm_add_ps(fiy3,ty);
1305             fiz3             = _mm_add_ps(fiz3,tz);
1306
1307             fjx3             = _mm_add_ps(fjx3,tx);
1308             fjy3             = _mm_add_ps(fjy3,ty);
1309             fjz3             = _mm_add_ps(fjz3,tz);
1310
1311             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1312             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1313             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1314             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1315
1316             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1317                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1318                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1319
1320             /* Inner loop uses 431 flops */
1321         }
1322
1323         /* End of innermost loop */
1324
1325         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1326                                               f+i_coord_offset,fshift+i_shift_offset);
1327
1328         ggid                        = gid[iidx];
1329         /* Update potential energies */
1330         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1331         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1332
1333         /* Increment number of inner iterations */
1334         inneriter                  += j_index_end - j_index_start;
1335
1336         /* Outer loop uses 26 flops */
1337     }
1338
1339     /* Increment number of outer iterations */
1340     outeriter        += nri;
1341
1342     /* Update outer/inner flops */
1343
1344     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*431);
1345 }
1346 /*
1347  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse4_1_single
1348  * Electrostatics interaction: CubicSplineTable
1349  * VdW interaction:            LennardJones
1350  * Geometry:                   Water4-Water4
1351  * Calculate force/pot:        Force
1352  */
1353 void
1354 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse4_1_single
1355                     (t_nblist                    * gmx_restrict       nlist,
1356                      rvec                        * gmx_restrict          xx,
1357                      rvec                        * gmx_restrict          ff,
1358                      t_forcerec                  * gmx_restrict          fr,
1359                      t_mdatoms                   * gmx_restrict     mdatoms,
1360                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1361                      t_nrnb                      * gmx_restrict        nrnb)
1362 {
1363     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1364      * just 0 for non-waters.
1365      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1366      * jnr indices corresponding to data put in the four positions in the SIMD register.
1367      */
1368     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1369     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1370     int              jnrA,jnrB,jnrC,jnrD;
1371     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1372     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1373     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1374     real             rcutoff_scalar;
1375     real             *shiftvec,*fshift,*x,*f;
1376     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1377     real             scratch[4*DIM];
1378     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1379     int              vdwioffset0;
1380     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1381     int              vdwioffset1;
1382     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1383     int              vdwioffset2;
1384     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1385     int              vdwioffset3;
1386     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1387     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1388     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1389     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1390     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1391     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1392     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1393     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1394     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1395     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1396     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1397     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1398     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1399     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1400     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1401     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1402     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1403     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1404     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1405     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1406     real             *charge;
1407     int              nvdwtype;
1408     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1409     int              *vdwtype;
1410     real             *vdwparam;
1411     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1412     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1413     __m128i          vfitab;
1414     __m128i          ifour       = _mm_set1_epi32(4);
1415     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1416     real             *vftab;
1417     __m128           dummy_mask,cutoff_mask;
1418     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1419     __m128           one     = _mm_set1_ps(1.0);
1420     __m128           two     = _mm_set1_ps(2.0);
1421     x                = xx[0];
1422     f                = ff[0];
1423
1424     nri              = nlist->nri;
1425     iinr             = nlist->iinr;
1426     jindex           = nlist->jindex;
1427     jjnr             = nlist->jjnr;
1428     shiftidx         = nlist->shift;
1429     gid              = nlist->gid;
1430     shiftvec         = fr->shift_vec[0];
1431     fshift           = fr->fshift[0];
1432     facel            = _mm_set1_ps(fr->epsfac);
1433     charge           = mdatoms->chargeA;
1434     nvdwtype         = fr->ntype;
1435     vdwparam         = fr->nbfp;
1436     vdwtype          = mdatoms->typeA;
1437
1438     vftab            = kernel_data->table_elec->data;
1439     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
1440
1441     /* Setup water-specific parameters */
1442     inr              = nlist->iinr[0];
1443     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1444     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1445     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1446     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1447
1448     jq1              = _mm_set1_ps(charge[inr+1]);
1449     jq2              = _mm_set1_ps(charge[inr+2]);
1450     jq3              = _mm_set1_ps(charge[inr+3]);
1451     vdwjidx0A        = 2*vdwtype[inr+0];
1452     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1453     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1454     qq11             = _mm_mul_ps(iq1,jq1);
1455     qq12             = _mm_mul_ps(iq1,jq2);
1456     qq13             = _mm_mul_ps(iq1,jq3);
1457     qq21             = _mm_mul_ps(iq2,jq1);
1458     qq22             = _mm_mul_ps(iq2,jq2);
1459     qq23             = _mm_mul_ps(iq2,jq3);
1460     qq31             = _mm_mul_ps(iq3,jq1);
1461     qq32             = _mm_mul_ps(iq3,jq2);
1462     qq33             = _mm_mul_ps(iq3,jq3);
1463
1464     /* Avoid stupid compiler warnings */
1465     jnrA = jnrB = jnrC = jnrD = 0;
1466     j_coord_offsetA = 0;
1467     j_coord_offsetB = 0;
1468     j_coord_offsetC = 0;
1469     j_coord_offsetD = 0;
1470
1471     outeriter        = 0;
1472     inneriter        = 0;
1473
1474     for(iidx=0;iidx<4*DIM;iidx++)
1475     {
1476         scratch[iidx] = 0.0;
1477     }
1478
1479     /* Start outer loop over neighborlists */
1480     for(iidx=0; iidx<nri; iidx++)
1481     {
1482         /* Load shift vector for this list */
1483         i_shift_offset   = DIM*shiftidx[iidx];
1484
1485         /* Load limits for loop over neighbors */
1486         j_index_start    = jindex[iidx];
1487         j_index_end      = jindex[iidx+1];
1488
1489         /* Get outer coordinate index */
1490         inr              = iinr[iidx];
1491         i_coord_offset   = DIM*inr;
1492
1493         /* Load i particle coords and add shift vector */
1494         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1495                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1496
1497         fix0             = _mm_setzero_ps();
1498         fiy0             = _mm_setzero_ps();
1499         fiz0             = _mm_setzero_ps();
1500         fix1             = _mm_setzero_ps();
1501         fiy1             = _mm_setzero_ps();
1502         fiz1             = _mm_setzero_ps();
1503         fix2             = _mm_setzero_ps();
1504         fiy2             = _mm_setzero_ps();
1505         fiz2             = _mm_setzero_ps();
1506         fix3             = _mm_setzero_ps();
1507         fiy3             = _mm_setzero_ps();
1508         fiz3             = _mm_setzero_ps();
1509
1510         /* Start inner kernel loop */
1511         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1512         {
1513
1514             /* Get j neighbor index, and coordinate index */
1515             jnrA             = jjnr[jidx];
1516             jnrB             = jjnr[jidx+1];
1517             jnrC             = jjnr[jidx+2];
1518             jnrD             = jjnr[jidx+3];
1519             j_coord_offsetA  = DIM*jnrA;
1520             j_coord_offsetB  = DIM*jnrB;
1521             j_coord_offsetC  = DIM*jnrC;
1522             j_coord_offsetD  = DIM*jnrD;
1523
1524             /* load j atom coordinates */
1525             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1526                                               x+j_coord_offsetC,x+j_coord_offsetD,
1527                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1528                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1529
1530             /* Calculate displacement vector */
1531             dx00             = _mm_sub_ps(ix0,jx0);
1532             dy00             = _mm_sub_ps(iy0,jy0);
1533             dz00             = _mm_sub_ps(iz0,jz0);
1534             dx11             = _mm_sub_ps(ix1,jx1);
1535             dy11             = _mm_sub_ps(iy1,jy1);
1536             dz11             = _mm_sub_ps(iz1,jz1);
1537             dx12             = _mm_sub_ps(ix1,jx2);
1538             dy12             = _mm_sub_ps(iy1,jy2);
1539             dz12             = _mm_sub_ps(iz1,jz2);
1540             dx13             = _mm_sub_ps(ix1,jx3);
1541             dy13             = _mm_sub_ps(iy1,jy3);
1542             dz13             = _mm_sub_ps(iz1,jz3);
1543             dx21             = _mm_sub_ps(ix2,jx1);
1544             dy21             = _mm_sub_ps(iy2,jy1);
1545             dz21             = _mm_sub_ps(iz2,jz1);
1546             dx22             = _mm_sub_ps(ix2,jx2);
1547             dy22             = _mm_sub_ps(iy2,jy2);
1548             dz22             = _mm_sub_ps(iz2,jz2);
1549             dx23             = _mm_sub_ps(ix2,jx3);
1550             dy23             = _mm_sub_ps(iy2,jy3);
1551             dz23             = _mm_sub_ps(iz2,jz3);
1552             dx31             = _mm_sub_ps(ix3,jx1);
1553             dy31             = _mm_sub_ps(iy3,jy1);
1554             dz31             = _mm_sub_ps(iz3,jz1);
1555             dx32             = _mm_sub_ps(ix3,jx2);
1556             dy32             = _mm_sub_ps(iy3,jy2);
1557             dz32             = _mm_sub_ps(iz3,jz2);
1558             dx33             = _mm_sub_ps(ix3,jx3);
1559             dy33             = _mm_sub_ps(iy3,jy3);
1560             dz33             = _mm_sub_ps(iz3,jz3);
1561
1562             /* Calculate squared distance and things based on it */
1563             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1564             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1565             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1566             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1567             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1568             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1569             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1570             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1571             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1572             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1573
1574             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1575             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1576             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1577             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1578             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1579             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1580             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1581             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1582             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1583
1584             rinvsq00         = gmx_mm_inv_ps(rsq00);
1585
1586             fjx0             = _mm_setzero_ps();
1587             fjy0             = _mm_setzero_ps();
1588             fjz0             = _mm_setzero_ps();
1589             fjx1             = _mm_setzero_ps();
1590             fjy1             = _mm_setzero_ps();
1591             fjz1             = _mm_setzero_ps();
1592             fjx2             = _mm_setzero_ps();
1593             fjy2             = _mm_setzero_ps();
1594             fjz2             = _mm_setzero_ps();
1595             fjx3             = _mm_setzero_ps();
1596             fjy3             = _mm_setzero_ps();
1597             fjz3             = _mm_setzero_ps();
1598
1599             /**************************
1600              * CALCULATE INTERACTIONS *
1601              **************************/
1602
1603             /* LENNARD-JONES DISPERSION/REPULSION */
1604
1605             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1606             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1607
1608             fscal            = fvdw;
1609
1610             /* Calculate temporary vectorial force */
1611             tx               = _mm_mul_ps(fscal,dx00);
1612             ty               = _mm_mul_ps(fscal,dy00);
1613             tz               = _mm_mul_ps(fscal,dz00);
1614
1615             /* Update vectorial force */
1616             fix0             = _mm_add_ps(fix0,tx);
1617             fiy0             = _mm_add_ps(fiy0,ty);
1618             fiz0             = _mm_add_ps(fiz0,tz);
1619
1620             fjx0             = _mm_add_ps(fjx0,tx);
1621             fjy0             = _mm_add_ps(fjy0,ty);
1622             fjz0             = _mm_add_ps(fjz0,tz);
1623
1624             /**************************
1625              * CALCULATE INTERACTIONS *
1626              **************************/
1627
1628             r11              = _mm_mul_ps(rsq11,rinv11);
1629
1630             /* Calculate table index by multiplying r with table scale and truncate to integer */
1631             rt               = _mm_mul_ps(r11,vftabscale);
1632             vfitab           = _mm_cvttps_epi32(rt);
1633             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1634             vfitab           = _mm_slli_epi32(vfitab,2);
1635
1636             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1637             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1638             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1639             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1640             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1641             _MM_TRANSPOSE4_PS(Y,F,G,H);
1642             Heps             = _mm_mul_ps(vfeps,H);
1643             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1644             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1645             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1646
1647             fscal            = felec;
1648
1649             /* Calculate temporary vectorial force */
1650             tx               = _mm_mul_ps(fscal,dx11);
1651             ty               = _mm_mul_ps(fscal,dy11);
1652             tz               = _mm_mul_ps(fscal,dz11);
1653
1654             /* Update vectorial force */
1655             fix1             = _mm_add_ps(fix1,tx);
1656             fiy1             = _mm_add_ps(fiy1,ty);
1657             fiz1             = _mm_add_ps(fiz1,tz);
1658
1659             fjx1             = _mm_add_ps(fjx1,tx);
1660             fjy1             = _mm_add_ps(fjy1,ty);
1661             fjz1             = _mm_add_ps(fjz1,tz);
1662
1663             /**************************
1664              * CALCULATE INTERACTIONS *
1665              **************************/
1666
1667             r12              = _mm_mul_ps(rsq12,rinv12);
1668
1669             /* Calculate table index by multiplying r with table scale and truncate to integer */
1670             rt               = _mm_mul_ps(r12,vftabscale);
1671             vfitab           = _mm_cvttps_epi32(rt);
1672             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1673             vfitab           = _mm_slli_epi32(vfitab,2);
1674
1675             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1676             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1677             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1678             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1679             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1680             _MM_TRANSPOSE4_PS(Y,F,G,H);
1681             Heps             = _mm_mul_ps(vfeps,H);
1682             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1683             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1684             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1685
1686             fscal            = felec;
1687
1688             /* Calculate temporary vectorial force */
1689             tx               = _mm_mul_ps(fscal,dx12);
1690             ty               = _mm_mul_ps(fscal,dy12);
1691             tz               = _mm_mul_ps(fscal,dz12);
1692
1693             /* Update vectorial force */
1694             fix1             = _mm_add_ps(fix1,tx);
1695             fiy1             = _mm_add_ps(fiy1,ty);
1696             fiz1             = _mm_add_ps(fiz1,tz);
1697
1698             fjx2             = _mm_add_ps(fjx2,tx);
1699             fjy2             = _mm_add_ps(fjy2,ty);
1700             fjz2             = _mm_add_ps(fjz2,tz);
1701
1702             /**************************
1703              * CALCULATE INTERACTIONS *
1704              **************************/
1705
1706             r13              = _mm_mul_ps(rsq13,rinv13);
1707
1708             /* Calculate table index by multiplying r with table scale and truncate to integer */
1709             rt               = _mm_mul_ps(r13,vftabscale);
1710             vfitab           = _mm_cvttps_epi32(rt);
1711             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1712             vfitab           = _mm_slli_epi32(vfitab,2);
1713
1714             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1715             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1716             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1717             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1718             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1719             _MM_TRANSPOSE4_PS(Y,F,G,H);
1720             Heps             = _mm_mul_ps(vfeps,H);
1721             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1722             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1723             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1724
1725             fscal            = felec;
1726
1727             /* Calculate temporary vectorial force */
1728             tx               = _mm_mul_ps(fscal,dx13);
1729             ty               = _mm_mul_ps(fscal,dy13);
1730             tz               = _mm_mul_ps(fscal,dz13);
1731
1732             /* Update vectorial force */
1733             fix1             = _mm_add_ps(fix1,tx);
1734             fiy1             = _mm_add_ps(fiy1,ty);
1735             fiz1             = _mm_add_ps(fiz1,tz);
1736
1737             fjx3             = _mm_add_ps(fjx3,tx);
1738             fjy3             = _mm_add_ps(fjy3,ty);
1739             fjz3             = _mm_add_ps(fjz3,tz);
1740
1741             /**************************
1742              * CALCULATE INTERACTIONS *
1743              **************************/
1744
1745             r21              = _mm_mul_ps(rsq21,rinv21);
1746
1747             /* Calculate table index by multiplying r with table scale and truncate to integer */
1748             rt               = _mm_mul_ps(r21,vftabscale);
1749             vfitab           = _mm_cvttps_epi32(rt);
1750             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1751             vfitab           = _mm_slli_epi32(vfitab,2);
1752
1753             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1754             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1755             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1756             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1757             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1758             _MM_TRANSPOSE4_PS(Y,F,G,H);
1759             Heps             = _mm_mul_ps(vfeps,H);
1760             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1761             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1762             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1763
1764             fscal            = felec;
1765
1766             /* Calculate temporary vectorial force */
1767             tx               = _mm_mul_ps(fscal,dx21);
1768             ty               = _mm_mul_ps(fscal,dy21);
1769             tz               = _mm_mul_ps(fscal,dz21);
1770
1771             /* Update vectorial force */
1772             fix2             = _mm_add_ps(fix2,tx);
1773             fiy2             = _mm_add_ps(fiy2,ty);
1774             fiz2             = _mm_add_ps(fiz2,tz);
1775
1776             fjx1             = _mm_add_ps(fjx1,tx);
1777             fjy1             = _mm_add_ps(fjy1,ty);
1778             fjz1             = _mm_add_ps(fjz1,tz);
1779
1780             /**************************
1781              * CALCULATE INTERACTIONS *
1782              **************************/
1783
1784             r22              = _mm_mul_ps(rsq22,rinv22);
1785
1786             /* Calculate table index by multiplying r with table scale and truncate to integer */
1787             rt               = _mm_mul_ps(r22,vftabscale);
1788             vfitab           = _mm_cvttps_epi32(rt);
1789             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1790             vfitab           = _mm_slli_epi32(vfitab,2);
1791
1792             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1793             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1794             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1795             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1796             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1797             _MM_TRANSPOSE4_PS(Y,F,G,H);
1798             Heps             = _mm_mul_ps(vfeps,H);
1799             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1800             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1801             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1802
1803             fscal            = felec;
1804
1805             /* Calculate temporary vectorial force */
1806             tx               = _mm_mul_ps(fscal,dx22);
1807             ty               = _mm_mul_ps(fscal,dy22);
1808             tz               = _mm_mul_ps(fscal,dz22);
1809
1810             /* Update vectorial force */
1811             fix2             = _mm_add_ps(fix2,tx);
1812             fiy2             = _mm_add_ps(fiy2,ty);
1813             fiz2             = _mm_add_ps(fiz2,tz);
1814
1815             fjx2             = _mm_add_ps(fjx2,tx);
1816             fjy2             = _mm_add_ps(fjy2,ty);
1817             fjz2             = _mm_add_ps(fjz2,tz);
1818
1819             /**************************
1820              * CALCULATE INTERACTIONS *
1821              **************************/
1822
1823             r23              = _mm_mul_ps(rsq23,rinv23);
1824
1825             /* Calculate table index by multiplying r with table scale and truncate to integer */
1826             rt               = _mm_mul_ps(r23,vftabscale);
1827             vfitab           = _mm_cvttps_epi32(rt);
1828             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1829             vfitab           = _mm_slli_epi32(vfitab,2);
1830
1831             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1832             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1833             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1834             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1835             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1836             _MM_TRANSPOSE4_PS(Y,F,G,H);
1837             Heps             = _mm_mul_ps(vfeps,H);
1838             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1839             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1840             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1841
1842             fscal            = felec;
1843
1844             /* Calculate temporary vectorial force */
1845             tx               = _mm_mul_ps(fscal,dx23);
1846             ty               = _mm_mul_ps(fscal,dy23);
1847             tz               = _mm_mul_ps(fscal,dz23);
1848
1849             /* Update vectorial force */
1850             fix2             = _mm_add_ps(fix2,tx);
1851             fiy2             = _mm_add_ps(fiy2,ty);
1852             fiz2             = _mm_add_ps(fiz2,tz);
1853
1854             fjx3             = _mm_add_ps(fjx3,tx);
1855             fjy3             = _mm_add_ps(fjy3,ty);
1856             fjz3             = _mm_add_ps(fjz3,tz);
1857
1858             /**************************
1859              * CALCULATE INTERACTIONS *
1860              **************************/
1861
1862             r31              = _mm_mul_ps(rsq31,rinv31);
1863
1864             /* Calculate table index by multiplying r with table scale and truncate to integer */
1865             rt               = _mm_mul_ps(r31,vftabscale);
1866             vfitab           = _mm_cvttps_epi32(rt);
1867             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1868             vfitab           = _mm_slli_epi32(vfitab,2);
1869
1870             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1871             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1872             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1873             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1874             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1875             _MM_TRANSPOSE4_PS(Y,F,G,H);
1876             Heps             = _mm_mul_ps(vfeps,H);
1877             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1878             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1879             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1880
1881             fscal            = felec;
1882
1883             /* Calculate temporary vectorial force */
1884             tx               = _mm_mul_ps(fscal,dx31);
1885             ty               = _mm_mul_ps(fscal,dy31);
1886             tz               = _mm_mul_ps(fscal,dz31);
1887
1888             /* Update vectorial force */
1889             fix3             = _mm_add_ps(fix3,tx);
1890             fiy3             = _mm_add_ps(fiy3,ty);
1891             fiz3             = _mm_add_ps(fiz3,tz);
1892
1893             fjx1             = _mm_add_ps(fjx1,tx);
1894             fjy1             = _mm_add_ps(fjy1,ty);
1895             fjz1             = _mm_add_ps(fjz1,tz);
1896
1897             /**************************
1898              * CALCULATE INTERACTIONS *
1899              **************************/
1900
1901             r32              = _mm_mul_ps(rsq32,rinv32);
1902
1903             /* Calculate table index by multiplying r with table scale and truncate to integer */
1904             rt               = _mm_mul_ps(r32,vftabscale);
1905             vfitab           = _mm_cvttps_epi32(rt);
1906             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1907             vfitab           = _mm_slli_epi32(vfitab,2);
1908
1909             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1910             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1911             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1912             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1913             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1914             _MM_TRANSPOSE4_PS(Y,F,G,H);
1915             Heps             = _mm_mul_ps(vfeps,H);
1916             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1917             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1918             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1919
1920             fscal            = felec;
1921
1922             /* Calculate temporary vectorial force */
1923             tx               = _mm_mul_ps(fscal,dx32);
1924             ty               = _mm_mul_ps(fscal,dy32);
1925             tz               = _mm_mul_ps(fscal,dz32);
1926
1927             /* Update vectorial force */
1928             fix3             = _mm_add_ps(fix3,tx);
1929             fiy3             = _mm_add_ps(fiy3,ty);
1930             fiz3             = _mm_add_ps(fiz3,tz);
1931
1932             fjx2             = _mm_add_ps(fjx2,tx);
1933             fjy2             = _mm_add_ps(fjy2,ty);
1934             fjz2             = _mm_add_ps(fjz2,tz);
1935
1936             /**************************
1937              * CALCULATE INTERACTIONS *
1938              **************************/
1939
1940             r33              = _mm_mul_ps(rsq33,rinv33);
1941
1942             /* Calculate table index by multiplying r with table scale and truncate to integer */
1943             rt               = _mm_mul_ps(r33,vftabscale);
1944             vfitab           = _mm_cvttps_epi32(rt);
1945             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1946             vfitab           = _mm_slli_epi32(vfitab,2);
1947
1948             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1949             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1950             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1951             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1952             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1953             _MM_TRANSPOSE4_PS(Y,F,G,H);
1954             Heps             = _mm_mul_ps(vfeps,H);
1955             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1956             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1957             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1958
1959             fscal            = felec;
1960
1961             /* Calculate temporary vectorial force */
1962             tx               = _mm_mul_ps(fscal,dx33);
1963             ty               = _mm_mul_ps(fscal,dy33);
1964             tz               = _mm_mul_ps(fscal,dz33);
1965
1966             /* Update vectorial force */
1967             fix3             = _mm_add_ps(fix3,tx);
1968             fiy3             = _mm_add_ps(fiy3,ty);
1969             fiz3             = _mm_add_ps(fiz3,tz);
1970
1971             fjx3             = _mm_add_ps(fjx3,tx);
1972             fjy3             = _mm_add_ps(fjy3,ty);
1973             fjz3             = _mm_add_ps(fjz3,tz);
1974
1975             fjptrA             = f+j_coord_offsetA;
1976             fjptrB             = f+j_coord_offsetB;
1977             fjptrC             = f+j_coord_offsetC;
1978             fjptrD             = f+j_coord_offsetD;
1979
1980             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1981                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1982                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1983
1984             /* Inner loop uses 381 flops */
1985         }
1986
1987         if(jidx<j_index_end)
1988         {
1989
1990             /* Get j neighbor index, and coordinate index */
1991             jnrlistA         = jjnr[jidx];
1992             jnrlistB         = jjnr[jidx+1];
1993             jnrlistC         = jjnr[jidx+2];
1994             jnrlistD         = jjnr[jidx+3];
1995             /* Sign of each element will be negative for non-real atoms.
1996              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1997              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1998              */
1999             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2000             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2001             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2002             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2003             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2004             j_coord_offsetA  = DIM*jnrA;
2005             j_coord_offsetB  = DIM*jnrB;
2006             j_coord_offsetC  = DIM*jnrC;
2007             j_coord_offsetD  = DIM*jnrD;
2008
2009             /* load j atom coordinates */
2010             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2011                                               x+j_coord_offsetC,x+j_coord_offsetD,
2012                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2013                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2014
2015             /* Calculate displacement vector */
2016             dx00             = _mm_sub_ps(ix0,jx0);
2017             dy00             = _mm_sub_ps(iy0,jy0);
2018             dz00             = _mm_sub_ps(iz0,jz0);
2019             dx11             = _mm_sub_ps(ix1,jx1);
2020             dy11             = _mm_sub_ps(iy1,jy1);
2021             dz11             = _mm_sub_ps(iz1,jz1);
2022             dx12             = _mm_sub_ps(ix1,jx2);
2023             dy12             = _mm_sub_ps(iy1,jy2);
2024             dz12             = _mm_sub_ps(iz1,jz2);
2025             dx13             = _mm_sub_ps(ix1,jx3);
2026             dy13             = _mm_sub_ps(iy1,jy3);
2027             dz13             = _mm_sub_ps(iz1,jz3);
2028             dx21             = _mm_sub_ps(ix2,jx1);
2029             dy21             = _mm_sub_ps(iy2,jy1);
2030             dz21             = _mm_sub_ps(iz2,jz1);
2031             dx22             = _mm_sub_ps(ix2,jx2);
2032             dy22             = _mm_sub_ps(iy2,jy2);
2033             dz22             = _mm_sub_ps(iz2,jz2);
2034             dx23             = _mm_sub_ps(ix2,jx3);
2035             dy23             = _mm_sub_ps(iy2,jy3);
2036             dz23             = _mm_sub_ps(iz2,jz3);
2037             dx31             = _mm_sub_ps(ix3,jx1);
2038             dy31             = _mm_sub_ps(iy3,jy1);
2039             dz31             = _mm_sub_ps(iz3,jz1);
2040             dx32             = _mm_sub_ps(ix3,jx2);
2041             dy32             = _mm_sub_ps(iy3,jy2);
2042             dz32             = _mm_sub_ps(iz3,jz2);
2043             dx33             = _mm_sub_ps(ix3,jx3);
2044             dy33             = _mm_sub_ps(iy3,jy3);
2045             dz33             = _mm_sub_ps(iz3,jz3);
2046
2047             /* Calculate squared distance and things based on it */
2048             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2049             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2050             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2051             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2052             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2053             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2054             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2055             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2056             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2057             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2058
2059             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2060             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2061             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2062             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2063             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2064             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2065             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2066             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2067             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2068
2069             rinvsq00         = gmx_mm_inv_ps(rsq00);
2070
2071             fjx0             = _mm_setzero_ps();
2072             fjy0             = _mm_setzero_ps();
2073             fjz0             = _mm_setzero_ps();
2074             fjx1             = _mm_setzero_ps();
2075             fjy1             = _mm_setzero_ps();
2076             fjz1             = _mm_setzero_ps();
2077             fjx2             = _mm_setzero_ps();
2078             fjy2             = _mm_setzero_ps();
2079             fjz2             = _mm_setzero_ps();
2080             fjx3             = _mm_setzero_ps();
2081             fjy3             = _mm_setzero_ps();
2082             fjz3             = _mm_setzero_ps();
2083
2084             /**************************
2085              * CALCULATE INTERACTIONS *
2086              **************************/
2087
2088             /* LENNARD-JONES DISPERSION/REPULSION */
2089
2090             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2091             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2092
2093             fscal            = fvdw;
2094
2095             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2096
2097             /* Calculate temporary vectorial force */
2098             tx               = _mm_mul_ps(fscal,dx00);
2099             ty               = _mm_mul_ps(fscal,dy00);
2100             tz               = _mm_mul_ps(fscal,dz00);
2101
2102             /* Update vectorial force */
2103             fix0             = _mm_add_ps(fix0,tx);
2104             fiy0             = _mm_add_ps(fiy0,ty);
2105             fiz0             = _mm_add_ps(fiz0,tz);
2106
2107             fjx0             = _mm_add_ps(fjx0,tx);
2108             fjy0             = _mm_add_ps(fjy0,ty);
2109             fjz0             = _mm_add_ps(fjz0,tz);
2110
2111             /**************************
2112              * CALCULATE INTERACTIONS *
2113              **************************/
2114
2115             r11              = _mm_mul_ps(rsq11,rinv11);
2116             r11              = _mm_andnot_ps(dummy_mask,r11);
2117
2118             /* Calculate table index by multiplying r with table scale and truncate to integer */
2119             rt               = _mm_mul_ps(r11,vftabscale);
2120             vfitab           = _mm_cvttps_epi32(rt);
2121             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2122             vfitab           = _mm_slli_epi32(vfitab,2);
2123
2124             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2125             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2126             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2127             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2128             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2129             _MM_TRANSPOSE4_PS(Y,F,G,H);
2130             Heps             = _mm_mul_ps(vfeps,H);
2131             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2132             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2133             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2134
2135             fscal            = felec;
2136
2137             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2138
2139             /* Calculate temporary vectorial force */
2140             tx               = _mm_mul_ps(fscal,dx11);
2141             ty               = _mm_mul_ps(fscal,dy11);
2142             tz               = _mm_mul_ps(fscal,dz11);
2143
2144             /* Update vectorial force */
2145             fix1             = _mm_add_ps(fix1,tx);
2146             fiy1             = _mm_add_ps(fiy1,ty);
2147             fiz1             = _mm_add_ps(fiz1,tz);
2148
2149             fjx1             = _mm_add_ps(fjx1,tx);
2150             fjy1             = _mm_add_ps(fjy1,ty);
2151             fjz1             = _mm_add_ps(fjz1,tz);
2152
2153             /**************************
2154              * CALCULATE INTERACTIONS *
2155              **************************/
2156
2157             r12              = _mm_mul_ps(rsq12,rinv12);
2158             r12              = _mm_andnot_ps(dummy_mask,r12);
2159
2160             /* Calculate table index by multiplying r with table scale and truncate to integer */
2161             rt               = _mm_mul_ps(r12,vftabscale);
2162             vfitab           = _mm_cvttps_epi32(rt);
2163             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2164             vfitab           = _mm_slli_epi32(vfitab,2);
2165
2166             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2167             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2168             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2169             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2170             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2171             _MM_TRANSPOSE4_PS(Y,F,G,H);
2172             Heps             = _mm_mul_ps(vfeps,H);
2173             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2174             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2175             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2176
2177             fscal            = felec;
2178
2179             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2180
2181             /* Calculate temporary vectorial force */
2182             tx               = _mm_mul_ps(fscal,dx12);
2183             ty               = _mm_mul_ps(fscal,dy12);
2184             tz               = _mm_mul_ps(fscal,dz12);
2185
2186             /* Update vectorial force */
2187             fix1             = _mm_add_ps(fix1,tx);
2188             fiy1             = _mm_add_ps(fiy1,ty);
2189             fiz1             = _mm_add_ps(fiz1,tz);
2190
2191             fjx2             = _mm_add_ps(fjx2,tx);
2192             fjy2             = _mm_add_ps(fjy2,ty);
2193             fjz2             = _mm_add_ps(fjz2,tz);
2194
2195             /**************************
2196              * CALCULATE INTERACTIONS *
2197              **************************/
2198
2199             r13              = _mm_mul_ps(rsq13,rinv13);
2200             r13              = _mm_andnot_ps(dummy_mask,r13);
2201
2202             /* Calculate table index by multiplying r with table scale and truncate to integer */
2203             rt               = _mm_mul_ps(r13,vftabscale);
2204             vfitab           = _mm_cvttps_epi32(rt);
2205             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2206             vfitab           = _mm_slli_epi32(vfitab,2);
2207
2208             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2209             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2210             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2211             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2212             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2213             _MM_TRANSPOSE4_PS(Y,F,G,H);
2214             Heps             = _mm_mul_ps(vfeps,H);
2215             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2216             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2217             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
2218
2219             fscal            = felec;
2220
2221             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2222
2223             /* Calculate temporary vectorial force */
2224             tx               = _mm_mul_ps(fscal,dx13);
2225             ty               = _mm_mul_ps(fscal,dy13);
2226             tz               = _mm_mul_ps(fscal,dz13);
2227
2228             /* Update vectorial force */
2229             fix1             = _mm_add_ps(fix1,tx);
2230             fiy1             = _mm_add_ps(fiy1,ty);
2231             fiz1             = _mm_add_ps(fiz1,tz);
2232
2233             fjx3             = _mm_add_ps(fjx3,tx);
2234             fjy3             = _mm_add_ps(fjy3,ty);
2235             fjz3             = _mm_add_ps(fjz3,tz);
2236
2237             /**************************
2238              * CALCULATE INTERACTIONS *
2239              **************************/
2240
2241             r21              = _mm_mul_ps(rsq21,rinv21);
2242             r21              = _mm_andnot_ps(dummy_mask,r21);
2243
2244             /* Calculate table index by multiplying r with table scale and truncate to integer */
2245             rt               = _mm_mul_ps(r21,vftabscale);
2246             vfitab           = _mm_cvttps_epi32(rt);
2247             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2248             vfitab           = _mm_slli_epi32(vfitab,2);
2249
2250             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2251             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2252             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2253             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2254             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2255             _MM_TRANSPOSE4_PS(Y,F,G,H);
2256             Heps             = _mm_mul_ps(vfeps,H);
2257             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2258             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2259             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2260
2261             fscal            = felec;
2262
2263             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2264
2265             /* Calculate temporary vectorial force */
2266             tx               = _mm_mul_ps(fscal,dx21);
2267             ty               = _mm_mul_ps(fscal,dy21);
2268             tz               = _mm_mul_ps(fscal,dz21);
2269
2270             /* Update vectorial force */
2271             fix2             = _mm_add_ps(fix2,tx);
2272             fiy2             = _mm_add_ps(fiy2,ty);
2273             fiz2             = _mm_add_ps(fiz2,tz);
2274
2275             fjx1             = _mm_add_ps(fjx1,tx);
2276             fjy1             = _mm_add_ps(fjy1,ty);
2277             fjz1             = _mm_add_ps(fjz1,tz);
2278
2279             /**************************
2280              * CALCULATE INTERACTIONS *
2281              **************************/
2282
2283             r22              = _mm_mul_ps(rsq22,rinv22);
2284             r22              = _mm_andnot_ps(dummy_mask,r22);
2285
2286             /* Calculate table index by multiplying r with table scale and truncate to integer */
2287             rt               = _mm_mul_ps(r22,vftabscale);
2288             vfitab           = _mm_cvttps_epi32(rt);
2289             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2290             vfitab           = _mm_slli_epi32(vfitab,2);
2291
2292             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2293             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2294             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2295             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2296             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2297             _MM_TRANSPOSE4_PS(Y,F,G,H);
2298             Heps             = _mm_mul_ps(vfeps,H);
2299             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2300             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2301             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2302
2303             fscal            = felec;
2304
2305             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2306
2307             /* Calculate temporary vectorial force */
2308             tx               = _mm_mul_ps(fscal,dx22);
2309             ty               = _mm_mul_ps(fscal,dy22);
2310             tz               = _mm_mul_ps(fscal,dz22);
2311
2312             /* Update vectorial force */
2313             fix2             = _mm_add_ps(fix2,tx);
2314             fiy2             = _mm_add_ps(fiy2,ty);
2315             fiz2             = _mm_add_ps(fiz2,tz);
2316
2317             fjx2             = _mm_add_ps(fjx2,tx);
2318             fjy2             = _mm_add_ps(fjy2,ty);
2319             fjz2             = _mm_add_ps(fjz2,tz);
2320
2321             /**************************
2322              * CALCULATE INTERACTIONS *
2323              **************************/
2324
2325             r23              = _mm_mul_ps(rsq23,rinv23);
2326             r23              = _mm_andnot_ps(dummy_mask,r23);
2327
2328             /* Calculate table index by multiplying r with table scale and truncate to integer */
2329             rt               = _mm_mul_ps(r23,vftabscale);
2330             vfitab           = _mm_cvttps_epi32(rt);
2331             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2332             vfitab           = _mm_slli_epi32(vfitab,2);
2333
2334             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2335             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2336             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2337             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2338             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2339             _MM_TRANSPOSE4_PS(Y,F,G,H);
2340             Heps             = _mm_mul_ps(vfeps,H);
2341             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2342             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2343             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2344
2345             fscal            = felec;
2346
2347             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2348
2349             /* Calculate temporary vectorial force */
2350             tx               = _mm_mul_ps(fscal,dx23);
2351             ty               = _mm_mul_ps(fscal,dy23);
2352             tz               = _mm_mul_ps(fscal,dz23);
2353
2354             /* Update vectorial force */
2355             fix2             = _mm_add_ps(fix2,tx);
2356             fiy2             = _mm_add_ps(fiy2,ty);
2357             fiz2             = _mm_add_ps(fiz2,tz);
2358
2359             fjx3             = _mm_add_ps(fjx3,tx);
2360             fjy3             = _mm_add_ps(fjy3,ty);
2361             fjz3             = _mm_add_ps(fjz3,tz);
2362
2363             /**************************
2364              * CALCULATE INTERACTIONS *
2365              **************************/
2366
2367             r31              = _mm_mul_ps(rsq31,rinv31);
2368             r31              = _mm_andnot_ps(dummy_mask,r31);
2369
2370             /* Calculate table index by multiplying r with table scale and truncate to integer */
2371             rt               = _mm_mul_ps(r31,vftabscale);
2372             vfitab           = _mm_cvttps_epi32(rt);
2373             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2374             vfitab           = _mm_slli_epi32(vfitab,2);
2375
2376             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2377             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2378             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2379             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2380             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2381             _MM_TRANSPOSE4_PS(Y,F,G,H);
2382             Heps             = _mm_mul_ps(vfeps,H);
2383             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2384             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2385             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2386
2387             fscal            = felec;
2388
2389             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2390
2391             /* Calculate temporary vectorial force */
2392             tx               = _mm_mul_ps(fscal,dx31);
2393             ty               = _mm_mul_ps(fscal,dy31);
2394             tz               = _mm_mul_ps(fscal,dz31);
2395
2396             /* Update vectorial force */
2397             fix3             = _mm_add_ps(fix3,tx);
2398             fiy3             = _mm_add_ps(fiy3,ty);
2399             fiz3             = _mm_add_ps(fiz3,tz);
2400
2401             fjx1             = _mm_add_ps(fjx1,tx);
2402             fjy1             = _mm_add_ps(fjy1,ty);
2403             fjz1             = _mm_add_ps(fjz1,tz);
2404
2405             /**************************
2406              * CALCULATE INTERACTIONS *
2407              **************************/
2408
2409             r32              = _mm_mul_ps(rsq32,rinv32);
2410             r32              = _mm_andnot_ps(dummy_mask,r32);
2411
2412             /* Calculate table index by multiplying r with table scale and truncate to integer */
2413             rt               = _mm_mul_ps(r32,vftabscale);
2414             vfitab           = _mm_cvttps_epi32(rt);
2415             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2416             vfitab           = _mm_slli_epi32(vfitab,2);
2417
2418             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2419             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2420             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2421             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2422             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2423             _MM_TRANSPOSE4_PS(Y,F,G,H);
2424             Heps             = _mm_mul_ps(vfeps,H);
2425             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2426             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2427             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2428
2429             fscal            = felec;
2430
2431             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2432
2433             /* Calculate temporary vectorial force */
2434             tx               = _mm_mul_ps(fscal,dx32);
2435             ty               = _mm_mul_ps(fscal,dy32);
2436             tz               = _mm_mul_ps(fscal,dz32);
2437
2438             /* Update vectorial force */
2439             fix3             = _mm_add_ps(fix3,tx);
2440             fiy3             = _mm_add_ps(fiy3,ty);
2441             fiz3             = _mm_add_ps(fiz3,tz);
2442
2443             fjx2             = _mm_add_ps(fjx2,tx);
2444             fjy2             = _mm_add_ps(fjy2,ty);
2445             fjz2             = _mm_add_ps(fjz2,tz);
2446
2447             /**************************
2448              * CALCULATE INTERACTIONS *
2449              **************************/
2450
2451             r33              = _mm_mul_ps(rsq33,rinv33);
2452             r33              = _mm_andnot_ps(dummy_mask,r33);
2453
2454             /* Calculate table index by multiplying r with table scale and truncate to integer */
2455             rt               = _mm_mul_ps(r33,vftabscale);
2456             vfitab           = _mm_cvttps_epi32(rt);
2457             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2458             vfitab           = _mm_slli_epi32(vfitab,2);
2459
2460             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2461             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2462             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2463             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2464             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2465             _MM_TRANSPOSE4_PS(Y,F,G,H);
2466             Heps             = _mm_mul_ps(vfeps,H);
2467             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2468             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2469             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2470
2471             fscal            = felec;
2472
2473             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2474
2475             /* Calculate temporary vectorial force */
2476             tx               = _mm_mul_ps(fscal,dx33);
2477             ty               = _mm_mul_ps(fscal,dy33);
2478             tz               = _mm_mul_ps(fscal,dz33);
2479
2480             /* Update vectorial force */
2481             fix3             = _mm_add_ps(fix3,tx);
2482             fiy3             = _mm_add_ps(fiy3,ty);
2483             fiz3             = _mm_add_ps(fiz3,tz);
2484
2485             fjx3             = _mm_add_ps(fjx3,tx);
2486             fjy3             = _mm_add_ps(fjy3,ty);
2487             fjz3             = _mm_add_ps(fjz3,tz);
2488
2489             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2490             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2491             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2492             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2493
2494             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2495                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2496                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2497
2498             /* Inner loop uses 390 flops */
2499         }
2500
2501         /* End of innermost loop */
2502
2503         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2504                                               f+i_coord_offset,fshift+i_shift_offset);
2505
2506         /* Increment number of inner iterations */
2507         inneriter                  += j_index_end - j_index_start;
2508
2509         /* Outer loop uses 24 flops */
2510     }
2511
2512     /* Increment number of outer iterations */
2513     outeriter        += nri;
2514
2515     /* Update outer/inner flops */
2516
2517     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*390);
2518 }