Use full path for legacyheaders
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_single
52  * Electrostatics interaction: CubicSplineTable
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     int              nvdwtype;
107     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
108     int              *vdwtype;
109     real             *vdwparam;
110     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
111     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
112     __m128i          vfitab;
113     __m128i          ifour       = _mm_set1_epi32(4);
114     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
115     real             *vftab;
116     __m128           dummy_mask,cutoff_mask;
117     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
118     __m128           one     = _mm_set1_ps(1.0);
119     __m128           two     = _mm_set1_ps(2.0);
120     x                = xx[0];
121     f                = ff[0];
122
123     nri              = nlist->nri;
124     iinr             = nlist->iinr;
125     jindex           = nlist->jindex;
126     jjnr             = nlist->jjnr;
127     shiftidx         = nlist->shift;
128     gid              = nlist->gid;
129     shiftvec         = fr->shift_vec[0];
130     fshift           = fr->fshift[0];
131     facel            = _mm_set1_ps(fr->epsfac);
132     charge           = mdatoms->chargeA;
133     nvdwtype         = fr->ntype;
134     vdwparam         = fr->nbfp;
135     vdwtype          = mdatoms->typeA;
136
137     vftab            = kernel_data->table_elec->data;
138     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
139
140     /* Setup water-specific parameters */
141     inr              = nlist->iinr[0];
142     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
143     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
144     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
145     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
146
147     jq0              = _mm_set1_ps(charge[inr+0]);
148     jq1              = _mm_set1_ps(charge[inr+1]);
149     jq2              = _mm_set1_ps(charge[inr+2]);
150     vdwjidx0A        = 2*vdwtype[inr+0];
151     qq00             = _mm_mul_ps(iq0,jq0);
152     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
153     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
154     qq01             = _mm_mul_ps(iq0,jq1);
155     qq02             = _mm_mul_ps(iq0,jq2);
156     qq10             = _mm_mul_ps(iq1,jq0);
157     qq11             = _mm_mul_ps(iq1,jq1);
158     qq12             = _mm_mul_ps(iq1,jq2);
159     qq20             = _mm_mul_ps(iq2,jq0);
160     qq21             = _mm_mul_ps(iq2,jq1);
161     qq22             = _mm_mul_ps(iq2,jq2);
162
163     /* Avoid stupid compiler warnings */
164     jnrA = jnrB = jnrC = jnrD = 0;
165     j_coord_offsetA = 0;
166     j_coord_offsetB = 0;
167     j_coord_offsetC = 0;
168     j_coord_offsetD = 0;
169
170     outeriter        = 0;
171     inneriter        = 0;
172
173     for(iidx=0;iidx<4*DIM;iidx++)
174     {
175         scratch[iidx] = 0.0;
176     }
177
178     /* Start outer loop over neighborlists */
179     for(iidx=0; iidx<nri; iidx++)
180     {
181         /* Load shift vector for this list */
182         i_shift_offset   = DIM*shiftidx[iidx];
183
184         /* Load limits for loop over neighbors */
185         j_index_start    = jindex[iidx];
186         j_index_end      = jindex[iidx+1];
187
188         /* Get outer coordinate index */
189         inr              = iinr[iidx];
190         i_coord_offset   = DIM*inr;
191
192         /* Load i particle coords and add shift vector */
193         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
194                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
195
196         fix0             = _mm_setzero_ps();
197         fiy0             = _mm_setzero_ps();
198         fiz0             = _mm_setzero_ps();
199         fix1             = _mm_setzero_ps();
200         fiy1             = _mm_setzero_ps();
201         fiz1             = _mm_setzero_ps();
202         fix2             = _mm_setzero_ps();
203         fiy2             = _mm_setzero_ps();
204         fiz2             = _mm_setzero_ps();
205
206         /* Reset potential sums */
207         velecsum         = _mm_setzero_ps();
208         vvdwsum          = _mm_setzero_ps();
209
210         /* Start inner kernel loop */
211         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
212         {
213
214             /* Get j neighbor index, and coordinate index */
215             jnrA             = jjnr[jidx];
216             jnrB             = jjnr[jidx+1];
217             jnrC             = jjnr[jidx+2];
218             jnrD             = jjnr[jidx+3];
219             j_coord_offsetA  = DIM*jnrA;
220             j_coord_offsetB  = DIM*jnrB;
221             j_coord_offsetC  = DIM*jnrC;
222             j_coord_offsetD  = DIM*jnrD;
223
224             /* load j atom coordinates */
225             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
226                                               x+j_coord_offsetC,x+j_coord_offsetD,
227                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
228
229             /* Calculate displacement vector */
230             dx00             = _mm_sub_ps(ix0,jx0);
231             dy00             = _mm_sub_ps(iy0,jy0);
232             dz00             = _mm_sub_ps(iz0,jz0);
233             dx01             = _mm_sub_ps(ix0,jx1);
234             dy01             = _mm_sub_ps(iy0,jy1);
235             dz01             = _mm_sub_ps(iz0,jz1);
236             dx02             = _mm_sub_ps(ix0,jx2);
237             dy02             = _mm_sub_ps(iy0,jy2);
238             dz02             = _mm_sub_ps(iz0,jz2);
239             dx10             = _mm_sub_ps(ix1,jx0);
240             dy10             = _mm_sub_ps(iy1,jy0);
241             dz10             = _mm_sub_ps(iz1,jz0);
242             dx11             = _mm_sub_ps(ix1,jx1);
243             dy11             = _mm_sub_ps(iy1,jy1);
244             dz11             = _mm_sub_ps(iz1,jz1);
245             dx12             = _mm_sub_ps(ix1,jx2);
246             dy12             = _mm_sub_ps(iy1,jy2);
247             dz12             = _mm_sub_ps(iz1,jz2);
248             dx20             = _mm_sub_ps(ix2,jx0);
249             dy20             = _mm_sub_ps(iy2,jy0);
250             dz20             = _mm_sub_ps(iz2,jz0);
251             dx21             = _mm_sub_ps(ix2,jx1);
252             dy21             = _mm_sub_ps(iy2,jy1);
253             dz21             = _mm_sub_ps(iz2,jz1);
254             dx22             = _mm_sub_ps(ix2,jx2);
255             dy22             = _mm_sub_ps(iy2,jy2);
256             dz22             = _mm_sub_ps(iz2,jz2);
257
258             /* Calculate squared distance and things based on it */
259             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
260             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
261             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
262             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
263             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
264             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
265             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
266             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
267             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
268
269             rinv00           = gmx_mm_invsqrt_ps(rsq00);
270             rinv01           = gmx_mm_invsqrt_ps(rsq01);
271             rinv02           = gmx_mm_invsqrt_ps(rsq02);
272             rinv10           = gmx_mm_invsqrt_ps(rsq10);
273             rinv11           = gmx_mm_invsqrt_ps(rsq11);
274             rinv12           = gmx_mm_invsqrt_ps(rsq12);
275             rinv20           = gmx_mm_invsqrt_ps(rsq20);
276             rinv21           = gmx_mm_invsqrt_ps(rsq21);
277             rinv22           = gmx_mm_invsqrt_ps(rsq22);
278
279             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
280
281             fjx0             = _mm_setzero_ps();
282             fjy0             = _mm_setzero_ps();
283             fjz0             = _mm_setzero_ps();
284             fjx1             = _mm_setzero_ps();
285             fjy1             = _mm_setzero_ps();
286             fjz1             = _mm_setzero_ps();
287             fjx2             = _mm_setzero_ps();
288             fjy2             = _mm_setzero_ps();
289             fjz2             = _mm_setzero_ps();
290
291             /**************************
292              * CALCULATE INTERACTIONS *
293              **************************/
294
295             r00              = _mm_mul_ps(rsq00,rinv00);
296
297             /* Calculate table index by multiplying r with table scale and truncate to integer */
298             rt               = _mm_mul_ps(r00,vftabscale);
299             vfitab           = _mm_cvttps_epi32(rt);
300             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
301             vfitab           = _mm_slli_epi32(vfitab,2);
302
303             /* CUBIC SPLINE TABLE ELECTROSTATICS */
304             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
305             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
306             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
307             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
308             _MM_TRANSPOSE4_PS(Y,F,G,H);
309             Heps             = _mm_mul_ps(vfeps,H);
310             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
311             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
312             velec            = _mm_mul_ps(qq00,VV);
313             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
314             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
315
316             /* LENNARD-JONES DISPERSION/REPULSION */
317
318             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
319             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
320             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
321             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
322             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
323
324             /* Update potential sum for this i atom from the interaction with this j atom. */
325             velecsum         = _mm_add_ps(velecsum,velec);
326             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
327
328             fscal            = _mm_add_ps(felec,fvdw);
329
330             /* Calculate temporary vectorial force */
331             tx               = _mm_mul_ps(fscal,dx00);
332             ty               = _mm_mul_ps(fscal,dy00);
333             tz               = _mm_mul_ps(fscal,dz00);
334
335             /* Update vectorial force */
336             fix0             = _mm_add_ps(fix0,tx);
337             fiy0             = _mm_add_ps(fiy0,ty);
338             fiz0             = _mm_add_ps(fiz0,tz);
339
340             fjx0             = _mm_add_ps(fjx0,tx);
341             fjy0             = _mm_add_ps(fjy0,ty);
342             fjz0             = _mm_add_ps(fjz0,tz);
343
344             /**************************
345              * CALCULATE INTERACTIONS *
346              **************************/
347
348             r01              = _mm_mul_ps(rsq01,rinv01);
349
350             /* Calculate table index by multiplying r with table scale and truncate to integer */
351             rt               = _mm_mul_ps(r01,vftabscale);
352             vfitab           = _mm_cvttps_epi32(rt);
353             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
354             vfitab           = _mm_slli_epi32(vfitab,2);
355
356             /* CUBIC SPLINE TABLE ELECTROSTATICS */
357             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
358             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
359             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
360             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
361             _MM_TRANSPOSE4_PS(Y,F,G,H);
362             Heps             = _mm_mul_ps(vfeps,H);
363             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
364             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
365             velec            = _mm_mul_ps(qq01,VV);
366             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
367             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
368
369             /* Update potential sum for this i atom from the interaction with this j atom. */
370             velecsum         = _mm_add_ps(velecsum,velec);
371
372             fscal            = felec;
373
374             /* Calculate temporary vectorial force */
375             tx               = _mm_mul_ps(fscal,dx01);
376             ty               = _mm_mul_ps(fscal,dy01);
377             tz               = _mm_mul_ps(fscal,dz01);
378
379             /* Update vectorial force */
380             fix0             = _mm_add_ps(fix0,tx);
381             fiy0             = _mm_add_ps(fiy0,ty);
382             fiz0             = _mm_add_ps(fiz0,tz);
383
384             fjx1             = _mm_add_ps(fjx1,tx);
385             fjy1             = _mm_add_ps(fjy1,ty);
386             fjz1             = _mm_add_ps(fjz1,tz);
387
388             /**************************
389              * CALCULATE INTERACTIONS *
390              **************************/
391
392             r02              = _mm_mul_ps(rsq02,rinv02);
393
394             /* Calculate table index by multiplying r with table scale and truncate to integer */
395             rt               = _mm_mul_ps(r02,vftabscale);
396             vfitab           = _mm_cvttps_epi32(rt);
397             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
398             vfitab           = _mm_slli_epi32(vfitab,2);
399
400             /* CUBIC SPLINE TABLE ELECTROSTATICS */
401             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
402             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
403             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
404             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
405             _MM_TRANSPOSE4_PS(Y,F,G,H);
406             Heps             = _mm_mul_ps(vfeps,H);
407             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
408             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
409             velec            = _mm_mul_ps(qq02,VV);
410             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
411             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
412
413             /* Update potential sum for this i atom from the interaction with this j atom. */
414             velecsum         = _mm_add_ps(velecsum,velec);
415
416             fscal            = felec;
417
418             /* Calculate temporary vectorial force */
419             tx               = _mm_mul_ps(fscal,dx02);
420             ty               = _mm_mul_ps(fscal,dy02);
421             tz               = _mm_mul_ps(fscal,dz02);
422
423             /* Update vectorial force */
424             fix0             = _mm_add_ps(fix0,tx);
425             fiy0             = _mm_add_ps(fiy0,ty);
426             fiz0             = _mm_add_ps(fiz0,tz);
427
428             fjx2             = _mm_add_ps(fjx2,tx);
429             fjy2             = _mm_add_ps(fjy2,ty);
430             fjz2             = _mm_add_ps(fjz2,tz);
431
432             /**************************
433              * CALCULATE INTERACTIONS *
434              **************************/
435
436             r10              = _mm_mul_ps(rsq10,rinv10);
437
438             /* Calculate table index by multiplying r with table scale and truncate to integer */
439             rt               = _mm_mul_ps(r10,vftabscale);
440             vfitab           = _mm_cvttps_epi32(rt);
441             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
442             vfitab           = _mm_slli_epi32(vfitab,2);
443
444             /* CUBIC SPLINE TABLE ELECTROSTATICS */
445             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
446             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
447             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
448             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
449             _MM_TRANSPOSE4_PS(Y,F,G,H);
450             Heps             = _mm_mul_ps(vfeps,H);
451             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
452             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
453             velec            = _mm_mul_ps(qq10,VV);
454             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
455             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
456
457             /* Update potential sum for this i atom from the interaction with this j atom. */
458             velecsum         = _mm_add_ps(velecsum,velec);
459
460             fscal            = felec;
461
462             /* Calculate temporary vectorial force */
463             tx               = _mm_mul_ps(fscal,dx10);
464             ty               = _mm_mul_ps(fscal,dy10);
465             tz               = _mm_mul_ps(fscal,dz10);
466
467             /* Update vectorial force */
468             fix1             = _mm_add_ps(fix1,tx);
469             fiy1             = _mm_add_ps(fiy1,ty);
470             fiz1             = _mm_add_ps(fiz1,tz);
471
472             fjx0             = _mm_add_ps(fjx0,tx);
473             fjy0             = _mm_add_ps(fjy0,ty);
474             fjz0             = _mm_add_ps(fjz0,tz);
475
476             /**************************
477              * CALCULATE INTERACTIONS *
478              **************************/
479
480             r11              = _mm_mul_ps(rsq11,rinv11);
481
482             /* Calculate table index by multiplying r with table scale and truncate to integer */
483             rt               = _mm_mul_ps(r11,vftabscale);
484             vfitab           = _mm_cvttps_epi32(rt);
485             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
486             vfitab           = _mm_slli_epi32(vfitab,2);
487
488             /* CUBIC SPLINE TABLE ELECTROSTATICS */
489             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
490             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
491             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
492             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
493             _MM_TRANSPOSE4_PS(Y,F,G,H);
494             Heps             = _mm_mul_ps(vfeps,H);
495             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
496             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
497             velec            = _mm_mul_ps(qq11,VV);
498             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
499             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
500
501             /* Update potential sum for this i atom from the interaction with this j atom. */
502             velecsum         = _mm_add_ps(velecsum,velec);
503
504             fscal            = felec;
505
506             /* Calculate temporary vectorial force */
507             tx               = _mm_mul_ps(fscal,dx11);
508             ty               = _mm_mul_ps(fscal,dy11);
509             tz               = _mm_mul_ps(fscal,dz11);
510
511             /* Update vectorial force */
512             fix1             = _mm_add_ps(fix1,tx);
513             fiy1             = _mm_add_ps(fiy1,ty);
514             fiz1             = _mm_add_ps(fiz1,tz);
515
516             fjx1             = _mm_add_ps(fjx1,tx);
517             fjy1             = _mm_add_ps(fjy1,ty);
518             fjz1             = _mm_add_ps(fjz1,tz);
519
520             /**************************
521              * CALCULATE INTERACTIONS *
522              **************************/
523
524             r12              = _mm_mul_ps(rsq12,rinv12);
525
526             /* Calculate table index by multiplying r with table scale and truncate to integer */
527             rt               = _mm_mul_ps(r12,vftabscale);
528             vfitab           = _mm_cvttps_epi32(rt);
529             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
530             vfitab           = _mm_slli_epi32(vfitab,2);
531
532             /* CUBIC SPLINE TABLE ELECTROSTATICS */
533             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
534             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
535             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
536             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
537             _MM_TRANSPOSE4_PS(Y,F,G,H);
538             Heps             = _mm_mul_ps(vfeps,H);
539             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
540             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
541             velec            = _mm_mul_ps(qq12,VV);
542             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
543             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
544
545             /* Update potential sum for this i atom from the interaction with this j atom. */
546             velecsum         = _mm_add_ps(velecsum,velec);
547
548             fscal            = felec;
549
550             /* Calculate temporary vectorial force */
551             tx               = _mm_mul_ps(fscal,dx12);
552             ty               = _mm_mul_ps(fscal,dy12);
553             tz               = _mm_mul_ps(fscal,dz12);
554
555             /* Update vectorial force */
556             fix1             = _mm_add_ps(fix1,tx);
557             fiy1             = _mm_add_ps(fiy1,ty);
558             fiz1             = _mm_add_ps(fiz1,tz);
559
560             fjx2             = _mm_add_ps(fjx2,tx);
561             fjy2             = _mm_add_ps(fjy2,ty);
562             fjz2             = _mm_add_ps(fjz2,tz);
563
564             /**************************
565              * CALCULATE INTERACTIONS *
566              **************************/
567
568             r20              = _mm_mul_ps(rsq20,rinv20);
569
570             /* Calculate table index by multiplying r with table scale and truncate to integer */
571             rt               = _mm_mul_ps(r20,vftabscale);
572             vfitab           = _mm_cvttps_epi32(rt);
573             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
574             vfitab           = _mm_slli_epi32(vfitab,2);
575
576             /* CUBIC SPLINE TABLE ELECTROSTATICS */
577             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
578             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
579             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
580             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
581             _MM_TRANSPOSE4_PS(Y,F,G,H);
582             Heps             = _mm_mul_ps(vfeps,H);
583             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
584             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
585             velec            = _mm_mul_ps(qq20,VV);
586             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
587             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
588
589             /* Update potential sum for this i atom from the interaction with this j atom. */
590             velecsum         = _mm_add_ps(velecsum,velec);
591
592             fscal            = felec;
593
594             /* Calculate temporary vectorial force */
595             tx               = _mm_mul_ps(fscal,dx20);
596             ty               = _mm_mul_ps(fscal,dy20);
597             tz               = _mm_mul_ps(fscal,dz20);
598
599             /* Update vectorial force */
600             fix2             = _mm_add_ps(fix2,tx);
601             fiy2             = _mm_add_ps(fiy2,ty);
602             fiz2             = _mm_add_ps(fiz2,tz);
603
604             fjx0             = _mm_add_ps(fjx0,tx);
605             fjy0             = _mm_add_ps(fjy0,ty);
606             fjz0             = _mm_add_ps(fjz0,tz);
607
608             /**************************
609              * CALCULATE INTERACTIONS *
610              **************************/
611
612             r21              = _mm_mul_ps(rsq21,rinv21);
613
614             /* Calculate table index by multiplying r with table scale and truncate to integer */
615             rt               = _mm_mul_ps(r21,vftabscale);
616             vfitab           = _mm_cvttps_epi32(rt);
617             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
618             vfitab           = _mm_slli_epi32(vfitab,2);
619
620             /* CUBIC SPLINE TABLE ELECTROSTATICS */
621             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
622             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
623             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
624             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
625             _MM_TRANSPOSE4_PS(Y,F,G,H);
626             Heps             = _mm_mul_ps(vfeps,H);
627             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
628             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
629             velec            = _mm_mul_ps(qq21,VV);
630             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
631             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
632
633             /* Update potential sum for this i atom from the interaction with this j atom. */
634             velecsum         = _mm_add_ps(velecsum,velec);
635
636             fscal            = felec;
637
638             /* Calculate temporary vectorial force */
639             tx               = _mm_mul_ps(fscal,dx21);
640             ty               = _mm_mul_ps(fscal,dy21);
641             tz               = _mm_mul_ps(fscal,dz21);
642
643             /* Update vectorial force */
644             fix2             = _mm_add_ps(fix2,tx);
645             fiy2             = _mm_add_ps(fiy2,ty);
646             fiz2             = _mm_add_ps(fiz2,tz);
647
648             fjx1             = _mm_add_ps(fjx1,tx);
649             fjy1             = _mm_add_ps(fjy1,ty);
650             fjz1             = _mm_add_ps(fjz1,tz);
651
652             /**************************
653              * CALCULATE INTERACTIONS *
654              **************************/
655
656             r22              = _mm_mul_ps(rsq22,rinv22);
657
658             /* Calculate table index by multiplying r with table scale and truncate to integer */
659             rt               = _mm_mul_ps(r22,vftabscale);
660             vfitab           = _mm_cvttps_epi32(rt);
661             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
662             vfitab           = _mm_slli_epi32(vfitab,2);
663
664             /* CUBIC SPLINE TABLE ELECTROSTATICS */
665             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
666             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
667             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
668             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
669             _MM_TRANSPOSE4_PS(Y,F,G,H);
670             Heps             = _mm_mul_ps(vfeps,H);
671             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
672             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
673             velec            = _mm_mul_ps(qq22,VV);
674             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
675             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
676
677             /* Update potential sum for this i atom from the interaction with this j atom. */
678             velecsum         = _mm_add_ps(velecsum,velec);
679
680             fscal            = felec;
681
682             /* Calculate temporary vectorial force */
683             tx               = _mm_mul_ps(fscal,dx22);
684             ty               = _mm_mul_ps(fscal,dy22);
685             tz               = _mm_mul_ps(fscal,dz22);
686
687             /* Update vectorial force */
688             fix2             = _mm_add_ps(fix2,tx);
689             fiy2             = _mm_add_ps(fiy2,ty);
690             fiz2             = _mm_add_ps(fiz2,tz);
691
692             fjx2             = _mm_add_ps(fjx2,tx);
693             fjy2             = _mm_add_ps(fjy2,ty);
694             fjz2             = _mm_add_ps(fjz2,tz);
695
696             fjptrA             = f+j_coord_offsetA;
697             fjptrB             = f+j_coord_offsetB;
698             fjptrC             = f+j_coord_offsetC;
699             fjptrD             = f+j_coord_offsetD;
700
701             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
702                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
703
704             /* Inner loop uses 400 flops */
705         }
706
707         if(jidx<j_index_end)
708         {
709
710             /* Get j neighbor index, and coordinate index */
711             jnrlistA         = jjnr[jidx];
712             jnrlistB         = jjnr[jidx+1];
713             jnrlistC         = jjnr[jidx+2];
714             jnrlistD         = jjnr[jidx+3];
715             /* Sign of each element will be negative for non-real atoms.
716              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
717              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
718              */
719             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
720             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
721             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
722             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
723             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
724             j_coord_offsetA  = DIM*jnrA;
725             j_coord_offsetB  = DIM*jnrB;
726             j_coord_offsetC  = DIM*jnrC;
727             j_coord_offsetD  = DIM*jnrD;
728
729             /* load j atom coordinates */
730             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
731                                               x+j_coord_offsetC,x+j_coord_offsetD,
732                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
733
734             /* Calculate displacement vector */
735             dx00             = _mm_sub_ps(ix0,jx0);
736             dy00             = _mm_sub_ps(iy0,jy0);
737             dz00             = _mm_sub_ps(iz0,jz0);
738             dx01             = _mm_sub_ps(ix0,jx1);
739             dy01             = _mm_sub_ps(iy0,jy1);
740             dz01             = _mm_sub_ps(iz0,jz1);
741             dx02             = _mm_sub_ps(ix0,jx2);
742             dy02             = _mm_sub_ps(iy0,jy2);
743             dz02             = _mm_sub_ps(iz0,jz2);
744             dx10             = _mm_sub_ps(ix1,jx0);
745             dy10             = _mm_sub_ps(iy1,jy0);
746             dz10             = _mm_sub_ps(iz1,jz0);
747             dx11             = _mm_sub_ps(ix1,jx1);
748             dy11             = _mm_sub_ps(iy1,jy1);
749             dz11             = _mm_sub_ps(iz1,jz1);
750             dx12             = _mm_sub_ps(ix1,jx2);
751             dy12             = _mm_sub_ps(iy1,jy2);
752             dz12             = _mm_sub_ps(iz1,jz2);
753             dx20             = _mm_sub_ps(ix2,jx0);
754             dy20             = _mm_sub_ps(iy2,jy0);
755             dz20             = _mm_sub_ps(iz2,jz0);
756             dx21             = _mm_sub_ps(ix2,jx1);
757             dy21             = _mm_sub_ps(iy2,jy1);
758             dz21             = _mm_sub_ps(iz2,jz1);
759             dx22             = _mm_sub_ps(ix2,jx2);
760             dy22             = _mm_sub_ps(iy2,jy2);
761             dz22             = _mm_sub_ps(iz2,jz2);
762
763             /* Calculate squared distance and things based on it */
764             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
765             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
766             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
767             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
768             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
769             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
770             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
771             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
772             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
773
774             rinv00           = gmx_mm_invsqrt_ps(rsq00);
775             rinv01           = gmx_mm_invsqrt_ps(rsq01);
776             rinv02           = gmx_mm_invsqrt_ps(rsq02);
777             rinv10           = gmx_mm_invsqrt_ps(rsq10);
778             rinv11           = gmx_mm_invsqrt_ps(rsq11);
779             rinv12           = gmx_mm_invsqrt_ps(rsq12);
780             rinv20           = gmx_mm_invsqrt_ps(rsq20);
781             rinv21           = gmx_mm_invsqrt_ps(rsq21);
782             rinv22           = gmx_mm_invsqrt_ps(rsq22);
783
784             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
785
786             fjx0             = _mm_setzero_ps();
787             fjy0             = _mm_setzero_ps();
788             fjz0             = _mm_setzero_ps();
789             fjx1             = _mm_setzero_ps();
790             fjy1             = _mm_setzero_ps();
791             fjz1             = _mm_setzero_ps();
792             fjx2             = _mm_setzero_ps();
793             fjy2             = _mm_setzero_ps();
794             fjz2             = _mm_setzero_ps();
795
796             /**************************
797              * CALCULATE INTERACTIONS *
798              **************************/
799
800             r00              = _mm_mul_ps(rsq00,rinv00);
801             r00              = _mm_andnot_ps(dummy_mask,r00);
802
803             /* Calculate table index by multiplying r with table scale and truncate to integer */
804             rt               = _mm_mul_ps(r00,vftabscale);
805             vfitab           = _mm_cvttps_epi32(rt);
806             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
807             vfitab           = _mm_slli_epi32(vfitab,2);
808
809             /* CUBIC SPLINE TABLE ELECTROSTATICS */
810             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
811             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
812             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
813             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
814             _MM_TRANSPOSE4_PS(Y,F,G,H);
815             Heps             = _mm_mul_ps(vfeps,H);
816             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
817             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
818             velec            = _mm_mul_ps(qq00,VV);
819             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
820             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
821
822             /* LENNARD-JONES DISPERSION/REPULSION */
823
824             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
825             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
826             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
827             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
828             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
829
830             /* Update potential sum for this i atom from the interaction with this j atom. */
831             velec            = _mm_andnot_ps(dummy_mask,velec);
832             velecsum         = _mm_add_ps(velecsum,velec);
833             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
834             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
835
836             fscal            = _mm_add_ps(felec,fvdw);
837
838             fscal            = _mm_andnot_ps(dummy_mask,fscal);
839
840             /* Calculate temporary vectorial force */
841             tx               = _mm_mul_ps(fscal,dx00);
842             ty               = _mm_mul_ps(fscal,dy00);
843             tz               = _mm_mul_ps(fscal,dz00);
844
845             /* Update vectorial force */
846             fix0             = _mm_add_ps(fix0,tx);
847             fiy0             = _mm_add_ps(fiy0,ty);
848             fiz0             = _mm_add_ps(fiz0,tz);
849
850             fjx0             = _mm_add_ps(fjx0,tx);
851             fjy0             = _mm_add_ps(fjy0,ty);
852             fjz0             = _mm_add_ps(fjz0,tz);
853
854             /**************************
855              * CALCULATE INTERACTIONS *
856              **************************/
857
858             r01              = _mm_mul_ps(rsq01,rinv01);
859             r01              = _mm_andnot_ps(dummy_mask,r01);
860
861             /* Calculate table index by multiplying r with table scale and truncate to integer */
862             rt               = _mm_mul_ps(r01,vftabscale);
863             vfitab           = _mm_cvttps_epi32(rt);
864             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
865             vfitab           = _mm_slli_epi32(vfitab,2);
866
867             /* CUBIC SPLINE TABLE ELECTROSTATICS */
868             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
869             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
870             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
871             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
872             _MM_TRANSPOSE4_PS(Y,F,G,H);
873             Heps             = _mm_mul_ps(vfeps,H);
874             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
875             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
876             velec            = _mm_mul_ps(qq01,VV);
877             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
878             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
879
880             /* Update potential sum for this i atom from the interaction with this j atom. */
881             velec            = _mm_andnot_ps(dummy_mask,velec);
882             velecsum         = _mm_add_ps(velecsum,velec);
883
884             fscal            = felec;
885
886             fscal            = _mm_andnot_ps(dummy_mask,fscal);
887
888             /* Calculate temporary vectorial force */
889             tx               = _mm_mul_ps(fscal,dx01);
890             ty               = _mm_mul_ps(fscal,dy01);
891             tz               = _mm_mul_ps(fscal,dz01);
892
893             /* Update vectorial force */
894             fix0             = _mm_add_ps(fix0,tx);
895             fiy0             = _mm_add_ps(fiy0,ty);
896             fiz0             = _mm_add_ps(fiz0,tz);
897
898             fjx1             = _mm_add_ps(fjx1,tx);
899             fjy1             = _mm_add_ps(fjy1,ty);
900             fjz1             = _mm_add_ps(fjz1,tz);
901
902             /**************************
903              * CALCULATE INTERACTIONS *
904              **************************/
905
906             r02              = _mm_mul_ps(rsq02,rinv02);
907             r02              = _mm_andnot_ps(dummy_mask,r02);
908
909             /* Calculate table index by multiplying r with table scale and truncate to integer */
910             rt               = _mm_mul_ps(r02,vftabscale);
911             vfitab           = _mm_cvttps_epi32(rt);
912             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
913             vfitab           = _mm_slli_epi32(vfitab,2);
914
915             /* CUBIC SPLINE TABLE ELECTROSTATICS */
916             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
917             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
918             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
919             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
920             _MM_TRANSPOSE4_PS(Y,F,G,H);
921             Heps             = _mm_mul_ps(vfeps,H);
922             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
923             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
924             velec            = _mm_mul_ps(qq02,VV);
925             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
926             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
927
928             /* Update potential sum for this i atom from the interaction with this j atom. */
929             velec            = _mm_andnot_ps(dummy_mask,velec);
930             velecsum         = _mm_add_ps(velecsum,velec);
931
932             fscal            = felec;
933
934             fscal            = _mm_andnot_ps(dummy_mask,fscal);
935
936             /* Calculate temporary vectorial force */
937             tx               = _mm_mul_ps(fscal,dx02);
938             ty               = _mm_mul_ps(fscal,dy02);
939             tz               = _mm_mul_ps(fscal,dz02);
940
941             /* Update vectorial force */
942             fix0             = _mm_add_ps(fix0,tx);
943             fiy0             = _mm_add_ps(fiy0,ty);
944             fiz0             = _mm_add_ps(fiz0,tz);
945
946             fjx2             = _mm_add_ps(fjx2,tx);
947             fjy2             = _mm_add_ps(fjy2,ty);
948             fjz2             = _mm_add_ps(fjz2,tz);
949
950             /**************************
951              * CALCULATE INTERACTIONS *
952              **************************/
953
954             r10              = _mm_mul_ps(rsq10,rinv10);
955             r10              = _mm_andnot_ps(dummy_mask,r10);
956
957             /* Calculate table index by multiplying r with table scale and truncate to integer */
958             rt               = _mm_mul_ps(r10,vftabscale);
959             vfitab           = _mm_cvttps_epi32(rt);
960             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
961             vfitab           = _mm_slli_epi32(vfitab,2);
962
963             /* CUBIC SPLINE TABLE ELECTROSTATICS */
964             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
965             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
966             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
967             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
968             _MM_TRANSPOSE4_PS(Y,F,G,H);
969             Heps             = _mm_mul_ps(vfeps,H);
970             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
971             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
972             velec            = _mm_mul_ps(qq10,VV);
973             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
974             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
975
976             /* Update potential sum for this i atom from the interaction with this j atom. */
977             velec            = _mm_andnot_ps(dummy_mask,velec);
978             velecsum         = _mm_add_ps(velecsum,velec);
979
980             fscal            = felec;
981
982             fscal            = _mm_andnot_ps(dummy_mask,fscal);
983
984             /* Calculate temporary vectorial force */
985             tx               = _mm_mul_ps(fscal,dx10);
986             ty               = _mm_mul_ps(fscal,dy10);
987             tz               = _mm_mul_ps(fscal,dz10);
988
989             /* Update vectorial force */
990             fix1             = _mm_add_ps(fix1,tx);
991             fiy1             = _mm_add_ps(fiy1,ty);
992             fiz1             = _mm_add_ps(fiz1,tz);
993
994             fjx0             = _mm_add_ps(fjx0,tx);
995             fjy0             = _mm_add_ps(fjy0,ty);
996             fjz0             = _mm_add_ps(fjz0,tz);
997
998             /**************************
999              * CALCULATE INTERACTIONS *
1000              **************************/
1001
1002             r11              = _mm_mul_ps(rsq11,rinv11);
1003             r11              = _mm_andnot_ps(dummy_mask,r11);
1004
1005             /* Calculate table index by multiplying r with table scale and truncate to integer */
1006             rt               = _mm_mul_ps(r11,vftabscale);
1007             vfitab           = _mm_cvttps_epi32(rt);
1008             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1009             vfitab           = _mm_slli_epi32(vfitab,2);
1010
1011             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1012             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1013             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1014             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1015             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1016             _MM_TRANSPOSE4_PS(Y,F,G,H);
1017             Heps             = _mm_mul_ps(vfeps,H);
1018             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1019             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1020             velec            = _mm_mul_ps(qq11,VV);
1021             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1022             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1023
1024             /* Update potential sum for this i atom from the interaction with this j atom. */
1025             velec            = _mm_andnot_ps(dummy_mask,velec);
1026             velecsum         = _mm_add_ps(velecsum,velec);
1027
1028             fscal            = felec;
1029
1030             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1031
1032             /* Calculate temporary vectorial force */
1033             tx               = _mm_mul_ps(fscal,dx11);
1034             ty               = _mm_mul_ps(fscal,dy11);
1035             tz               = _mm_mul_ps(fscal,dz11);
1036
1037             /* Update vectorial force */
1038             fix1             = _mm_add_ps(fix1,tx);
1039             fiy1             = _mm_add_ps(fiy1,ty);
1040             fiz1             = _mm_add_ps(fiz1,tz);
1041
1042             fjx1             = _mm_add_ps(fjx1,tx);
1043             fjy1             = _mm_add_ps(fjy1,ty);
1044             fjz1             = _mm_add_ps(fjz1,tz);
1045
1046             /**************************
1047              * CALCULATE INTERACTIONS *
1048              **************************/
1049
1050             r12              = _mm_mul_ps(rsq12,rinv12);
1051             r12              = _mm_andnot_ps(dummy_mask,r12);
1052
1053             /* Calculate table index by multiplying r with table scale and truncate to integer */
1054             rt               = _mm_mul_ps(r12,vftabscale);
1055             vfitab           = _mm_cvttps_epi32(rt);
1056             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1057             vfitab           = _mm_slli_epi32(vfitab,2);
1058
1059             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1060             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1061             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1062             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1063             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1064             _MM_TRANSPOSE4_PS(Y,F,G,H);
1065             Heps             = _mm_mul_ps(vfeps,H);
1066             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1067             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1068             velec            = _mm_mul_ps(qq12,VV);
1069             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1070             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1071
1072             /* Update potential sum for this i atom from the interaction with this j atom. */
1073             velec            = _mm_andnot_ps(dummy_mask,velec);
1074             velecsum         = _mm_add_ps(velecsum,velec);
1075
1076             fscal            = felec;
1077
1078             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1079
1080             /* Calculate temporary vectorial force */
1081             tx               = _mm_mul_ps(fscal,dx12);
1082             ty               = _mm_mul_ps(fscal,dy12);
1083             tz               = _mm_mul_ps(fscal,dz12);
1084
1085             /* Update vectorial force */
1086             fix1             = _mm_add_ps(fix1,tx);
1087             fiy1             = _mm_add_ps(fiy1,ty);
1088             fiz1             = _mm_add_ps(fiz1,tz);
1089
1090             fjx2             = _mm_add_ps(fjx2,tx);
1091             fjy2             = _mm_add_ps(fjy2,ty);
1092             fjz2             = _mm_add_ps(fjz2,tz);
1093
1094             /**************************
1095              * CALCULATE INTERACTIONS *
1096              **************************/
1097
1098             r20              = _mm_mul_ps(rsq20,rinv20);
1099             r20              = _mm_andnot_ps(dummy_mask,r20);
1100
1101             /* Calculate table index by multiplying r with table scale and truncate to integer */
1102             rt               = _mm_mul_ps(r20,vftabscale);
1103             vfitab           = _mm_cvttps_epi32(rt);
1104             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1105             vfitab           = _mm_slli_epi32(vfitab,2);
1106
1107             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1108             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1109             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1110             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1111             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1112             _MM_TRANSPOSE4_PS(Y,F,G,H);
1113             Heps             = _mm_mul_ps(vfeps,H);
1114             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1115             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1116             velec            = _mm_mul_ps(qq20,VV);
1117             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1118             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1119
1120             /* Update potential sum for this i atom from the interaction with this j atom. */
1121             velec            = _mm_andnot_ps(dummy_mask,velec);
1122             velecsum         = _mm_add_ps(velecsum,velec);
1123
1124             fscal            = felec;
1125
1126             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1127
1128             /* Calculate temporary vectorial force */
1129             tx               = _mm_mul_ps(fscal,dx20);
1130             ty               = _mm_mul_ps(fscal,dy20);
1131             tz               = _mm_mul_ps(fscal,dz20);
1132
1133             /* Update vectorial force */
1134             fix2             = _mm_add_ps(fix2,tx);
1135             fiy2             = _mm_add_ps(fiy2,ty);
1136             fiz2             = _mm_add_ps(fiz2,tz);
1137
1138             fjx0             = _mm_add_ps(fjx0,tx);
1139             fjy0             = _mm_add_ps(fjy0,ty);
1140             fjz0             = _mm_add_ps(fjz0,tz);
1141
1142             /**************************
1143              * CALCULATE INTERACTIONS *
1144              **************************/
1145
1146             r21              = _mm_mul_ps(rsq21,rinv21);
1147             r21              = _mm_andnot_ps(dummy_mask,r21);
1148
1149             /* Calculate table index by multiplying r with table scale and truncate to integer */
1150             rt               = _mm_mul_ps(r21,vftabscale);
1151             vfitab           = _mm_cvttps_epi32(rt);
1152             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1153             vfitab           = _mm_slli_epi32(vfitab,2);
1154
1155             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1156             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1157             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1158             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1159             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1160             _MM_TRANSPOSE4_PS(Y,F,G,H);
1161             Heps             = _mm_mul_ps(vfeps,H);
1162             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1163             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1164             velec            = _mm_mul_ps(qq21,VV);
1165             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1166             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1167
1168             /* Update potential sum for this i atom from the interaction with this j atom. */
1169             velec            = _mm_andnot_ps(dummy_mask,velec);
1170             velecsum         = _mm_add_ps(velecsum,velec);
1171
1172             fscal            = felec;
1173
1174             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1175
1176             /* Calculate temporary vectorial force */
1177             tx               = _mm_mul_ps(fscal,dx21);
1178             ty               = _mm_mul_ps(fscal,dy21);
1179             tz               = _mm_mul_ps(fscal,dz21);
1180
1181             /* Update vectorial force */
1182             fix2             = _mm_add_ps(fix2,tx);
1183             fiy2             = _mm_add_ps(fiy2,ty);
1184             fiz2             = _mm_add_ps(fiz2,tz);
1185
1186             fjx1             = _mm_add_ps(fjx1,tx);
1187             fjy1             = _mm_add_ps(fjy1,ty);
1188             fjz1             = _mm_add_ps(fjz1,tz);
1189
1190             /**************************
1191              * CALCULATE INTERACTIONS *
1192              **************************/
1193
1194             r22              = _mm_mul_ps(rsq22,rinv22);
1195             r22              = _mm_andnot_ps(dummy_mask,r22);
1196
1197             /* Calculate table index by multiplying r with table scale and truncate to integer */
1198             rt               = _mm_mul_ps(r22,vftabscale);
1199             vfitab           = _mm_cvttps_epi32(rt);
1200             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1201             vfitab           = _mm_slli_epi32(vfitab,2);
1202
1203             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1204             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1205             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1206             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1207             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1208             _MM_TRANSPOSE4_PS(Y,F,G,H);
1209             Heps             = _mm_mul_ps(vfeps,H);
1210             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1211             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1212             velec            = _mm_mul_ps(qq22,VV);
1213             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1214             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1215
1216             /* Update potential sum for this i atom from the interaction with this j atom. */
1217             velec            = _mm_andnot_ps(dummy_mask,velec);
1218             velecsum         = _mm_add_ps(velecsum,velec);
1219
1220             fscal            = felec;
1221
1222             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1223
1224             /* Calculate temporary vectorial force */
1225             tx               = _mm_mul_ps(fscal,dx22);
1226             ty               = _mm_mul_ps(fscal,dy22);
1227             tz               = _mm_mul_ps(fscal,dz22);
1228
1229             /* Update vectorial force */
1230             fix2             = _mm_add_ps(fix2,tx);
1231             fiy2             = _mm_add_ps(fiy2,ty);
1232             fiz2             = _mm_add_ps(fiz2,tz);
1233
1234             fjx2             = _mm_add_ps(fjx2,tx);
1235             fjy2             = _mm_add_ps(fjy2,ty);
1236             fjz2             = _mm_add_ps(fjz2,tz);
1237
1238             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1239             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1240             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1241             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1242
1243             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1244                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1245
1246             /* Inner loop uses 409 flops */
1247         }
1248
1249         /* End of innermost loop */
1250
1251         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1252                                               f+i_coord_offset,fshift+i_shift_offset);
1253
1254         ggid                        = gid[iidx];
1255         /* Update potential energies */
1256         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1257         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1258
1259         /* Increment number of inner iterations */
1260         inneriter                  += j_index_end - j_index_start;
1261
1262         /* Outer loop uses 20 flops */
1263     }
1264
1265     /* Increment number of outer iterations */
1266     outeriter        += nri;
1267
1268     /* Update outer/inner flops */
1269
1270     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1271 }
1272 /*
1273  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_single
1274  * Electrostatics interaction: CubicSplineTable
1275  * VdW interaction:            LennardJones
1276  * Geometry:                   Water3-Water3
1277  * Calculate force/pot:        Force
1278  */
1279 void
1280 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_single
1281                     (t_nblist                    * gmx_restrict       nlist,
1282                      rvec                        * gmx_restrict          xx,
1283                      rvec                        * gmx_restrict          ff,
1284                      t_forcerec                  * gmx_restrict          fr,
1285                      t_mdatoms                   * gmx_restrict     mdatoms,
1286                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1287                      t_nrnb                      * gmx_restrict        nrnb)
1288 {
1289     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1290      * just 0 for non-waters.
1291      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1292      * jnr indices corresponding to data put in the four positions in the SIMD register.
1293      */
1294     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1295     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1296     int              jnrA,jnrB,jnrC,jnrD;
1297     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1298     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1299     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1300     real             rcutoff_scalar;
1301     real             *shiftvec,*fshift,*x,*f;
1302     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1303     real             scratch[4*DIM];
1304     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1305     int              vdwioffset0;
1306     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1307     int              vdwioffset1;
1308     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1309     int              vdwioffset2;
1310     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1311     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1312     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1313     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1314     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1315     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1316     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1317     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1318     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1319     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1320     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1321     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1322     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1323     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1324     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1325     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1326     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1327     real             *charge;
1328     int              nvdwtype;
1329     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1330     int              *vdwtype;
1331     real             *vdwparam;
1332     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1333     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1334     __m128i          vfitab;
1335     __m128i          ifour       = _mm_set1_epi32(4);
1336     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1337     real             *vftab;
1338     __m128           dummy_mask,cutoff_mask;
1339     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1340     __m128           one     = _mm_set1_ps(1.0);
1341     __m128           two     = _mm_set1_ps(2.0);
1342     x                = xx[0];
1343     f                = ff[0];
1344
1345     nri              = nlist->nri;
1346     iinr             = nlist->iinr;
1347     jindex           = nlist->jindex;
1348     jjnr             = nlist->jjnr;
1349     shiftidx         = nlist->shift;
1350     gid              = nlist->gid;
1351     shiftvec         = fr->shift_vec[0];
1352     fshift           = fr->fshift[0];
1353     facel            = _mm_set1_ps(fr->epsfac);
1354     charge           = mdatoms->chargeA;
1355     nvdwtype         = fr->ntype;
1356     vdwparam         = fr->nbfp;
1357     vdwtype          = mdatoms->typeA;
1358
1359     vftab            = kernel_data->table_elec->data;
1360     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
1361
1362     /* Setup water-specific parameters */
1363     inr              = nlist->iinr[0];
1364     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1365     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1366     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1367     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1368
1369     jq0              = _mm_set1_ps(charge[inr+0]);
1370     jq1              = _mm_set1_ps(charge[inr+1]);
1371     jq2              = _mm_set1_ps(charge[inr+2]);
1372     vdwjidx0A        = 2*vdwtype[inr+0];
1373     qq00             = _mm_mul_ps(iq0,jq0);
1374     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1375     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1376     qq01             = _mm_mul_ps(iq0,jq1);
1377     qq02             = _mm_mul_ps(iq0,jq2);
1378     qq10             = _mm_mul_ps(iq1,jq0);
1379     qq11             = _mm_mul_ps(iq1,jq1);
1380     qq12             = _mm_mul_ps(iq1,jq2);
1381     qq20             = _mm_mul_ps(iq2,jq0);
1382     qq21             = _mm_mul_ps(iq2,jq1);
1383     qq22             = _mm_mul_ps(iq2,jq2);
1384
1385     /* Avoid stupid compiler warnings */
1386     jnrA = jnrB = jnrC = jnrD = 0;
1387     j_coord_offsetA = 0;
1388     j_coord_offsetB = 0;
1389     j_coord_offsetC = 0;
1390     j_coord_offsetD = 0;
1391
1392     outeriter        = 0;
1393     inneriter        = 0;
1394
1395     for(iidx=0;iidx<4*DIM;iidx++)
1396     {
1397         scratch[iidx] = 0.0;
1398     }
1399
1400     /* Start outer loop over neighborlists */
1401     for(iidx=0; iidx<nri; iidx++)
1402     {
1403         /* Load shift vector for this list */
1404         i_shift_offset   = DIM*shiftidx[iidx];
1405
1406         /* Load limits for loop over neighbors */
1407         j_index_start    = jindex[iidx];
1408         j_index_end      = jindex[iidx+1];
1409
1410         /* Get outer coordinate index */
1411         inr              = iinr[iidx];
1412         i_coord_offset   = DIM*inr;
1413
1414         /* Load i particle coords and add shift vector */
1415         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1416                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1417
1418         fix0             = _mm_setzero_ps();
1419         fiy0             = _mm_setzero_ps();
1420         fiz0             = _mm_setzero_ps();
1421         fix1             = _mm_setzero_ps();
1422         fiy1             = _mm_setzero_ps();
1423         fiz1             = _mm_setzero_ps();
1424         fix2             = _mm_setzero_ps();
1425         fiy2             = _mm_setzero_ps();
1426         fiz2             = _mm_setzero_ps();
1427
1428         /* Start inner kernel loop */
1429         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1430         {
1431
1432             /* Get j neighbor index, and coordinate index */
1433             jnrA             = jjnr[jidx];
1434             jnrB             = jjnr[jidx+1];
1435             jnrC             = jjnr[jidx+2];
1436             jnrD             = jjnr[jidx+3];
1437             j_coord_offsetA  = DIM*jnrA;
1438             j_coord_offsetB  = DIM*jnrB;
1439             j_coord_offsetC  = DIM*jnrC;
1440             j_coord_offsetD  = DIM*jnrD;
1441
1442             /* load j atom coordinates */
1443             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1444                                               x+j_coord_offsetC,x+j_coord_offsetD,
1445                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1446
1447             /* Calculate displacement vector */
1448             dx00             = _mm_sub_ps(ix0,jx0);
1449             dy00             = _mm_sub_ps(iy0,jy0);
1450             dz00             = _mm_sub_ps(iz0,jz0);
1451             dx01             = _mm_sub_ps(ix0,jx1);
1452             dy01             = _mm_sub_ps(iy0,jy1);
1453             dz01             = _mm_sub_ps(iz0,jz1);
1454             dx02             = _mm_sub_ps(ix0,jx2);
1455             dy02             = _mm_sub_ps(iy0,jy2);
1456             dz02             = _mm_sub_ps(iz0,jz2);
1457             dx10             = _mm_sub_ps(ix1,jx0);
1458             dy10             = _mm_sub_ps(iy1,jy0);
1459             dz10             = _mm_sub_ps(iz1,jz0);
1460             dx11             = _mm_sub_ps(ix1,jx1);
1461             dy11             = _mm_sub_ps(iy1,jy1);
1462             dz11             = _mm_sub_ps(iz1,jz1);
1463             dx12             = _mm_sub_ps(ix1,jx2);
1464             dy12             = _mm_sub_ps(iy1,jy2);
1465             dz12             = _mm_sub_ps(iz1,jz2);
1466             dx20             = _mm_sub_ps(ix2,jx0);
1467             dy20             = _mm_sub_ps(iy2,jy0);
1468             dz20             = _mm_sub_ps(iz2,jz0);
1469             dx21             = _mm_sub_ps(ix2,jx1);
1470             dy21             = _mm_sub_ps(iy2,jy1);
1471             dz21             = _mm_sub_ps(iz2,jz1);
1472             dx22             = _mm_sub_ps(ix2,jx2);
1473             dy22             = _mm_sub_ps(iy2,jy2);
1474             dz22             = _mm_sub_ps(iz2,jz2);
1475
1476             /* Calculate squared distance and things based on it */
1477             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1478             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1479             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1480             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1481             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1482             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1483             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1484             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1485             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1486
1487             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1488             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1489             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1490             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1491             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1492             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1493             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1494             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1495             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1496
1497             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1498
1499             fjx0             = _mm_setzero_ps();
1500             fjy0             = _mm_setzero_ps();
1501             fjz0             = _mm_setzero_ps();
1502             fjx1             = _mm_setzero_ps();
1503             fjy1             = _mm_setzero_ps();
1504             fjz1             = _mm_setzero_ps();
1505             fjx2             = _mm_setzero_ps();
1506             fjy2             = _mm_setzero_ps();
1507             fjz2             = _mm_setzero_ps();
1508
1509             /**************************
1510              * CALCULATE INTERACTIONS *
1511              **************************/
1512
1513             r00              = _mm_mul_ps(rsq00,rinv00);
1514
1515             /* Calculate table index by multiplying r with table scale and truncate to integer */
1516             rt               = _mm_mul_ps(r00,vftabscale);
1517             vfitab           = _mm_cvttps_epi32(rt);
1518             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1519             vfitab           = _mm_slli_epi32(vfitab,2);
1520
1521             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1522             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1523             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1524             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1525             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1526             _MM_TRANSPOSE4_PS(Y,F,G,H);
1527             Heps             = _mm_mul_ps(vfeps,H);
1528             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1529             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1530             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1531
1532             /* LENNARD-JONES DISPERSION/REPULSION */
1533
1534             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1535             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1536
1537             fscal            = _mm_add_ps(felec,fvdw);
1538
1539             /* Calculate temporary vectorial force */
1540             tx               = _mm_mul_ps(fscal,dx00);
1541             ty               = _mm_mul_ps(fscal,dy00);
1542             tz               = _mm_mul_ps(fscal,dz00);
1543
1544             /* Update vectorial force */
1545             fix0             = _mm_add_ps(fix0,tx);
1546             fiy0             = _mm_add_ps(fiy0,ty);
1547             fiz0             = _mm_add_ps(fiz0,tz);
1548
1549             fjx0             = _mm_add_ps(fjx0,tx);
1550             fjy0             = _mm_add_ps(fjy0,ty);
1551             fjz0             = _mm_add_ps(fjz0,tz);
1552
1553             /**************************
1554              * CALCULATE INTERACTIONS *
1555              **************************/
1556
1557             r01              = _mm_mul_ps(rsq01,rinv01);
1558
1559             /* Calculate table index by multiplying r with table scale and truncate to integer */
1560             rt               = _mm_mul_ps(r01,vftabscale);
1561             vfitab           = _mm_cvttps_epi32(rt);
1562             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1563             vfitab           = _mm_slli_epi32(vfitab,2);
1564
1565             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1566             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1567             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1568             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1569             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1570             _MM_TRANSPOSE4_PS(Y,F,G,H);
1571             Heps             = _mm_mul_ps(vfeps,H);
1572             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1573             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1574             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1575
1576             fscal            = felec;
1577
1578             /* Calculate temporary vectorial force */
1579             tx               = _mm_mul_ps(fscal,dx01);
1580             ty               = _mm_mul_ps(fscal,dy01);
1581             tz               = _mm_mul_ps(fscal,dz01);
1582
1583             /* Update vectorial force */
1584             fix0             = _mm_add_ps(fix0,tx);
1585             fiy0             = _mm_add_ps(fiy0,ty);
1586             fiz0             = _mm_add_ps(fiz0,tz);
1587
1588             fjx1             = _mm_add_ps(fjx1,tx);
1589             fjy1             = _mm_add_ps(fjy1,ty);
1590             fjz1             = _mm_add_ps(fjz1,tz);
1591
1592             /**************************
1593              * CALCULATE INTERACTIONS *
1594              **************************/
1595
1596             r02              = _mm_mul_ps(rsq02,rinv02);
1597
1598             /* Calculate table index by multiplying r with table scale and truncate to integer */
1599             rt               = _mm_mul_ps(r02,vftabscale);
1600             vfitab           = _mm_cvttps_epi32(rt);
1601             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1602             vfitab           = _mm_slli_epi32(vfitab,2);
1603
1604             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1605             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1606             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1607             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1608             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1609             _MM_TRANSPOSE4_PS(Y,F,G,H);
1610             Heps             = _mm_mul_ps(vfeps,H);
1611             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1612             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1613             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1614
1615             fscal            = felec;
1616
1617             /* Calculate temporary vectorial force */
1618             tx               = _mm_mul_ps(fscal,dx02);
1619             ty               = _mm_mul_ps(fscal,dy02);
1620             tz               = _mm_mul_ps(fscal,dz02);
1621
1622             /* Update vectorial force */
1623             fix0             = _mm_add_ps(fix0,tx);
1624             fiy0             = _mm_add_ps(fiy0,ty);
1625             fiz0             = _mm_add_ps(fiz0,tz);
1626
1627             fjx2             = _mm_add_ps(fjx2,tx);
1628             fjy2             = _mm_add_ps(fjy2,ty);
1629             fjz2             = _mm_add_ps(fjz2,tz);
1630
1631             /**************************
1632              * CALCULATE INTERACTIONS *
1633              **************************/
1634
1635             r10              = _mm_mul_ps(rsq10,rinv10);
1636
1637             /* Calculate table index by multiplying r with table scale and truncate to integer */
1638             rt               = _mm_mul_ps(r10,vftabscale);
1639             vfitab           = _mm_cvttps_epi32(rt);
1640             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1641             vfitab           = _mm_slli_epi32(vfitab,2);
1642
1643             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1644             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1645             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1646             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1647             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1648             _MM_TRANSPOSE4_PS(Y,F,G,H);
1649             Heps             = _mm_mul_ps(vfeps,H);
1650             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1651             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1652             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1653
1654             fscal            = felec;
1655
1656             /* Calculate temporary vectorial force */
1657             tx               = _mm_mul_ps(fscal,dx10);
1658             ty               = _mm_mul_ps(fscal,dy10);
1659             tz               = _mm_mul_ps(fscal,dz10);
1660
1661             /* Update vectorial force */
1662             fix1             = _mm_add_ps(fix1,tx);
1663             fiy1             = _mm_add_ps(fiy1,ty);
1664             fiz1             = _mm_add_ps(fiz1,tz);
1665
1666             fjx0             = _mm_add_ps(fjx0,tx);
1667             fjy0             = _mm_add_ps(fjy0,ty);
1668             fjz0             = _mm_add_ps(fjz0,tz);
1669
1670             /**************************
1671              * CALCULATE INTERACTIONS *
1672              **************************/
1673
1674             r11              = _mm_mul_ps(rsq11,rinv11);
1675
1676             /* Calculate table index by multiplying r with table scale and truncate to integer */
1677             rt               = _mm_mul_ps(r11,vftabscale);
1678             vfitab           = _mm_cvttps_epi32(rt);
1679             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1680             vfitab           = _mm_slli_epi32(vfitab,2);
1681
1682             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1683             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1684             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1685             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1686             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1687             _MM_TRANSPOSE4_PS(Y,F,G,H);
1688             Heps             = _mm_mul_ps(vfeps,H);
1689             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1690             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1691             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1692
1693             fscal            = felec;
1694
1695             /* Calculate temporary vectorial force */
1696             tx               = _mm_mul_ps(fscal,dx11);
1697             ty               = _mm_mul_ps(fscal,dy11);
1698             tz               = _mm_mul_ps(fscal,dz11);
1699
1700             /* Update vectorial force */
1701             fix1             = _mm_add_ps(fix1,tx);
1702             fiy1             = _mm_add_ps(fiy1,ty);
1703             fiz1             = _mm_add_ps(fiz1,tz);
1704
1705             fjx1             = _mm_add_ps(fjx1,tx);
1706             fjy1             = _mm_add_ps(fjy1,ty);
1707             fjz1             = _mm_add_ps(fjz1,tz);
1708
1709             /**************************
1710              * CALCULATE INTERACTIONS *
1711              **************************/
1712
1713             r12              = _mm_mul_ps(rsq12,rinv12);
1714
1715             /* Calculate table index by multiplying r with table scale and truncate to integer */
1716             rt               = _mm_mul_ps(r12,vftabscale);
1717             vfitab           = _mm_cvttps_epi32(rt);
1718             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1719             vfitab           = _mm_slli_epi32(vfitab,2);
1720
1721             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1722             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1723             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1724             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1725             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1726             _MM_TRANSPOSE4_PS(Y,F,G,H);
1727             Heps             = _mm_mul_ps(vfeps,H);
1728             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1729             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1730             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1731
1732             fscal            = felec;
1733
1734             /* Calculate temporary vectorial force */
1735             tx               = _mm_mul_ps(fscal,dx12);
1736             ty               = _mm_mul_ps(fscal,dy12);
1737             tz               = _mm_mul_ps(fscal,dz12);
1738
1739             /* Update vectorial force */
1740             fix1             = _mm_add_ps(fix1,tx);
1741             fiy1             = _mm_add_ps(fiy1,ty);
1742             fiz1             = _mm_add_ps(fiz1,tz);
1743
1744             fjx2             = _mm_add_ps(fjx2,tx);
1745             fjy2             = _mm_add_ps(fjy2,ty);
1746             fjz2             = _mm_add_ps(fjz2,tz);
1747
1748             /**************************
1749              * CALCULATE INTERACTIONS *
1750              **************************/
1751
1752             r20              = _mm_mul_ps(rsq20,rinv20);
1753
1754             /* Calculate table index by multiplying r with table scale and truncate to integer */
1755             rt               = _mm_mul_ps(r20,vftabscale);
1756             vfitab           = _mm_cvttps_epi32(rt);
1757             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1758             vfitab           = _mm_slli_epi32(vfitab,2);
1759
1760             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1761             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1762             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1763             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1764             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1765             _MM_TRANSPOSE4_PS(Y,F,G,H);
1766             Heps             = _mm_mul_ps(vfeps,H);
1767             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1768             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1769             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1770
1771             fscal            = felec;
1772
1773             /* Calculate temporary vectorial force */
1774             tx               = _mm_mul_ps(fscal,dx20);
1775             ty               = _mm_mul_ps(fscal,dy20);
1776             tz               = _mm_mul_ps(fscal,dz20);
1777
1778             /* Update vectorial force */
1779             fix2             = _mm_add_ps(fix2,tx);
1780             fiy2             = _mm_add_ps(fiy2,ty);
1781             fiz2             = _mm_add_ps(fiz2,tz);
1782
1783             fjx0             = _mm_add_ps(fjx0,tx);
1784             fjy0             = _mm_add_ps(fjy0,ty);
1785             fjz0             = _mm_add_ps(fjz0,tz);
1786
1787             /**************************
1788              * CALCULATE INTERACTIONS *
1789              **************************/
1790
1791             r21              = _mm_mul_ps(rsq21,rinv21);
1792
1793             /* Calculate table index by multiplying r with table scale and truncate to integer */
1794             rt               = _mm_mul_ps(r21,vftabscale);
1795             vfitab           = _mm_cvttps_epi32(rt);
1796             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1797             vfitab           = _mm_slli_epi32(vfitab,2);
1798
1799             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1800             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1801             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1802             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1803             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1804             _MM_TRANSPOSE4_PS(Y,F,G,H);
1805             Heps             = _mm_mul_ps(vfeps,H);
1806             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1807             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1808             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1809
1810             fscal            = felec;
1811
1812             /* Calculate temporary vectorial force */
1813             tx               = _mm_mul_ps(fscal,dx21);
1814             ty               = _mm_mul_ps(fscal,dy21);
1815             tz               = _mm_mul_ps(fscal,dz21);
1816
1817             /* Update vectorial force */
1818             fix2             = _mm_add_ps(fix2,tx);
1819             fiy2             = _mm_add_ps(fiy2,ty);
1820             fiz2             = _mm_add_ps(fiz2,tz);
1821
1822             fjx1             = _mm_add_ps(fjx1,tx);
1823             fjy1             = _mm_add_ps(fjy1,ty);
1824             fjz1             = _mm_add_ps(fjz1,tz);
1825
1826             /**************************
1827              * CALCULATE INTERACTIONS *
1828              **************************/
1829
1830             r22              = _mm_mul_ps(rsq22,rinv22);
1831
1832             /* Calculate table index by multiplying r with table scale and truncate to integer */
1833             rt               = _mm_mul_ps(r22,vftabscale);
1834             vfitab           = _mm_cvttps_epi32(rt);
1835             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1836             vfitab           = _mm_slli_epi32(vfitab,2);
1837
1838             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1839             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1840             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1841             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1842             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1843             _MM_TRANSPOSE4_PS(Y,F,G,H);
1844             Heps             = _mm_mul_ps(vfeps,H);
1845             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1846             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1847             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1848
1849             fscal            = felec;
1850
1851             /* Calculate temporary vectorial force */
1852             tx               = _mm_mul_ps(fscal,dx22);
1853             ty               = _mm_mul_ps(fscal,dy22);
1854             tz               = _mm_mul_ps(fscal,dz22);
1855
1856             /* Update vectorial force */
1857             fix2             = _mm_add_ps(fix2,tx);
1858             fiy2             = _mm_add_ps(fiy2,ty);
1859             fiz2             = _mm_add_ps(fiz2,tz);
1860
1861             fjx2             = _mm_add_ps(fjx2,tx);
1862             fjy2             = _mm_add_ps(fjy2,ty);
1863             fjz2             = _mm_add_ps(fjz2,tz);
1864
1865             fjptrA             = f+j_coord_offsetA;
1866             fjptrB             = f+j_coord_offsetB;
1867             fjptrC             = f+j_coord_offsetC;
1868             fjptrD             = f+j_coord_offsetD;
1869
1870             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1871                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1872
1873             /* Inner loop uses 359 flops */
1874         }
1875
1876         if(jidx<j_index_end)
1877         {
1878
1879             /* Get j neighbor index, and coordinate index */
1880             jnrlistA         = jjnr[jidx];
1881             jnrlistB         = jjnr[jidx+1];
1882             jnrlistC         = jjnr[jidx+2];
1883             jnrlistD         = jjnr[jidx+3];
1884             /* Sign of each element will be negative for non-real atoms.
1885              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1886              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1887              */
1888             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1889             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1890             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1891             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1892             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1893             j_coord_offsetA  = DIM*jnrA;
1894             j_coord_offsetB  = DIM*jnrB;
1895             j_coord_offsetC  = DIM*jnrC;
1896             j_coord_offsetD  = DIM*jnrD;
1897
1898             /* load j atom coordinates */
1899             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1900                                               x+j_coord_offsetC,x+j_coord_offsetD,
1901                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1902
1903             /* Calculate displacement vector */
1904             dx00             = _mm_sub_ps(ix0,jx0);
1905             dy00             = _mm_sub_ps(iy0,jy0);
1906             dz00             = _mm_sub_ps(iz0,jz0);
1907             dx01             = _mm_sub_ps(ix0,jx1);
1908             dy01             = _mm_sub_ps(iy0,jy1);
1909             dz01             = _mm_sub_ps(iz0,jz1);
1910             dx02             = _mm_sub_ps(ix0,jx2);
1911             dy02             = _mm_sub_ps(iy0,jy2);
1912             dz02             = _mm_sub_ps(iz0,jz2);
1913             dx10             = _mm_sub_ps(ix1,jx0);
1914             dy10             = _mm_sub_ps(iy1,jy0);
1915             dz10             = _mm_sub_ps(iz1,jz0);
1916             dx11             = _mm_sub_ps(ix1,jx1);
1917             dy11             = _mm_sub_ps(iy1,jy1);
1918             dz11             = _mm_sub_ps(iz1,jz1);
1919             dx12             = _mm_sub_ps(ix1,jx2);
1920             dy12             = _mm_sub_ps(iy1,jy2);
1921             dz12             = _mm_sub_ps(iz1,jz2);
1922             dx20             = _mm_sub_ps(ix2,jx0);
1923             dy20             = _mm_sub_ps(iy2,jy0);
1924             dz20             = _mm_sub_ps(iz2,jz0);
1925             dx21             = _mm_sub_ps(ix2,jx1);
1926             dy21             = _mm_sub_ps(iy2,jy1);
1927             dz21             = _mm_sub_ps(iz2,jz1);
1928             dx22             = _mm_sub_ps(ix2,jx2);
1929             dy22             = _mm_sub_ps(iy2,jy2);
1930             dz22             = _mm_sub_ps(iz2,jz2);
1931
1932             /* Calculate squared distance and things based on it */
1933             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1934             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1935             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1936             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1937             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1938             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1939             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1940             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1941             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1942
1943             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1944             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1945             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1946             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1947             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1948             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1949             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1950             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1951             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1952
1953             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1954
1955             fjx0             = _mm_setzero_ps();
1956             fjy0             = _mm_setzero_ps();
1957             fjz0             = _mm_setzero_ps();
1958             fjx1             = _mm_setzero_ps();
1959             fjy1             = _mm_setzero_ps();
1960             fjz1             = _mm_setzero_ps();
1961             fjx2             = _mm_setzero_ps();
1962             fjy2             = _mm_setzero_ps();
1963             fjz2             = _mm_setzero_ps();
1964
1965             /**************************
1966              * CALCULATE INTERACTIONS *
1967              **************************/
1968
1969             r00              = _mm_mul_ps(rsq00,rinv00);
1970             r00              = _mm_andnot_ps(dummy_mask,r00);
1971
1972             /* Calculate table index by multiplying r with table scale and truncate to integer */
1973             rt               = _mm_mul_ps(r00,vftabscale);
1974             vfitab           = _mm_cvttps_epi32(rt);
1975             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1976             vfitab           = _mm_slli_epi32(vfitab,2);
1977
1978             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1979             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1980             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1981             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1982             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1983             _MM_TRANSPOSE4_PS(Y,F,G,H);
1984             Heps             = _mm_mul_ps(vfeps,H);
1985             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1986             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1987             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1988
1989             /* LENNARD-JONES DISPERSION/REPULSION */
1990
1991             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1992             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1993
1994             fscal            = _mm_add_ps(felec,fvdw);
1995
1996             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1997
1998             /* Calculate temporary vectorial force */
1999             tx               = _mm_mul_ps(fscal,dx00);
2000             ty               = _mm_mul_ps(fscal,dy00);
2001             tz               = _mm_mul_ps(fscal,dz00);
2002
2003             /* Update vectorial force */
2004             fix0             = _mm_add_ps(fix0,tx);
2005             fiy0             = _mm_add_ps(fiy0,ty);
2006             fiz0             = _mm_add_ps(fiz0,tz);
2007
2008             fjx0             = _mm_add_ps(fjx0,tx);
2009             fjy0             = _mm_add_ps(fjy0,ty);
2010             fjz0             = _mm_add_ps(fjz0,tz);
2011
2012             /**************************
2013              * CALCULATE INTERACTIONS *
2014              **************************/
2015
2016             r01              = _mm_mul_ps(rsq01,rinv01);
2017             r01              = _mm_andnot_ps(dummy_mask,r01);
2018
2019             /* Calculate table index by multiplying r with table scale and truncate to integer */
2020             rt               = _mm_mul_ps(r01,vftabscale);
2021             vfitab           = _mm_cvttps_epi32(rt);
2022             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2023             vfitab           = _mm_slli_epi32(vfitab,2);
2024
2025             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2026             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2027             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2028             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2029             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2030             _MM_TRANSPOSE4_PS(Y,F,G,H);
2031             Heps             = _mm_mul_ps(vfeps,H);
2032             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2033             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2034             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
2035
2036             fscal            = felec;
2037
2038             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2039
2040             /* Calculate temporary vectorial force */
2041             tx               = _mm_mul_ps(fscal,dx01);
2042             ty               = _mm_mul_ps(fscal,dy01);
2043             tz               = _mm_mul_ps(fscal,dz01);
2044
2045             /* Update vectorial force */
2046             fix0             = _mm_add_ps(fix0,tx);
2047             fiy0             = _mm_add_ps(fiy0,ty);
2048             fiz0             = _mm_add_ps(fiz0,tz);
2049
2050             fjx1             = _mm_add_ps(fjx1,tx);
2051             fjy1             = _mm_add_ps(fjy1,ty);
2052             fjz1             = _mm_add_ps(fjz1,tz);
2053
2054             /**************************
2055              * CALCULATE INTERACTIONS *
2056              **************************/
2057
2058             r02              = _mm_mul_ps(rsq02,rinv02);
2059             r02              = _mm_andnot_ps(dummy_mask,r02);
2060
2061             /* Calculate table index by multiplying r with table scale and truncate to integer */
2062             rt               = _mm_mul_ps(r02,vftabscale);
2063             vfitab           = _mm_cvttps_epi32(rt);
2064             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2065             vfitab           = _mm_slli_epi32(vfitab,2);
2066
2067             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2068             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2069             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2070             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2071             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2072             _MM_TRANSPOSE4_PS(Y,F,G,H);
2073             Heps             = _mm_mul_ps(vfeps,H);
2074             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2075             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2076             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2077
2078             fscal            = felec;
2079
2080             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2081
2082             /* Calculate temporary vectorial force */
2083             tx               = _mm_mul_ps(fscal,dx02);
2084             ty               = _mm_mul_ps(fscal,dy02);
2085             tz               = _mm_mul_ps(fscal,dz02);
2086
2087             /* Update vectorial force */
2088             fix0             = _mm_add_ps(fix0,tx);
2089             fiy0             = _mm_add_ps(fiy0,ty);
2090             fiz0             = _mm_add_ps(fiz0,tz);
2091
2092             fjx2             = _mm_add_ps(fjx2,tx);
2093             fjy2             = _mm_add_ps(fjy2,ty);
2094             fjz2             = _mm_add_ps(fjz2,tz);
2095
2096             /**************************
2097              * CALCULATE INTERACTIONS *
2098              **************************/
2099
2100             r10              = _mm_mul_ps(rsq10,rinv10);
2101             r10              = _mm_andnot_ps(dummy_mask,r10);
2102
2103             /* Calculate table index by multiplying r with table scale and truncate to integer */
2104             rt               = _mm_mul_ps(r10,vftabscale);
2105             vfitab           = _mm_cvttps_epi32(rt);
2106             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2107             vfitab           = _mm_slli_epi32(vfitab,2);
2108
2109             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2110             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2111             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2112             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2113             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2114             _MM_TRANSPOSE4_PS(Y,F,G,H);
2115             Heps             = _mm_mul_ps(vfeps,H);
2116             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2117             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2118             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2119
2120             fscal            = felec;
2121
2122             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2123
2124             /* Calculate temporary vectorial force */
2125             tx               = _mm_mul_ps(fscal,dx10);
2126             ty               = _mm_mul_ps(fscal,dy10);
2127             tz               = _mm_mul_ps(fscal,dz10);
2128
2129             /* Update vectorial force */
2130             fix1             = _mm_add_ps(fix1,tx);
2131             fiy1             = _mm_add_ps(fiy1,ty);
2132             fiz1             = _mm_add_ps(fiz1,tz);
2133
2134             fjx0             = _mm_add_ps(fjx0,tx);
2135             fjy0             = _mm_add_ps(fjy0,ty);
2136             fjz0             = _mm_add_ps(fjz0,tz);
2137
2138             /**************************
2139              * CALCULATE INTERACTIONS *
2140              **************************/
2141
2142             r11              = _mm_mul_ps(rsq11,rinv11);
2143             r11              = _mm_andnot_ps(dummy_mask,r11);
2144
2145             /* Calculate table index by multiplying r with table scale and truncate to integer */
2146             rt               = _mm_mul_ps(r11,vftabscale);
2147             vfitab           = _mm_cvttps_epi32(rt);
2148             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2149             vfitab           = _mm_slli_epi32(vfitab,2);
2150
2151             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2152             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2153             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2154             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2155             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2156             _MM_TRANSPOSE4_PS(Y,F,G,H);
2157             Heps             = _mm_mul_ps(vfeps,H);
2158             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2159             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2160             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2161
2162             fscal            = felec;
2163
2164             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2165
2166             /* Calculate temporary vectorial force */
2167             tx               = _mm_mul_ps(fscal,dx11);
2168             ty               = _mm_mul_ps(fscal,dy11);
2169             tz               = _mm_mul_ps(fscal,dz11);
2170
2171             /* Update vectorial force */
2172             fix1             = _mm_add_ps(fix1,tx);
2173             fiy1             = _mm_add_ps(fiy1,ty);
2174             fiz1             = _mm_add_ps(fiz1,tz);
2175
2176             fjx1             = _mm_add_ps(fjx1,tx);
2177             fjy1             = _mm_add_ps(fjy1,ty);
2178             fjz1             = _mm_add_ps(fjz1,tz);
2179
2180             /**************************
2181              * CALCULATE INTERACTIONS *
2182              **************************/
2183
2184             r12              = _mm_mul_ps(rsq12,rinv12);
2185             r12              = _mm_andnot_ps(dummy_mask,r12);
2186
2187             /* Calculate table index by multiplying r with table scale and truncate to integer */
2188             rt               = _mm_mul_ps(r12,vftabscale);
2189             vfitab           = _mm_cvttps_epi32(rt);
2190             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2191             vfitab           = _mm_slli_epi32(vfitab,2);
2192
2193             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2194             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2195             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2196             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2197             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2198             _MM_TRANSPOSE4_PS(Y,F,G,H);
2199             Heps             = _mm_mul_ps(vfeps,H);
2200             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2201             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2202             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2203
2204             fscal            = felec;
2205
2206             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2207
2208             /* Calculate temporary vectorial force */
2209             tx               = _mm_mul_ps(fscal,dx12);
2210             ty               = _mm_mul_ps(fscal,dy12);
2211             tz               = _mm_mul_ps(fscal,dz12);
2212
2213             /* Update vectorial force */
2214             fix1             = _mm_add_ps(fix1,tx);
2215             fiy1             = _mm_add_ps(fiy1,ty);
2216             fiz1             = _mm_add_ps(fiz1,tz);
2217
2218             fjx2             = _mm_add_ps(fjx2,tx);
2219             fjy2             = _mm_add_ps(fjy2,ty);
2220             fjz2             = _mm_add_ps(fjz2,tz);
2221
2222             /**************************
2223              * CALCULATE INTERACTIONS *
2224              **************************/
2225
2226             r20              = _mm_mul_ps(rsq20,rinv20);
2227             r20              = _mm_andnot_ps(dummy_mask,r20);
2228
2229             /* Calculate table index by multiplying r with table scale and truncate to integer */
2230             rt               = _mm_mul_ps(r20,vftabscale);
2231             vfitab           = _mm_cvttps_epi32(rt);
2232             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2233             vfitab           = _mm_slli_epi32(vfitab,2);
2234
2235             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2236             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2237             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2238             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2239             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2240             _MM_TRANSPOSE4_PS(Y,F,G,H);
2241             Heps             = _mm_mul_ps(vfeps,H);
2242             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2243             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2244             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2245
2246             fscal            = felec;
2247
2248             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2249
2250             /* Calculate temporary vectorial force */
2251             tx               = _mm_mul_ps(fscal,dx20);
2252             ty               = _mm_mul_ps(fscal,dy20);
2253             tz               = _mm_mul_ps(fscal,dz20);
2254
2255             /* Update vectorial force */
2256             fix2             = _mm_add_ps(fix2,tx);
2257             fiy2             = _mm_add_ps(fiy2,ty);
2258             fiz2             = _mm_add_ps(fiz2,tz);
2259
2260             fjx0             = _mm_add_ps(fjx0,tx);
2261             fjy0             = _mm_add_ps(fjy0,ty);
2262             fjz0             = _mm_add_ps(fjz0,tz);
2263
2264             /**************************
2265              * CALCULATE INTERACTIONS *
2266              **************************/
2267
2268             r21              = _mm_mul_ps(rsq21,rinv21);
2269             r21              = _mm_andnot_ps(dummy_mask,r21);
2270
2271             /* Calculate table index by multiplying r with table scale and truncate to integer */
2272             rt               = _mm_mul_ps(r21,vftabscale);
2273             vfitab           = _mm_cvttps_epi32(rt);
2274             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2275             vfitab           = _mm_slli_epi32(vfitab,2);
2276
2277             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2278             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2279             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2280             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2281             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2282             _MM_TRANSPOSE4_PS(Y,F,G,H);
2283             Heps             = _mm_mul_ps(vfeps,H);
2284             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2285             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2286             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2287
2288             fscal            = felec;
2289
2290             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2291
2292             /* Calculate temporary vectorial force */
2293             tx               = _mm_mul_ps(fscal,dx21);
2294             ty               = _mm_mul_ps(fscal,dy21);
2295             tz               = _mm_mul_ps(fscal,dz21);
2296
2297             /* Update vectorial force */
2298             fix2             = _mm_add_ps(fix2,tx);
2299             fiy2             = _mm_add_ps(fiy2,ty);
2300             fiz2             = _mm_add_ps(fiz2,tz);
2301
2302             fjx1             = _mm_add_ps(fjx1,tx);
2303             fjy1             = _mm_add_ps(fjy1,ty);
2304             fjz1             = _mm_add_ps(fjz1,tz);
2305
2306             /**************************
2307              * CALCULATE INTERACTIONS *
2308              **************************/
2309
2310             r22              = _mm_mul_ps(rsq22,rinv22);
2311             r22              = _mm_andnot_ps(dummy_mask,r22);
2312
2313             /* Calculate table index by multiplying r with table scale and truncate to integer */
2314             rt               = _mm_mul_ps(r22,vftabscale);
2315             vfitab           = _mm_cvttps_epi32(rt);
2316             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2317             vfitab           = _mm_slli_epi32(vfitab,2);
2318
2319             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2320             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2321             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2322             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2323             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2324             _MM_TRANSPOSE4_PS(Y,F,G,H);
2325             Heps             = _mm_mul_ps(vfeps,H);
2326             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2327             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2328             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2329
2330             fscal            = felec;
2331
2332             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2333
2334             /* Calculate temporary vectorial force */
2335             tx               = _mm_mul_ps(fscal,dx22);
2336             ty               = _mm_mul_ps(fscal,dy22);
2337             tz               = _mm_mul_ps(fscal,dz22);
2338
2339             /* Update vectorial force */
2340             fix2             = _mm_add_ps(fix2,tx);
2341             fiy2             = _mm_add_ps(fiy2,ty);
2342             fiz2             = _mm_add_ps(fiz2,tz);
2343
2344             fjx2             = _mm_add_ps(fjx2,tx);
2345             fjy2             = _mm_add_ps(fjy2,ty);
2346             fjz2             = _mm_add_ps(fjz2,tz);
2347
2348             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2349             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2350             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2351             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2352
2353             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2354                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2355
2356             /* Inner loop uses 368 flops */
2357         }
2358
2359         /* End of innermost loop */
2360
2361         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2362                                               f+i_coord_offset,fshift+i_shift_offset);
2363
2364         /* Increment number of inner iterations */
2365         inneriter                  += j_index_end - j_index_start;
2366
2367         /* Outer loop uses 18 flops */
2368     }
2369
2370     /* Increment number of outer iterations */
2371     outeriter        += nri;
2372
2373     /* Update outer/inner flops */
2374
2375     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);
2376 }