Rename remaining GMX_ACCELERATION to GMX_CPU_ACCELERATION
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single
38  * Electrostatics interaction: Coulomb
39  * VdW interaction:            CubicSplineTable
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             shX,shY,shZ,rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
75     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
77     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
88     real             *charge;
89     int              nvdwtype;
90     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
91     int              *vdwtype;
92     real             *vdwparam;
93     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
94     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
95     __m128i          vfitab;
96     __m128i          ifour       = _mm_set1_epi32(4);
97     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
98     real             *vftab;
99     __m128           dummy_mask,cutoff_mask;
100     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
101     __m128           one     = _mm_set1_ps(1.0);
102     __m128           two     = _mm_set1_ps(2.0);
103     x                = xx[0];
104     f                = ff[0];
105
106     nri              = nlist->nri;
107     iinr             = nlist->iinr;
108     jindex           = nlist->jindex;
109     jjnr             = nlist->jjnr;
110     shiftidx         = nlist->shift;
111     gid              = nlist->gid;
112     shiftvec         = fr->shift_vec[0];
113     fshift           = fr->fshift[0];
114     facel            = _mm_set1_ps(fr->epsfac);
115     charge           = mdatoms->chargeA;
116     nvdwtype         = fr->ntype;
117     vdwparam         = fr->nbfp;
118     vdwtype          = mdatoms->typeA;
119
120     vftab            = kernel_data->table_vdw->data;
121     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
122
123     /* Setup water-specific parameters */
124     inr              = nlist->iinr[0];
125     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
126     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
127     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
128     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
129
130     jq0              = _mm_set1_ps(charge[inr+0]);
131     jq1              = _mm_set1_ps(charge[inr+1]);
132     jq2              = _mm_set1_ps(charge[inr+2]);
133     vdwjidx0A        = 2*vdwtype[inr+0];
134     qq00             = _mm_mul_ps(iq0,jq0);
135     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
136     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
137     qq01             = _mm_mul_ps(iq0,jq1);
138     qq02             = _mm_mul_ps(iq0,jq2);
139     qq10             = _mm_mul_ps(iq1,jq0);
140     qq11             = _mm_mul_ps(iq1,jq1);
141     qq12             = _mm_mul_ps(iq1,jq2);
142     qq20             = _mm_mul_ps(iq2,jq0);
143     qq21             = _mm_mul_ps(iq2,jq1);
144     qq22             = _mm_mul_ps(iq2,jq2);
145
146     /* Avoid stupid compiler warnings */
147     jnrA = jnrB = jnrC = jnrD = 0;
148     j_coord_offsetA = 0;
149     j_coord_offsetB = 0;
150     j_coord_offsetC = 0;
151     j_coord_offsetD = 0;
152
153     outeriter        = 0;
154     inneriter        = 0;
155
156     /* Start outer loop over neighborlists */
157     for(iidx=0; iidx<nri; iidx++)
158     {
159         /* Load shift vector for this list */
160         i_shift_offset   = DIM*shiftidx[iidx];
161         shX              = shiftvec[i_shift_offset+XX];
162         shY              = shiftvec[i_shift_offset+YY];
163         shZ              = shiftvec[i_shift_offset+ZZ];
164
165         /* Load limits for loop over neighbors */
166         j_index_start    = jindex[iidx];
167         j_index_end      = jindex[iidx+1];
168
169         /* Get outer coordinate index */
170         inr              = iinr[iidx];
171         i_coord_offset   = DIM*inr;
172
173         /* Load i particle coords and add shift vector */
174         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
175         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
176         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
177         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
178         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
179         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
180         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
181         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
182         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
183
184         fix0             = _mm_setzero_ps();
185         fiy0             = _mm_setzero_ps();
186         fiz0             = _mm_setzero_ps();
187         fix1             = _mm_setzero_ps();
188         fiy1             = _mm_setzero_ps();
189         fiz1             = _mm_setzero_ps();
190         fix2             = _mm_setzero_ps();
191         fiy2             = _mm_setzero_ps();
192         fiz2             = _mm_setzero_ps();
193
194         /* Reset potential sums */
195         velecsum         = _mm_setzero_ps();
196         vvdwsum          = _mm_setzero_ps();
197
198         /* Start inner kernel loop */
199         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
200         {
201
202             /* Get j neighbor index, and coordinate index */
203             jnrA             = jjnr[jidx];
204             jnrB             = jjnr[jidx+1];
205             jnrC             = jjnr[jidx+2];
206             jnrD             = jjnr[jidx+3];
207
208             j_coord_offsetA  = DIM*jnrA;
209             j_coord_offsetB  = DIM*jnrB;
210             j_coord_offsetC  = DIM*jnrC;
211             j_coord_offsetD  = DIM*jnrD;
212
213             /* load j atom coordinates */
214             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
215                                               x+j_coord_offsetC,x+j_coord_offsetD,
216                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
217
218             /* Calculate displacement vector */
219             dx00             = _mm_sub_ps(ix0,jx0);
220             dy00             = _mm_sub_ps(iy0,jy0);
221             dz00             = _mm_sub_ps(iz0,jz0);
222             dx01             = _mm_sub_ps(ix0,jx1);
223             dy01             = _mm_sub_ps(iy0,jy1);
224             dz01             = _mm_sub_ps(iz0,jz1);
225             dx02             = _mm_sub_ps(ix0,jx2);
226             dy02             = _mm_sub_ps(iy0,jy2);
227             dz02             = _mm_sub_ps(iz0,jz2);
228             dx10             = _mm_sub_ps(ix1,jx0);
229             dy10             = _mm_sub_ps(iy1,jy0);
230             dz10             = _mm_sub_ps(iz1,jz0);
231             dx11             = _mm_sub_ps(ix1,jx1);
232             dy11             = _mm_sub_ps(iy1,jy1);
233             dz11             = _mm_sub_ps(iz1,jz1);
234             dx12             = _mm_sub_ps(ix1,jx2);
235             dy12             = _mm_sub_ps(iy1,jy2);
236             dz12             = _mm_sub_ps(iz1,jz2);
237             dx20             = _mm_sub_ps(ix2,jx0);
238             dy20             = _mm_sub_ps(iy2,jy0);
239             dz20             = _mm_sub_ps(iz2,jz0);
240             dx21             = _mm_sub_ps(ix2,jx1);
241             dy21             = _mm_sub_ps(iy2,jy1);
242             dz21             = _mm_sub_ps(iz2,jz1);
243             dx22             = _mm_sub_ps(ix2,jx2);
244             dy22             = _mm_sub_ps(iy2,jy2);
245             dz22             = _mm_sub_ps(iz2,jz2);
246
247             /* Calculate squared distance and things based on it */
248             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
249             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
250             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
251             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
252             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
253             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
254             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
255             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
256             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
257
258             rinv00           = gmx_mm_invsqrt_ps(rsq00);
259             rinv01           = gmx_mm_invsqrt_ps(rsq01);
260             rinv02           = gmx_mm_invsqrt_ps(rsq02);
261             rinv10           = gmx_mm_invsqrt_ps(rsq10);
262             rinv11           = gmx_mm_invsqrt_ps(rsq11);
263             rinv12           = gmx_mm_invsqrt_ps(rsq12);
264             rinv20           = gmx_mm_invsqrt_ps(rsq20);
265             rinv21           = gmx_mm_invsqrt_ps(rsq21);
266             rinv22           = gmx_mm_invsqrt_ps(rsq22);
267
268             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
269             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
270             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
271             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
272             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
273             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
274             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
275             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
276             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
277
278             fjx0             = _mm_setzero_ps();
279             fjy0             = _mm_setzero_ps();
280             fjz0             = _mm_setzero_ps();
281             fjx1             = _mm_setzero_ps();
282             fjy1             = _mm_setzero_ps();
283             fjz1             = _mm_setzero_ps();
284             fjx2             = _mm_setzero_ps();
285             fjy2             = _mm_setzero_ps();
286             fjz2             = _mm_setzero_ps();
287
288             /**************************
289              * CALCULATE INTERACTIONS *
290              **************************/
291
292             r00              = _mm_mul_ps(rsq00,rinv00);
293
294             /* Calculate table index by multiplying r with table scale and truncate to integer */
295             rt               = _mm_mul_ps(r00,vftabscale);
296             vfitab           = _mm_cvttps_epi32(rt);
297             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
298             vfitab           = _mm_slli_epi32(vfitab,3);
299
300             /* COULOMB ELECTROSTATICS */
301             velec            = _mm_mul_ps(qq00,rinv00);
302             felec            = _mm_mul_ps(velec,rinvsq00);
303
304             /* CUBIC SPLINE TABLE DISPERSION */
305             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
306             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
307             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
308             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
309             _MM_TRANSPOSE4_PS(Y,F,G,H);
310             Heps             = _mm_mul_ps(vfeps,H);
311             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
312             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
313             vvdw6            = _mm_mul_ps(c6_00,VV);
314             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
315             fvdw6            = _mm_mul_ps(c6_00,FF);
316
317             /* CUBIC SPLINE TABLE REPULSION */
318             vfitab           = _mm_add_epi32(vfitab,ifour);
319             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
320             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
321             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
322             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
323             _MM_TRANSPOSE4_PS(Y,F,G,H);
324             Heps             = _mm_mul_ps(vfeps,H);
325             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
326             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
327             vvdw12           = _mm_mul_ps(c12_00,VV);
328             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
329             fvdw12           = _mm_mul_ps(c12_00,FF);
330             vvdw             = _mm_add_ps(vvdw12,vvdw6);
331             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
332
333             /* Update potential sum for this i atom from the interaction with this j atom. */
334             velecsum         = _mm_add_ps(velecsum,velec);
335             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
336
337             fscal            = _mm_add_ps(felec,fvdw);
338
339             /* Calculate temporary vectorial force */
340             tx               = _mm_mul_ps(fscal,dx00);
341             ty               = _mm_mul_ps(fscal,dy00);
342             tz               = _mm_mul_ps(fscal,dz00);
343
344             /* Update vectorial force */
345             fix0             = _mm_add_ps(fix0,tx);
346             fiy0             = _mm_add_ps(fiy0,ty);
347             fiz0             = _mm_add_ps(fiz0,tz);
348
349             fjx0             = _mm_add_ps(fjx0,tx);
350             fjy0             = _mm_add_ps(fjy0,ty);
351             fjz0             = _mm_add_ps(fjz0,tz);
352
353             /**************************
354              * CALCULATE INTERACTIONS *
355              **************************/
356
357             /* COULOMB ELECTROSTATICS */
358             velec            = _mm_mul_ps(qq01,rinv01);
359             felec            = _mm_mul_ps(velec,rinvsq01);
360
361             /* Update potential sum for this i atom from the interaction with this j atom. */
362             velecsum         = _mm_add_ps(velecsum,velec);
363
364             fscal            = felec;
365
366             /* Calculate temporary vectorial force */
367             tx               = _mm_mul_ps(fscal,dx01);
368             ty               = _mm_mul_ps(fscal,dy01);
369             tz               = _mm_mul_ps(fscal,dz01);
370
371             /* Update vectorial force */
372             fix0             = _mm_add_ps(fix0,tx);
373             fiy0             = _mm_add_ps(fiy0,ty);
374             fiz0             = _mm_add_ps(fiz0,tz);
375
376             fjx1             = _mm_add_ps(fjx1,tx);
377             fjy1             = _mm_add_ps(fjy1,ty);
378             fjz1             = _mm_add_ps(fjz1,tz);
379
380             /**************************
381              * CALCULATE INTERACTIONS *
382              **************************/
383
384             /* COULOMB ELECTROSTATICS */
385             velec            = _mm_mul_ps(qq02,rinv02);
386             felec            = _mm_mul_ps(velec,rinvsq02);
387
388             /* Update potential sum for this i atom from the interaction with this j atom. */
389             velecsum         = _mm_add_ps(velecsum,velec);
390
391             fscal            = felec;
392
393             /* Calculate temporary vectorial force */
394             tx               = _mm_mul_ps(fscal,dx02);
395             ty               = _mm_mul_ps(fscal,dy02);
396             tz               = _mm_mul_ps(fscal,dz02);
397
398             /* Update vectorial force */
399             fix0             = _mm_add_ps(fix0,tx);
400             fiy0             = _mm_add_ps(fiy0,ty);
401             fiz0             = _mm_add_ps(fiz0,tz);
402
403             fjx2             = _mm_add_ps(fjx2,tx);
404             fjy2             = _mm_add_ps(fjy2,ty);
405             fjz2             = _mm_add_ps(fjz2,tz);
406
407             /**************************
408              * CALCULATE INTERACTIONS *
409              **************************/
410
411             /* COULOMB ELECTROSTATICS */
412             velec            = _mm_mul_ps(qq10,rinv10);
413             felec            = _mm_mul_ps(velec,rinvsq10);
414
415             /* Update potential sum for this i atom from the interaction with this j atom. */
416             velecsum         = _mm_add_ps(velecsum,velec);
417
418             fscal            = felec;
419
420             /* Calculate temporary vectorial force */
421             tx               = _mm_mul_ps(fscal,dx10);
422             ty               = _mm_mul_ps(fscal,dy10);
423             tz               = _mm_mul_ps(fscal,dz10);
424
425             /* Update vectorial force */
426             fix1             = _mm_add_ps(fix1,tx);
427             fiy1             = _mm_add_ps(fiy1,ty);
428             fiz1             = _mm_add_ps(fiz1,tz);
429
430             fjx0             = _mm_add_ps(fjx0,tx);
431             fjy0             = _mm_add_ps(fjy0,ty);
432             fjz0             = _mm_add_ps(fjz0,tz);
433
434             /**************************
435              * CALCULATE INTERACTIONS *
436              **************************/
437
438             /* COULOMB ELECTROSTATICS */
439             velec            = _mm_mul_ps(qq11,rinv11);
440             felec            = _mm_mul_ps(velec,rinvsq11);
441
442             /* Update potential sum for this i atom from the interaction with this j atom. */
443             velecsum         = _mm_add_ps(velecsum,velec);
444
445             fscal            = felec;
446
447             /* Calculate temporary vectorial force */
448             tx               = _mm_mul_ps(fscal,dx11);
449             ty               = _mm_mul_ps(fscal,dy11);
450             tz               = _mm_mul_ps(fscal,dz11);
451
452             /* Update vectorial force */
453             fix1             = _mm_add_ps(fix1,tx);
454             fiy1             = _mm_add_ps(fiy1,ty);
455             fiz1             = _mm_add_ps(fiz1,tz);
456
457             fjx1             = _mm_add_ps(fjx1,tx);
458             fjy1             = _mm_add_ps(fjy1,ty);
459             fjz1             = _mm_add_ps(fjz1,tz);
460
461             /**************************
462              * CALCULATE INTERACTIONS *
463              **************************/
464
465             /* COULOMB ELECTROSTATICS */
466             velec            = _mm_mul_ps(qq12,rinv12);
467             felec            = _mm_mul_ps(velec,rinvsq12);
468
469             /* Update potential sum for this i atom from the interaction with this j atom. */
470             velecsum         = _mm_add_ps(velecsum,velec);
471
472             fscal            = felec;
473
474             /* Calculate temporary vectorial force */
475             tx               = _mm_mul_ps(fscal,dx12);
476             ty               = _mm_mul_ps(fscal,dy12);
477             tz               = _mm_mul_ps(fscal,dz12);
478
479             /* Update vectorial force */
480             fix1             = _mm_add_ps(fix1,tx);
481             fiy1             = _mm_add_ps(fiy1,ty);
482             fiz1             = _mm_add_ps(fiz1,tz);
483
484             fjx2             = _mm_add_ps(fjx2,tx);
485             fjy2             = _mm_add_ps(fjy2,ty);
486             fjz2             = _mm_add_ps(fjz2,tz);
487
488             /**************************
489              * CALCULATE INTERACTIONS *
490              **************************/
491
492             /* COULOMB ELECTROSTATICS */
493             velec            = _mm_mul_ps(qq20,rinv20);
494             felec            = _mm_mul_ps(velec,rinvsq20);
495
496             /* Update potential sum for this i atom from the interaction with this j atom. */
497             velecsum         = _mm_add_ps(velecsum,velec);
498
499             fscal            = felec;
500
501             /* Calculate temporary vectorial force */
502             tx               = _mm_mul_ps(fscal,dx20);
503             ty               = _mm_mul_ps(fscal,dy20);
504             tz               = _mm_mul_ps(fscal,dz20);
505
506             /* Update vectorial force */
507             fix2             = _mm_add_ps(fix2,tx);
508             fiy2             = _mm_add_ps(fiy2,ty);
509             fiz2             = _mm_add_ps(fiz2,tz);
510
511             fjx0             = _mm_add_ps(fjx0,tx);
512             fjy0             = _mm_add_ps(fjy0,ty);
513             fjz0             = _mm_add_ps(fjz0,tz);
514
515             /**************************
516              * CALCULATE INTERACTIONS *
517              **************************/
518
519             /* COULOMB ELECTROSTATICS */
520             velec            = _mm_mul_ps(qq21,rinv21);
521             felec            = _mm_mul_ps(velec,rinvsq21);
522
523             /* Update potential sum for this i atom from the interaction with this j atom. */
524             velecsum         = _mm_add_ps(velecsum,velec);
525
526             fscal            = felec;
527
528             /* Calculate temporary vectorial force */
529             tx               = _mm_mul_ps(fscal,dx21);
530             ty               = _mm_mul_ps(fscal,dy21);
531             tz               = _mm_mul_ps(fscal,dz21);
532
533             /* Update vectorial force */
534             fix2             = _mm_add_ps(fix2,tx);
535             fiy2             = _mm_add_ps(fiy2,ty);
536             fiz2             = _mm_add_ps(fiz2,tz);
537
538             fjx1             = _mm_add_ps(fjx1,tx);
539             fjy1             = _mm_add_ps(fjy1,ty);
540             fjz1             = _mm_add_ps(fjz1,tz);
541
542             /**************************
543              * CALCULATE INTERACTIONS *
544              **************************/
545
546             /* COULOMB ELECTROSTATICS */
547             velec            = _mm_mul_ps(qq22,rinv22);
548             felec            = _mm_mul_ps(velec,rinvsq22);
549
550             /* Update potential sum for this i atom from the interaction with this j atom. */
551             velecsum         = _mm_add_ps(velecsum,velec);
552
553             fscal            = felec;
554
555             /* Calculate temporary vectorial force */
556             tx               = _mm_mul_ps(fscal,dx22);
557             ty               = _mm_mul_ps(fscal,dy22);
558             tz               = _mm_mul_ps(fscal,dz22);
559
560             /* Update vectorial force */
561             fix2             = _mm_add_ps(fix2,tx);
562             fiy2             = _mm_add_ps(fiy2,ty);
563             fiz2             = _mm_add_ps(fiz2,tz);
564
565             fjx2             = _mm_add_ps(fjx2,tx);
566             fjy2             = _mm_add_ps(fjy2,ty);
567             fjz2             = _mm_add_ps(fjz2,tz);
568
569             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
570                                                    f+j_coord_offsetC,f+j_coord_offsetD,
571                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
572
573             /* Inner loop uses 287 flops */
574         }
575
576         if(jidx<j_index_end)
577         {
578
579             /* Get j neighbor index, and coordinate index */
580             jnrA             = jjnr[jidx];
581             jnrB             = jjnr[jidx+1];
582             jnrC             = jjnr[jidx+2];
583             jnrD             = jjnr[jidx+3];
584
585             /* Sign of each element will be negative for non-real atoms.
586              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
587              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
588              */
589             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
590             jnrA       = (jnrA>=0) ? jnrA : 0;
591             jnrB       = (jnrB>=0) ? jnrB : 0;
592             jnrC       = (jnrC>=0) ? jnrC : 0;
593             jnrD       = (jnrD>=0) ? jnrD : 0;
594
595             j_coord_offsetA  = DIM*jnrA;
596             j_coord_offsetB  = DIM*jnrB;
597             j_coord_offsetC  = DIM*jnrC;
598             j_coord_offsetD  = DIM*jnrD;
599
600             /* load j atom coordinates */
601             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
602                                               x+j_coord_offsetC,x+j_coord_offsetD,
603                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
604
605             /* Calculate displacement vector */
606             dx00             = _mm_sub_ps(ix0,jx0);
607             dy00             = _mm_sub_ps(iy0,jy0);
608             dz00             = _mm_sub_ps(iz0,jz0);
609             dx01             = _mm_sub_ps(ix0,jx1);
610             dy01             = _mm_sub_ps(iy0,jy1);
611             dz01             = _mm_sub_ps(iz0,jz1);
612             dx02             = _mm_sub_ps(ix0,jx2);
613             dy02             = _mm_sub_ps(iy0,jy2);
614             dz02             = _mm_sub_ps(iz0,jz2);
615             dx10             = _mm_sub_ps(ix1,jx0);
616             dy10             = _mm_sub_ps(iy1,jy0);
617             dz10             = _mm_sub_ps(iz1,jz0);
618             dx11             = _mm_sub_ps(ix1,jx1);
619             dy11             = _mm_sub_ps(iy1,jy1);
620             dz11             = _mm_sub_ps(iz1,jz1);
621             dx12             = _mm_sub_ps(ix1,jx2);
622             dy12             = _mm_sub_ps(iy1,jy2);
623             dz12             = _mm_sub_ps(iz1,jz2);
624             dx20             = _mm_sub_ps(ix2,jx0);
625             dy20             = _mm_sub_ps(iy2,jy0);
626             dz20             = _mm_sub_ps(iz2,jz0);
627             dx21             = _mm_sub_ps(ix2,jx1);
628             dy21             = _mm_sub_ps(iy2,jy1);
629             dz21             = _mm_sub_ps(iz2,jz1);
630             dx22             = _mm_sub_ps(ix2,jx2);
631             dy22             = _mm_sub_ps(iy2,jy2);
632             dz22             = _mm_sub_ps(iz2,jz2);
633
634             /* Calculate squared distance and things based on it */
635             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
636             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
637             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
638             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
639             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
640             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
641             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
642             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
643             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
644
645             rinv00           = gmx_mm_invsqrt_ps(rsq00);
646             rinv01           = gmx_mm_invsqrt_ps(rsq01);
647             rinv02           = gmx_mm_invsqrt_ps(rsq02);
648             rinv10           = gmx_mm_invsqrt_ps(rsq10);
649             rinv11           = gmx_mm_invsqrt_ps(rsq11);
650             rinv12           = gmx_mm_invsqrt_ps(rsq12);
651             rinv20           = gmx_mm_invsqrt_ps(rsq20);
652             rinv21           = gmx_mm_invsqrt_ps(rsq21);
653             rinv22           = gmx_mm_invsqrt_ps(rsq22);
654
655             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
656             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
657             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
658             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
659             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
660             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
661             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
662             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
663             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
664
665             fjx0             = _mm_setzero_ps();
666             fjy0             = _mm_setzero_ps();
667             fjz0             = _mm_setzero_ps();
668             fjx1             = _mm_setzero_ps();
669             fjy1             = _mm_setzero_ps();
670             fjz1             = _mm_setzero_ps();
671             fjx2             = _mm_setzero_ps();
672             fjy2             = _mm_setzero_ps();
673             fjz2             = _mm_setzero_ps();
674
675             /**************************
676              * CALCULATE INTERACTIONS *
677              **************************/
678
679             r00              = _mm_mul_ps(rsq00,rinv00);
680             r00              = _mm_andnot_ps(dummy_mask,r00);
681
682             /* Calculate table index by multiplying r with table scale and truncate to integer */
683             rt               = _mm_mul_ps(r00,vftabscale);
684             vfitab           = _mm_cvttps_epi32(rt);
685             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
686             vfitab           = _mm_slli_epi32(vfitab,3);
687
688             /* COULOMB ELECTROSTATICS */
689             velec            = _mm_mul_ps(qq00,rinv00);
690             felec            = _mm_mul_ps(velec,rinvsq00);
691
692             /* CUBIC SPLINE TABLE DISPERSION */
693             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
694             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
695             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
696             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
697             _MM_TRANSPOSE4_PS(Y,F,G,H);
698             Heps             = _mm_mul_ps(vfeps,H);
699             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
700             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
701             vvdw6            = _mm_mul_ps(c6_00,VV);
702             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
703             fvdw6            = _mm_mul_ps(c6_00,FF);
704
705             /* CUBIC SPLINE TABLE REPULSION */
706             vfitab           = _mm_add_epi32(vfitab,ifour);
707             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
708             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
709             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
710             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
711             _MM_TRANSPOSE4_PS(Y,F,G,H);
712             Heps             = _mm_mul_ps(vfeps,H);
713             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
714             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
715             vvdw12           = _mm_mul_ps(c12_00,VV);
716             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
717             fvdw12           = _mm_mul_ps(c12_00,FF);
718             vvdw             = _mm_add_ps(vvdw12,vvdw6);
719             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
720
721             /* Update potential sum for this i atom from the interaction with this j atom. */
722             velec            = _mm_andnot_ps(dummy_mask,velec);
723             velecsum         = _mm_add_ps(velecsum,velec);
724             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
725             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
726
727             fscal            = _mm_add_ps(felec,fvdw);
728
729             fscal            = _mm_andnot_ps(dummy_mask,fscal);
730
731             /* Calculate temporary vectorial force */
732             tx               = _mm_mul_ps(fscal,dx00);
733             ty               = _mm_mul_ps(fscal,dy00);
734             tz               = _mm_mul_ps(fscal,dz00);
735
736             /* Update vectorial force */
737             fix0             = _mm_add_ps(fix0,tx);
738             fiy0             = _mm_add_ps(fiy0,ty);
739             fiz0             = _mm_add_ps(fiz0,tz);
740
741             fjx0             = _mm_add_ps(fjx0,tx);
742             fjy0             = _mm_add_ps(fjy0,ty);
743             fjz0             = _mm_add_ps(fjz0,tz);
744
745             /**************************
746              * CALCULATE INTERACTIONS *
747              **************************/
748
749             /* COULOMB ELECTROSTATICS */
750             velec            = _mm_mul_ps(qq01,rinv01);
751             felec            = _mm_mul_ps(velec,rinvsq01);
752
753             /* Update potential sum for this i atom from the interaction with this j atom. */
754             velec            = _mm_andnot_ps(dummy_mask,velec);
755             velecsum         = _mm_add_ps(velecsum,velec);
756
757             fscal            = felec;
758
759             fscal            = _mm_andnot_ps(dummy_mask,fscal);
760
761             /* Calculate temporary vectorial force */
762             tx               = _mm_mul_ps(fscal,dx01);
763             ty               = _mm_mul_ps(fscal,dy01);
764             tz               = _mm_mul_ps(fscal,dz01);
765
766             /* Update vectorial force */
767             fix0             = _mm_add_ps(fix0,tx);
768             fiy0             = _mm_add_ps(fiy0,ty);
769             fiz0             = _mm_add_ps(fiz0,tz);
770
771             fjx1             = _mm_add_ps(fjx1,tx);
772             fjy1             = _mm_add_ps(fjy1,ty);
773             fjz1             = _mm_add_ps(fjz1,tz);
774
775             /**************************
776              * CALCULATE INTERACTIONS *
777              **************************/
778
779             /* COULOMB ELECTROSTATICS */
780             velec            = _mm_mul_ps(qq02,rinv02);
781             felec            = _mm_mul_ps(velec,rinvsq02);
782
783             /* Update potential sum for this i atom from the interaction with this j atom. */
784             velec            = _mm_andnot_ps(dummy_mask,velec);
785             velecsum         = _mm_add_ps(velecsum,velec);
786
787             fscal            = felec;
788
789             fscal            = _mm_andnot_ps(dummy_mask,fscal);
790
791             /* Calculate temporary vectorial force */
792             tx               = _mm_mul_ps(fscal,dx02);
793             ty               = _mm_mul_ps(fscal,dy02);
794             tz               = _mm_mul_ps(fscal,dz02);
795
796             /* Update vectorial force */
797             fix0             = _mm_add_ps(fix0,tx);
798             fiy0             = _mm_add_ps(fiy0,ty);
799             fiz0             = _mm_add_ps(fiz0,tz);
800
801             fjx2             = _mm_add_ps(fjx2,tx);
802             fjy2             = _mm_add_ps(fjy2,ty);
803             fjz2             = _mm_add_ps(fjz2,tz);
804
805             /**************************
806              * CALCULATE INTERACTIONS *
807              **************************/
808
809             /* COULOMB ELECTROSTATICS */
810             velec            = _mm_mul_ps(qq10,rinv10);
811             felec            = _mm_mul_ps(velec,rinvsq10);
812
813             /* Update potential sum for this i atom from the interaction with this j atom. */
814             velec            = _mm_andnot_ps(dummy_mask,velec);
815             velecsum         = _mm_add_ps(velecsum,velec);
816
817             fscal            = felec;
818
819             fscal            = _mm_andnot_ps(dummy_mask,fscal);
820
821             /* Calculate temporary vectorial force */
822             tx               = _mm_mul_ps(fscal,dx10);
823             ty               = _mm_mul_ps(fscal,dy10);
824             tz               = _mm_mul_ps(fscal,dz10);
825
826             /* Update vectorial force */
827             fix1             = _mm_add_ps(fix1,tx);
828             fiy1             = _mm_add_ps(fiy1,ty);
829             fiz1             = _mm_add_ps(fiz1,tz);
830
831             fjx0             = _mm_add_ps(fjx0,tx);
832             fjy0             = _mm_add_ps(fjy0,ty);
833             fjz0             = _mm_add_ps(fjz0,tz);
834
835             /**************************
836              * CALCULATE INTERACTIONS *
837              **************************/
838
839             /* COULOMB ELECTROSTATICS */
840             velec            = _mm_mul_ps(qq11,rinv11);
841             felec            = _mm_mul_ps(velec,rinvsq11);
842
843             /* Update potential sum for this i atom from the interaction with this j atom. */
844             velec            = _mm_andnot_ps(dummy_mask,velec);
845             velecsum         = _mm_add_ps(velecsum,velec);
846
847             fscal            = felec;
848
849             fscal            = _mm_andnot_ps(dummy_mask,fscal);
850
851             /* Calculate temporary vectorial force */
852             tx               = _mm_mul_ps(fscal,dx11);
853             ty               = _mm_mul_ps(fscal,dy11);
854             tz               = _mm_mul_ps(fscal,dz11);
855
856             /* Update vectorial force */
857             fix1             = _mm_add_ps(fix1,tx);
858             fiy1             = _mm_add_ps(fiy1,ty);
859             fiz1             = _mm_add_ps(fiz1,tz);
860
861             fjx1             = _mm_add_ps(fjx1,tx);
862             fjy1             = _mm_add_ps(fjy1,ty);
863             fjz1             = _mm_add_ps(fjz1,tz);
864
865             /**************************
866              * CALCULATE INTERACTIONS *
867              **************************/
868
869             /* COULOMB ELECTROSTATICS */
870             velec            = _mm_mul_ps(qq12,rinv12);
871             felec            = _mm_mul_ps(velec,rinvsq12);
872
873             /* Update potential sum for this i atom from the interaction with this j atom. */
874             velec            = _mm_andnot_ps(dummy_mask,velec);
875             velecsum         = _mm_add_ps(velecsum,velec);
876
877             fscal            = felec;
878
879             fscal            = _mm_andnot_ps(dummy_mask,fscal);
880
881             /* Calculate temporary vectorial force */
882             tx               = _mm_mul_ps(fscal,dx12);
883             ty               = _mm_mul_ps(fscal,dy12);
884             tz               = _mm_mul_ps(fscal,dz12);
885
886             /* Update vectorial force */
887             fix1             = _mm_add_ps(fix1,tx);
888             fiy1             = _mm_add_ps(fiy1,ty);
889             fiz1             = _mm_add_ps(fiz1,tz);
890
891             fjx2             = _mm_add_ps(fjx2,tx);
892             fjy2             = _mm_add_ps(fjy2,ty);
893             fjz2             = _mm_add_ps(fjz2,tz);
894
895             /**************************
896              * CALCULATE INTERACTIONS *
897              **************************/
898
899             /* COULOMB ELECTROSTATICS */
900             velec            = _mm_mul_ps(qq20,rinv20);
901             felec            = _mm_mul_ps(velec,rinvsq20);
902
903             /* Update potential sum for this i atom from the interaction with this j atom. */
904             velec            = _mm_andnot_ps(dummy_mask,velec);
905             velecsum         = _mm_add_ps(velecsum,velec);
906
907             fscal            = felec;
908
909             fscal            = _mm_andnot_ps(dummy_mask,fscal);
910
911             /* Calculate temporary vectorial force */
912             tx               = _mm_mul_ps(fscal,dx20);
913             ty               = _mm_mul_ps(fscal,dy20);
914             tz               = _mm_mul_ps(fscal,dz20);
915
916             /* Update vectorial force */
917             fix2             = _mm_add_ps(fix2,tx);
918             fiy2             = _mm_add_ps(fiy2,ty);
919             fiz2             = _mm_add_ps(fiz2,tz);
920
921             fjx0             = _mm_add_ps(fjx0,tx);
922             fjy0             = _mm_add_ps(fjy0,ty);
923             fjz0             = _mm_add_ps(fjz0,tz);
924
925             /**************************
926              * CALCULATE INTERACTIONS *
927              **************************/
928
929             /* COULOMB ELECTROSTATICS */
930             velec            = _mm_mul_ps(qq21,rinv21);
931             felec            = _mm_mul_ps(velec,rinvsq21);
932
933             /* Update potential sum for this i atom from the interaction with this j atom. */
934             velec            = _mm_andnot_ps(dummy_mask,velec);
935             velecsum         = _mm_add_ps(velecsum,velec);
936
937             fscal            = felec;
938
939             fscal            = _mm_andnot_ps(dummy_mask,fscal);
940
941             /* Calculate temporary vectorial force */
942             tx               = _mm_mul_ps(fscal,dx21);
943             ty               = _mm_mul_ps(fscal,dy21);
944             tz               = _mm_mul_ps(fscal,dz21);
945
946             /* Update vectorial force */
947             fix2             = _mm_add_ps(fix2,tx);
948             fiy2             = _mm_add_ps(fiy2,ty);
949             fiz2             = _mm_add_ps(fiz2,tz);
950
951             fjx1             = _mm_add_ps(fjx1,tx);
952             fjy1             = _mm_add_ps(fjy1,ty);
953             fjz1             = _mm_add_ps(fjz1,tz);
954
955             /**************************
956              * CALCULATE INTERACTIONS *
957              **************************/
958
959             /* COULOMB ELECTROSTATICS */
960             velec            = _mm_mul_ps(qq22,rinv22);
961             felec            = _mm_mul_ps(velec,rinvsq22);
962
963             /* Update potential sum for this i atom from the interaction with this j atom. */
964             velec            = _mm_andnot_ps(dummy_mask,velec);
965             velecsum         = _mm_add_ps(velecsum,velec);
966
967             fscal            = felec;
968
969             fscal            = _mm_andnot_ps(dummy_mask,fscal);
970
971             /* Calculate temporary vectorial force */
972             tx               = _mm_mul_ps(fscal,dx22);
973             ty               = _mm_mul_ps(fscal,dy22);
974             tz               = _mm_mul_ps(fscal,dz22);
975
976             /* Update vectorial force */
977             fix2             = _mm_add_ps(fix2,tx);
978             fiy2             = _mm_add_ps(fiy2,ty);
979             fiz2             = _mm_add_ps(fiz2,tz);
980
981             fjx2             = _mm_add_ps(fjx2,tx);
982             fjy2             = _mm_add_ps(fjy2,ty);
983             fjz2             = _mm_add_ps(fjz2,tz);
984
985             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
986                                                    f+j_coord_offsetC,f+j_coord_offsetD,
987                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
988
989             /* Inner loop uses 288 flops */
990         }
991
992         /* End of innermost loop */
993
994         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
995                                               f+i_coord_offset,fshift+i_shift_offset);
996
997         ggid                        = gid[iidx];
998         /* Update potential energies */
999         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1000         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1001
1002         /* Increment number of inner iterations */
1003         inneriter                  += j_index_end - j_index_start;
1004
1005         /* Outer loop uses 29 flops */
1006     }
1007
1008     /* Increment number of outer iterations */
1009     outeriter        += nri;
1010
1011     /* Update outer/inner flops */
1012
1013     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*288);
1014 }
1015 /*
1016  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single
1017  * Electrostatics interaction: Coulomb
1018  * VdW interaction:            CubicSplineTable
1019  * Geometry:                   Water3-Water3
1020  * Calculate force/pot:        Force
1021  */
1022 void
1023 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single
1024                     (t_nblist * gmx_restrict                nlist,
1025                      rvec * gmx_restrict                    xx,
1026                      rvec * gmx_restrict                    ff,
1027                      t_forcerec * gmx_restrict              fr,
1028                      t_mdatoms * gmx_restrict               mdatoms,
1029                      nb_kernel_data_t * gmx_restrict        kernel_data,
1030                      t_nrnb * gmx_restrict                  nrnb)
1031 {
1032     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1033      * just 0 for non-waters.
1034      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1035      * jnr indices corresponding to data put in the four positions in the SIMD register.
1036      */
1037     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1038     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1039     int              jnrA,jnrB,jnrC,jnrD;
1040     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1041     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1042     real             shX,shY,shZ,rcutoff_scalar;
1043     real             *shiftvec,*fshift,*x,*f;
1044     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1045     int              vdwioffset0;
1046     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1047     int              vdwioffset1;
1048     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1049     int              vdwioffset2;
1050     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1051     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1052     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1053     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1054     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1055     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1056     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1057     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1058     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1059     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1060     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1061     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1062     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1063     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1064     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1065     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1066     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1067     real             *charge;
1068     int              nvdwtype;
1069     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1070     int              *vdwtype;
1071     real             *vdwparam;
1072     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1073     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1074     __m128i          vfitab;
1075     __m128i          ifour       = _mm_set1_epi32(4);
1076     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1077     real             *vftab;
1078     __m128           dummy_mask,cutoff_mask;
1079     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1080     __m128           one     = _mm_set1_ps(1.0);
1081     __m128           two     = _mm_set1_ps(2.0);
1082     x                = xx[0];
1083     f                = ff[0];
1084
1085     nri              = nlist->nri;
1086     iinr             = nlist->iinr;
1087     jindex           = nlist->jindex;
1088     jjnr             = nlist->jjnr;
1089     shiftidx         = nlist->shift;
1090     gid              = nlist->gid;
1091     shiftvec         = fr->shift_vec[0];
1092     fshift           = fr->fshift[0];
1093     facel            = _mm_set1_ps(fr->epsfac);
1094     charge           = mdatoms->chargeA;
1095     nvdwtype         = fr->ntype;
1096     vdwparam         = fr->nbfp;
1097     vdwtype          = mdatoms->typeA;
1098
1099     vftab            = kernel_data->table_vdw->data;
1100     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
1101
1102     /* Setup water-specific parameters */
1103     inr              = nlist->iinr[0];
1104     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1105     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1106     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1107     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1108
1109     jq0              = _mm_set1_ps(charge[inr+0]);
1110     jq1              = _mm_set1_ps(charge[inr+1]);
1111     jq2              = _mm_set1_ps(charge[inr+2]);
1112     vdwjidx0A        = 2*vdwtype[inr+0];
1113     qq00             = _mm_mul_ps(iq0,jq0);
1114     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1115     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1116     qq01             = _mm_mul_ps(iq0,jq1);
1117     qq02             = _mm_mul_ps(iq0,jq2);
1118     qq10             = _mm_mul_ps(iq1,jq0);
1119     qq11             = _mm_mul_ps(iq1,jq1);
1120     qq12             = _mm_mul_ps(iq1,jq2);
1121     qq20             = _mm_mul_ps(iq2,jq0);
1122     qq21             = _mm_mul_ps(iq2,jq1);
1123     qq22             = _mm_mul_ps(iq2,jq2);
1124
1125     /* Avoid stupid compiler warnings */
1126     jnrA = jnrB = jnrC = jnrD = 0;
1127     j_coord_offsetA = 0;
1128     j_coord_offsetB = 0;
1129     j_coord_offsetC = 0;
1130     j_coord_offsetD = 0;
1131
1132     outeriter        = 0;
1133     inneriter        = 0;
1134
1135     /* Start outer loop over neighborlists */
1136     for(iidx=0; iidx<nri; iidx++)
1137     {
1138         /* Load shift vector for this list */
1139         i_shift_offset   = DIM*shiftidx[iidx];
1140         shX              = shiftvec[i_shift_offset+XX];
1141         shY              = shiftvec[i_shift_offset+YY];
1142         shZ              = shiftvec[i_shift_offset+ZZ];
1143
1144         /* Load limits for loop over neighbors */
1145         j_index_start    = jindex[iidx];
1146         j_index_end      = jindex[iidx+1];
1147
1148         /* Get outer coordinate index */
1149         inr              = iinr[iidx];
1150         i_coord_offset   = DIM*inr;
1151
1152         /* Load i particle coords and add shift vector */
1153         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1154         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1155         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1156         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1157         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1158         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1159         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1160         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1161         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1162
1163         fix0             = _mm_setzero_ps();
1164         fiy0             = _mm_setzero_ps();
1165         fiz0             = _mm_setzero_ps();
1166         fix1             = _mm_setzero_ps();
1167         fiy1             = _mm_setzero_ps();
1168         fiz1             = _mm_setzero_ps();
1169         fix2             = _mm_setzero_ps();
1170         fiy2             = _mm_setzero_ps();
1171         fiz2             = _mm_setzero_ps();
1172
1173         /* Start inner kernel loop */
1174         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1175         {
1176
1177             /* Get j neighbor index, and coordinate index */
1178             jnrA             = jjnr[jidx];
1179             jnrB             = jjnr[jidx+1];
1180             jnrC             = jjnr[jidx+2];
1181             jnrD             = jjnr[jidx+3];
1182
1183             j_coord_offsetA  = DIM*jnrA;
1184             j_coord_offsetB  = DIM*jnrB;
1185             j_coord_offsetC  = DIM*jnrC;
1186             j_coord_offsetD  = DIM*jnrD;
1187
1188             /* load j atom coordinates */
1189             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1190                                               x+j_coord_offsetC,x+j_coord_offsetD,
1191                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1192
1193             /* Calculate displacement vector */
1194             dx00             = _mm_sub_ps(ix0,jx0);
1195             dy00             = _mm_sub_ps(iy0,jy0);
1196             dz00             = _mm_sub_ps(iz0,jz0);
1197             dx01             = _mm_sub_ps(ix0,jx1);
1198             dy01             = _mm_sub_ps(iy0,jy1);
1199             dz01             = _mm_sub_ps(iz0,jz1);
1200             dx02             = _mm_sub_ps(ix0,jx2);
1201             dy02             = _mm_sub_ps(iy0,jy2);
1202             dz02             = _mm_sub_ps(iz0,jz2);
1203             dx10             = _mm_sub_ps(ix1,jx0);
1204             dy10             = _mm_sub_ps(iy1,jy0);
1205             dz10             = _mm_sub_ps(iz1,jz0);
1206             dx11             = _mm_sub_ps(ix1,jx1);
1207             dy11             = _mm_sub_ps(iy1,jy1);
1208             dz11             = _mm_sub_ps(iz1,jz1);
1209             dx12             = _mm_sub_ps(ix1,jx2);
1210             dy12             = _mm_sub_ps(iy1,jy2);
1211             dz12             = _mm_sub_ps(iz1,jz2);
1212             dx20             = _mm_sub_ps(ix2,jx0);
1213             dy20             = _mm_sub_ps(iy2,jy0);
1214             dz20             = _mm_sub_ps(iz2,jz0);
1215             dx21             = _mm_sub_ps(ix2,jx1);
1216             dy21             = _mm_sub_ps(iy2,jy1);
1217             dz21             = _mm_sub_ps(iz2,jz1);
1218             dx22             = _mm_sub_ps(ix2,jx2);
1219             dy22             = _mm_sub_ps(iy2,jy2);
1220             dz22             = _mm_sub_ps(iz2,jz2);
1221
1222             /* Calculate squared distance and things based on it */
1223             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1224             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1225             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1226             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1227             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1228             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1229             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1230             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1231             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1232
1233             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1234             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1235             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1236             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1237             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1238             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1239             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1240             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1241             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1242
1243             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1244             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1245             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1246             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1247             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1248             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1249             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1250             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1251             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1252
1253             fjx0             = _mm_setzero_ps();
1254             fjy0             = _mm_setzero_ps();
1255             fjz0             = _mm_setzero_ps();
1256             fjx1             = _mm_setzero_ps();
1257             fjy1             = _mm_setzero_ps();
1258             fjz1             = _mm_setzero_ps();
1259             fjx2             = _mm_setzero_ps();
1260             fjy2             = _mm_setzero_ps();
1261             fjz2             = _mm_setzero_ps();
1262
1263             /**************************
1264              * CALCULATE INTERACTIONS *
1265              **************************/
1266
1267             r00              = _mm_mul_ps(rsq00,rinv00);
1268
1269             /* Calculate table index by multiplying r with table scale and truncate to integer */
1270             rt               = _mm_mul_ps(r00,vftabscale);
1271             vfitab           = _mm_cvttps_epi32(rt);
1272             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1273             vfitab           = _mm_slli_epi32(vfitab,3);
1274
1275             /* COULOMB ELECTROSTATICS */
1276             velec            = _mm_mul_ps(qq00,rinv00);
1277             felec            = _mm_mul_ps(velec,rinvsq00);
1278
1279             /* CUBIC SPLINE TABLE DISPERSION */
1280             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1281             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1282             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1283             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1284             _MM_TRANSPOSE4_PS(Y,F,G,H);
1285             Heps             = _mm_mul_ps(vfeps,H);
1286             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1287             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1288             fvdw6            = _mm_mul_ps(c6_00,FF);
1289
1290             /* CUBIC SPLINE TABLE REPULSION */
1291             vfitab           = _mm_add_epi32(vfitab,ifour);
1292             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1293             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1294             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1295             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1296             _MM_TRANSPOSE4_PS(Y,F,G,H);
1297             Heps             = _mm_mul_ps(vfeps,H);
1298             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1299             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1300             fvdw12           = _mm_mul_ps(c12_00,FF);
1301             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1302
1303             fscal            = _mm_add_ps(felec,fvdw);
1304
1305             /* Calculate temporary vectorial force */
1306             tx               = _mm_mul_ps(fscal,dx00);
1307             ty               = _mm_mul_ps(fscal,dy00);
1308             tz               = _mm_mul_ps(fscal,dz00);
1309
1310             /* Update vectorial force */
1311             fix0             = _mm_add_ps(fix0,tx);
1312             fiy0             = _mm_add_ps(fiy0,ty);
1313             fiz0             = _mm_add_ps(fiz0,tz);
1314
1315             fjx0             = _mm_add_ps(fjx0,tx);
1316             fjy0             = _mm_add_ps(fjy0,ty);
1317             fjz0             = _mm_add_ps(fjz0,tz);
1318
1319             /**************************
1320              * CALCULATE INTERACTIONS *
1321              **************************/
1322
1323             /* COULOMB ELECTROSTATICS */
1324             velec            = _mm_mul_ps(qq01,rinv01);
1325             felec            = _mm_mul_ps(velec,rinvsq01);
1326
1327             fscal            = felec;
1328
1329             /* Calculate temporary vectorial force */
1330             tx               = _mm_mul_ps(fscal,dx01);
1331             ty               = _mm_mul_ps(fscal,dy01);
1332             tz               = _mm_mul_ps(fscal,dz01);
1333
1334             /* Update vectorial force */
1335             fix0             = _mm_add_ps(fix0,tx);
1336             fiy0             = _mm_add_ps(fiy0,ty);
1337             fiz0             = _mm_add_ps(fiz0,tz);
1338
1339             fjx1             = _mm_add_ps(fjx1,tx);
1340             fjy1             = _mm_add_ps(fjy1,ty);
1341             fjz1             = _mm_add_ps(fjz1,tz);
1342
1343             /**************************
1344              * CALCULATE INTERACTIONS *
1345              **************************/
1346
1347             /* COULOMB ELECTROSTATICS */
1348             velec            = _mm_mul_ps(qq02,rinv02);
1349             felec            = _mm_mul_ps(velec,rinvsq02);
1350
1351             fscal            = felec;
1352
1353             /* Calculate temporary vectorial force */
1354             tx               = _mm_mul_ps(fscal,dx02);
1355             ty               = _mm_mul_ps(fscal,dy02);
1356             tz               = _mm_mul_ps(fscal,dz02);
1357
1358             /* Update vectorial force */
1359             fix0             = _mm_add_ps(fix0,tx);
1360             fiy0             = _mm_add_ps(fiy0,ty);
1361             fiz0             = _mm_add_ps(fiz0,tz);
1362
1363             fjx2             = _mm_add_ps(fjx2,tx);
1364             fjy2             = _mm_add_ps(fjy2,ty);
1365             fjz2             = _mm_add_ps(fjz2,tz);
1366
1367             /**************************
1368              * CALCULATE INTERACTIONS *
1369              **************************/
1370
1371             /* COULOMB ELECTROSTATICS */
1372             velec            = _mm_mul_ps(qq10,rinv10);
1373             felec            = _mm_mul_ps(velec,rinvsq10);
1374
1375             fscal            = felec;
1376
1377             /* Calculate temporary vectorial force */
1378             tx               = _mm_mul_ps(fscal,dx10);
1379             ty               = _mm_mul_ps(fscal,dy10);
1380             tz               = _mm_mul_ps(fscal,dz10);
1381
1382             /* Update vectorial force */
1383             fix1             = _mm_add_ps(fix1,tx);
1384             fiy1             = _mm_add_ps(fiy1,ty);
1385             fiz1             = _mm_add_ps(fiz1,tz);
1386
1387             fjx0             = _mm_add_ps(fjx0,tx);
1388             fjy0             = _mm_add_ps(fjy0,ty);
1389             fjz0             = _mm_add_ps(fjz0,tz);
1390
1391             /**************************
1392              * CALCULATE INTERACTIONS *
1393              **************************/
1394
1395             /* COULOMB ELECTROSTATICS */
1396             velec            = _mm_mul_ps(qq11,rinv11);
1397             felec            = _mm_mul_ps(velec,rinvsq11);
1398
1399             fscal            = felec;
1400
1401             /* Calculate temporary vectorial force */
1402             tx               = _mm_mul_ps(fscal,dx11);
1403             ty               = _mm_mul_ps(fscal,dy11);
1404             tz               = _mm_mul_ps(fscal,dz11);
1405
1406             /* Update vectorial force */
1407             fix1             = _mm_add_ps(fix1,tx);
1408             fiy1             = _mm_add_ps(fiy1,ty);
1409             fiz1             = _mm_add_ps(fiz1,tz);
1410
1411             fjx1             = _mm_add_ps(fjx1,tx);
1412             fjy1             = _mm_add_ps(fjy1,ty);
1413             fjz1             = _mm_add_ps(fjz1,tz);
1414
1415             /**************************
1416              * CALCULATE INTERACTIONS *
1417              **************************/
1418
1419             /* COULOMB ELECTROSTATICS */
1420             velec            = _mm_mul_ps(qq12,rinv12);
1421             felec            = _mm_mul_ps(velec,rinvsq12);
1422
1423             fscal            = felec;
1424
1425             /* Calculate temporary vectorial force */
1426             tx               = _mm_mul_ps(fscal,dx12);
1427             ty               = _mm_mul_ps(fscal,dy12);
1428             tz               = _mm_mul_ps(fscal,dz12);
1429
1430             /* Update vectorial force */
1431             fix1             = _mm_add_ps(fix1,tx);
1432             fiy1             = _mm_add_ps(fiy1,ty);
1433             fiz1             = _mm_add_ps(fiz1,tz);
1434
1435             fjx2             = _mm_add_ps(fjx2,tx);
1436             fjy2             = _mm_add_ps(fjy2,ty);
1437             fjz2             = _mm_add_ps(fjz2,tz);
1438
1439             /**************************
1440              * CALCULATE INTERACTIONS *
1441              **************************/
1442
1443             /* COULOMB ELECTROSTATICS */
1444             velec            = _mm_mul_ps(qq20,rinv20);
1445             felec            = _mm_mul_ps(velec,rinvsq20);
1446
1447             fscal            = felec;
1448
1449             /* Calculate temporary vectorial force */
1450             tx               = _mm_mul_ps(fscal,dx20);
1451             ty               = _mm_mul_ps(fscal,dy20);
1452             tz               = _mm_mul_ps(fscal,dz20);
1453
1454             /* Update vectorial force */
1455             fix2             = _mm_add_ps(fix2,tx);
1456             fiy2             = _mm_add_ps(fiy2,ty);
1457             fiz2             = _mm_add_ps(fiz2,tz);
1458
1459             fjx0             = _mm_add_ps(fjx0,tx);
1460             fjy0             = _mm_add_ps(fjy0,ty);
1461             fjz0             = _mm_add_ps(fjz0,tz);
1462
1463             /**************************
1464              * CALCULATE INTERACTIONS *
1465              **************************/
1466
1467             /* COULOMB ELECTROSTATICS */
1468             velec            = _mm_mul_ps(qq21,rinv21);
1469             felec            = _mm_mul_ps(velec,rinvsq21);
1470
1471             fscal            = felec;
1472
1473             /* Calculate temporary vectorial force */
1474             tx               = _mm_mul_ps(fscal,dx21);
1475             ty               = _mm_mul_ps(fscal,dy21);
1476             tz               = _mm_mul_ps(fscal,dz21);
1477
1478             /* Update vectorial force */
1479             fix2             = _mm_add_ps(fix2,tx);
1480             fiy2             = _mm_add_ps(fiy2,ty);
1481             fiz2             = _mm_add_ps(fiz2,tz);
1482
1483             fjx1             = _mm_add_ps(fjx1,tx);
1484             fjy1             = _mm_add_ps(fjy1,ty);
1485             fjz1             = _mm_add_ps(fjz1,tz);
1486
1487             /**************************
1488              * CALCULATE INTERACTIONS *
1489              **************************/
1490
1491             /* COULOMB ELECTROSTATICS */
1492             velec            = _mm_mul_ps(qq22,rinv22);
1493             felec            = _mm_mul_ps(velec,rinvsq22);
1494
1495             fscal            = felec;
1496
1497             /* Calculate temporary vectorial force */
1498             tx               = _mm_mul_ps(fscal,dx22);
1499             ty               = _mm_mul_ps(fscal,dy22);
1500             tz               = _mm_mul_ps(fscal,dz22);
1501
1502             /* Update vectorial force */
1503             fix2             = _mm_add_ps(fix2,tx);
1504             fiy2             = _mm_add_ps(fiy2,ty);
1505             fiz2             = _mm_add_ps(fiz2,tz);
1506
1507             fjx2             = _mm_add_ps(fjx2,tx);
1508             fjy2             = _mm_add_ps(fjy2,ty);
1509             fjz2             = _mm_add_ps(fjz2,tz);
1510
1511             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1512                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1513                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1514
1515             /* Inner loop uses 270 flops */
1516         }
1517
1518         if(jidx<j_index_end)
1519         {
1520
1521             /* Get j neighbor index, and coordinate index */
1522             jnrA             = jjnr[jidx];
1523             jnrB             = jjnr[jidx+1];
1524             jnrC             = jjnr[jidx+2];
1525             jnrD             = jjnr[jidx+3];
1526
1527             /* Sign of each element will be negative for non-real atoms.
1528              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1529              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1530              */
1531             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1532             jnrA       = (jnrA>=0) ? jnrA : 0;
1533             jnrB       = (jnrB>=0) ? jnrB : 0;
1534             jnrC       = (jnrC>=0) ? jnrC : 0;
1535             jnrD       = (jnrD>=0) ? jnrD : 0;
1536
1537             j_coord_offsetA  = DIM*jnrA;
1538             j_coord_offsetB  = DIM*jnrB;
1539             j_coord_offsetC  = DIM*jnrC;
1540             j_coord_offsetD  = DIM*jnrD;
1541
1542             /* load j atom coordinates */
1543             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1544                                               x+j_coord_offsetC,x+j_coord_offsetD,
1545                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1546
1547             /* Calculate displacement vector */
1548             dx00             = _mm_sub_ps(ix0,jx0);
1549             dy00             = _mm_sub_ps(iy0,jy0);
1550             dz00             = _mm_sub_ps(iz0,jz0);
1551             dx01             = _mm_sub_ps(ix0,jx1);
1552             dy01             = _mm_sub_ps(iy0,jy1);
1553             dz01             = _mm_sub_ps(iz0,jz1);
1554             dx02             = _mm_sub_ps(ix0,jx2);
1555             dy02             = _mm_sub_ps(iy0,jy2);
1556             dz02             = _mm_sub_ps(iz0,jz2);
1557             dx10             = _mm_sub_ps(ix1,jx0);
1558             dy10             = _mm_sub_ps(iy1,jy0);
1559             dz10             = _mm_sub_ps(iz1,jz0);
1560             dx11             = _mm_sub_ps(ix1,jx1);
1561             dy11             = _mm_sub_ps(iy1,jy1);
1562             dz11             = _mm_sub_ps(iz1,jz1);
1563             dx12             = _mm_sub_ps(ix1,jx2);
1564             dy12             = _mm_sub_ps(iy1,jy2);
1565             dz12             = _mm_sub_ps(iz1,jz2);
1566             dx20             = _mm_sub_ps(ix2,jx0);
1567             dy20             = _mm_sub_ps(iy2,jy0);
1568             dz20             = _mm_sub_ps(iz2,jz0);
1569             dx21             = _mm_sub_ps(ix2,jx1);
1570             dy21             = _mm_sub_ps(iy2,jy1);
1571             dz21             = _mm_sub_ps(iz2,jz1);
1572             dx22             = _mm_sub_ps(ix2,jx2);
1573             dy22             = _mm_sub_ps(iy2,jy2);
1574             dz22             = _mm_sub_ps(iz2,jz2);
1575
1576             /* Calculate squared distance and things based on it */
1577             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1578             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1579             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1580             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1581             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1582             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1583             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1584             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1585             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1586
1587             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1588             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1589             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1590             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1591             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1592             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1593             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1594             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1595             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1596
1597             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1598             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1599             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1600             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1601             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1602             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1603             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1604             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1605             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1606
1607             fjx0             = _mm_setzero_ps();
1608             fjy0             = _mm_setzero_ps();
1609             fjz0             = _mm_setzero_ps();
1610             fjx1             = _mm_setzero_ps();
1611             fjy1             = _mm_setzero_ps();
1612             fjz1             = _mm_setzero_ps();
1613             fjx2             = _mm_setzero_ps();
1614             fjy2             = _mm_setzero_ps();
1615             fjz2             = _mm_setzero_ps();
1616
1617             /**************************
1618              * CALCULATE INTERACTIONS *
1619              **************************/
1620
1621             r00              = _mm_mul_ps(rsq00,rinv00);
1622             r00              = _mm_andnot_ps(dummy_mask,r00);
1623
1624             /* Calculate table index by multiplying r with table scale and truncate to integer */
1625             rt               = _mm_mul_ps(r00,vftabscale);
1626             vfitab           = _mm_cvttps_epi32(rt);
1627             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1628             vfitab           = _mm_slli_epi32(vfitab,3);
1629
1630             /* COULOMB ELECTROSTATICS */
1631             velec            = _mm_mul_ps(qq00,rinv00);
1632             felec            = _mm_mul_ps(velec,rinvsq00);
1633
1634             /* CUBIC SPLINE TABLE DISPERSION */
1635             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1636             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1637             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1638             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1639             _MM_TRANSPOSE4_PS(Y,F,G,H);
1640             Heps             = _mm_mul_ps(vfeps,H);
1641             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1642             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1643             fvdw6            = _mm_mul_ps(c6_00,FF);
1644
1645             /* CUBIC SPLINE TABLE REPULSION */
1646             vfitab           = _mm_add_epi32(vfitab,ifour);
1647             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1648             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1649             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1650             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1651             _MM_TRANSPOSE4_PS(Y,F,G,H);
1652             Heps             = _mm_mul_ps(vfeps,H);
1653             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1654             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1655             fvdw12           = _mm_mul_ps(c12_00,FF);
1656             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1657
1658             fscal            = _mm_add_ps(felec,fvdw);
1659
1660             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1661
1662             /* Calculate temporary vectorial force */
1663             tx               = _mm_mul_ps(fscal,dx00);
1664             ty               = _mm_mul_ps(fscal,dy00);
1665             tz               = _mm_mul_ps(fscal,dz00);
1666
1667             /* Update vectorial force */
1668             fix0             = _mm_add_ps(fix0,tx);
1669             fiy0             = _mm_add_ps(fiy0,ty);
1670             fiz0             = _mm_add_ps(fiz0,tz);
1671
1672             fjx0             = _mm_add_ps(fjx0,tx);
1673             fjy0             = _mm_add_ps(fjy0,ty);
1674             fjz0             = _mm_add_ps(fjz0,tz);
1675
1676             /**************************
1677              * CALCULATE INTERACTIONS *
1678              **************************/
1679
1680             /* COULOMB ELECTROSTATICS */
1681             velec            = _mm_mul_ps(qq01,rinv01);
1682             felec            = _mm_mul_ps(velec,rinvsq01);
1683
1684             fscal            = felec;
1685
1686             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1687
1688             /* Calculate temporary vectorial force */
1689             tx               = _mm_mul_ps(fscal,dx01);
1690             ty               = _mm_mul_ps(fscal,dy01);
1691             tz               = _mm_mul_ps(fscal,dz01);
1692
1693             /* Update vectorial force */
1694             fix0             = _mm_add_ps(fix0,tx);
1695             fiy0             = _mm_add_ps(fiy0,ty);
1696             fiz0             = _mm_add_ps(fiz0,tz);
1697
1698             fjx1             = _mm_add_ps(fjx1,tx);
1699             fjy1             = _mm_add_ps(fjy1,ty);
1700             fjz1             = _mm_add_ps(fjz1,tz);
1701
1702             /**************************
1703              * CALCULATE INTERACTIONS *
1704              **************************/
1705
1706             /* COULOMB ELECTROSTATICS */
1707             velec            = _mm_mul_ps(qq02,rinv02);
1708             felec            = _mm_mul_ps(velec,rinvsq02);
1709
1710             fscal            = felec;
1711
1712             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1713
1714             /* Calculate temporary vectorial force */
1715             tx               = _mm_mul_ps(fscal,dx02);
1716             ty               = _mm_mul_ps(fscal,dy02);
1717             tz               = _mm_mul_ps(fscal,dz02);
1718
1719             /* Update vectorial force */
1720             fix0             = _mm_add_ps(fix0,tx);
1721             fiy0             = _mm_add_ps(fiy0,ty);
1722             fiz0             = _mm_add_ps(fiz0,tz);
1723
1724             fjx2             = _mm_add_ps(fjx2,tx);
1725             fjy2             = _mm_add_ps(fjy2,ty);
1726             fjz2             = _mm_add_ps(fjz2,tz);
1727
1728             /**************************
1729              * CALCULATE INTERACTIONS *
1730              **************************/
1731
1732             /* COULOMB ELECTROSTATICS */
1733             velec            = _mm_mul_ps(qq10,rinv10);
1734             felec            = _mm_mul_ps(velec,rinvsq10);
1735
1736             fscal            = felec;
1737
1738             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1739
1740             /* Calculate temporary vectorial force */
1741             tx               = _mm_mul_ps(fscal,dx10);
1742             ty               = _mm_mul_ps(fscal,dy10);
1743             tz               = _mm_mul_ps(fscal,dz10);
1744
1745             /* Update vectorial force */
1746             fix1             = _mm_add_ps(fix1,tx);
1747             fiy1             = _mm_add_ps(fiy1,ty);
1748             fiz1             = _mm_add_ps(fiz1,tz);
1749
1750             fjx0             = _mm_add_ps(fjx0,tx);
1751             fjy0             = _mm_add_ps(fjy0,ty);
1752             fjz0             = _mm_add_ps(fjz0,tz);
1753
1754             /**************************
1755              * CALCULATE INTERACTIONS *
1756              **************************/
1757
1758             /* COULOMB ELECTROSTATICS */
1759             velec            = _mm_mul_ps(qq11,rinv11);
1760             felec            = _mm_mul_ps(velec,rinvsq11);
1761
1762             fscal            = felec;
1763
1764             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1765
1766             /* Calculate temporary vectorial force */
1767             tx               = _mm_mul_ps(fscal,dx11);
1768             ty               = _mm_mul_ps(fscal,dy11);
1769             tz               = _mm_mul_ps(fscal,dz11);
1770
1771             /* Update vectorial force */
1772             fix1             = _mm_add_ps(fix1,tx);
1773             fiy1             = _mm_add_ps(fiy1,ty);
1774             fiz1             = _mm_add_ps(fiz1,tz);
1775
1776             fjx1             = _mm_add_ps(fjx1,tx);
1777             fjy1             = _mm_add_ps(fjy1,ty);
1778             fjz1             = _mm_add_ps(fjz1,tz);
1779
1780             /**************************
1781              * CALCULATE INTERACTIONS *
1782              **************************/
1783
1784             /* COULOMB ELECTROSTATICS */
1785             velec            = _mm_mul_ps(qq12,rinv12);
1786             felec            = _mm_mul_ps(velec,rinvsq12);
1787
1788             fscal            = felec;
1789
1790             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1791
1792             /* Calculate temporary vectorial force */
1793             tx               = _mm_mul_ps(fscal,dx12);
1794             ty               = _mm_mul_ps(fscal,dy12);
1795             tz               = _mm_mul_ps(fscal,dz12);
1796
1797             /* Update vectorial force */
1798             fix1             = _mm_add_ps(fix1,tx);
1799             fiy1             = _mm_add_ps(fiy1,ty);
1800             fiz1             = _mm_add_ps(fiz1,tz);
1801
1802             fjx2             = _mm_add_ps(fjx2,tx);
1803             fjy2             = _mm_add_ps(fjy2,ty);
1804             fjz2             = _mm_add_ps(fjz2,tz);
1805
1806             /**************************
1807              * CALCULATE INTERACTIONS *
1808              **************************/
1809
1810             /* COULOMB ELECTROSTATICS */
1811             velec            = _mm_mul_ps(qq20,rinv20);
1812             felec            = _mm_mul_ps(velec,rinvsq20);
1813
1814             fscal            = felec;
1815
1816             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1817
1818             /* Calculate temporary vectorial force */
1819             tx               = _mm_mul_ps(fscal,dx20);
1820             ty               = _mm_mul_ps(fscal,dy20);
1821             tz               = _mm_mul_ps(fscal,dz20);
1822
1823             /* Update vectorial force */
1824             fix2             = _mm_add_ps(fix2,tx);
1825             fiy2             = _mm_add_ps(fiy2,ty);
1826             fiz2             = _mm_add_ps(fiz2,tz);
1827
1828             fjx0             = _mm_add_ps(fjx0,tx);
1829             fjy0             = _mm_add_ps(fjy0,ty);
1830             fjz0             = _mm_add_ps(fjz0,tz);
1831
1832             /**************************
1833              * CALCULATE INTERACTIONS *
1834              **************************/
1835
1836             /* COULOMB ELECTROSTATICS */
1837             velec            = _mm_mul_ps(qq21,rinv21);
1838             felec            = _mm_mul_ps(velec,rinvsq21);
1839
1840             fscal            = felec;
1841
1842             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1843
1844             /* Calculate temporary vectorial force */
1845             tx               = _mm_mul_ps(fscal,dx21);
1846             ty               = _mm_mul_ps(fscal,dy21);
1847             tz               = _mm_mul_ps(fscal,dz21);
1848
1849             /* Update vectorial force */
1850             fix2             = _mm_add_ps(fix2,tx);
1851             fiy2             = _mm_add_ps(fiy2,ty);
1852             fiz2             = _mm_add_ps(fiz2,tz);
1853
1854             fjx1             = _mm_add_ps(fjx1,tx);
1855             fjy1             = _mm_add_ps(fjy1,ty);
1856             fjz1             = _mm_add_ps(fjz1,tz);
1857
1858             /**************************
1859              * CALCULATE INTERACTIONS *
1860              **************************/
1861
1862             /* COULOMB ELECTROSTATICS */
1863             velec            = _mm_mul_ps(qq22,rinv22);
1864             felec            = _mm_mul_ps(velec,rinvsq22);
1865
1866             fscal            = felec;
1867
1868             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1869
1870             /* Calculate temporary vectorial force */
1871             tx               = _mm_mul_ps(fscal,dx22);
1872             ty               = _mm_mul_ps(fscal,dy22);
1873             tz               = _mm_mul_ps(fscal,dz22);
1874
1875             /* Update vectorial force */
1876             fix2             = _mm_add_ps(fix2,tx);
1877             fiy2             = _mm_add_ps(fiy2,ty);
1878             fiz2             = _mm_add_ps(fiz2,tz);
1879
1880             fjx2             = _mm_add_ps(fjx2,tx);
1881             fjy2             = _mm_add_ps(fjy2,ty);
1882             fjz2             = _mm_add_ps(fjz2,tz);
1883
1884             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1885                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1886                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1887
1888             /* Inner loop uses 271 flops */
1889         }
1890
1891         /* End of innermost loop */
1892
1893         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1894                                               f+i_coord_offset,fshift+i_shift_offset);
1895
1896         /* Increment number of inner iterations */
1897         inneriter                  += j_index_end - j_index_start;
1898
1899         /* Outer loop uses 27 flops */
1900     }
1901
1902     /* Increment number of outer iterations */
1903     outeriter        += nri;
1904
1905     /* Update outer/inner flops */
1906
1907     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*271);
1908 }