Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_sse4_1_single.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_sse4_1_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse4_1_single
51  * Electrostatics interaction: Coulomb
52  * VdW interaction:            CubicSplineTable
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse4_1_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwioffset3;
89     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
97     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
109     real             *charge;
110     int              nvdwtype;
111     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112     int              *vdwtype;
113     real             *vdwparam;
114     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
115     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
116     __m128i          vfitab;
117     __m128i          ifour       = _mm_set1_epi32(4);
118     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
119     real             *vftab;
120     __m128           dummy_mask,cutoff_mask;
121     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
122     __m128           one     = _mm_set1_ps(1.0);
123     __m128           two     = _mm_set1_ps(2.0);
124     x                = xx[0];
125     f                = ff[0];
126
127     nri              = nlist->nri;
128     iinr             = nlist->iinr;
129     jindex           = nlist->jindex;
130     jjnr             = nlist->jjnr;
131     shiftidx         = nlist->shift;
132     gid              = nlist->gid;
133     shiftvec         = fr->shift_vec[0];
134     fshift           = fr->fshift[0];
135     facel            = _mm_set1_ps(fr->ic->epsfac);
136     charge           = mdatoms->chargeA;
137     nvdwtype         = fr->ntype;
138     vdwparam         = fr->nbfp;
139     vdwtype          = mdatoms->typeA;
140
141     vftab            = kernel_data->table_vdw->data;
142     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
143
144     /* Setup water-specific parameters */
145     inr              = nlist->iinr[0];
146     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
147     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
148     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
149     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
150
151     jq1              = _mm_set1_ps(charge[inr+1]);
152     jq2              = _mm_set1_ps(charge[inr+2]);
153     jq3              = _mm_set1_ps(charge[inr+3]);
154     vdwjidx0A        = 2*vdwtype[inr+0];
155     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
156     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
157     qq11             = _mm_mul_ps(iq1,jq1);
158     qq12             = _mm_mul_ps(iq1,jq2);
159     qq13             = _mm_mul_ps(iq1,jq3);
160     qq21             = _mm_mul_ps(iq2,jq1);
161     qq22             = _mm_mul_ps(iq2,jq2);
162     qq23             = _mm_mul_ps(iq2,jq3);
163     qq31             = _mm_mul_ps(iq3,jq1);
164     qq32             = _mm_mul_ps(iq3,jq2);
165     qq33             = _mm_mul_ps(iq3,jq3);
166
167     /* Avoid stupid compiler warnings */
168     jnrA = jnrB = jnrC = jnrD = 0;
169     j_coord_offsetA = 0;
170     j_coord_offsetB = 0;
171     j_coord_offsetC = 0;
172     j_coord_offsetD = 0;
173
174     outeriter        = 0;
175     inneriter        = 0;
176
177     for(iidx=0;iidx<4*DIM;iidx++)
178     {
179         scratch[iidx] = 0.0;
180     }
181
182     /* Start outer loop over neighborlists */
183     for(iidx=0; iidx<nri; iidx++)
184     {
185         /* Load shift vector for this list */
186         i_shift_offset   = DIM*shiftidx[iidx];
187
188         /* Load limits for loop over neighbors */
189         j_index_start    = jindex[iidx];
190         j_index_end      = jindex[iidx+1];
191
192         /* Get outer coordinate index */
193         inr              = iinr[iidx];
194         i_coord_offset   = DIM*inr;
195
196         /* Load i particle coords and add shift vector */
197         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
198                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
199
200         fix0             = _mm_setzero_ps();
201         fiy0             = _mm_setzero_ps();
202         fiz0             = _mm_setzero_ps();
203         fix1             = _mm_setzero_ps();
204         fiy1             = _mm_setzero_ps();
205         fiz1             = _mm_setzero_ps();
206         fix2             = _mm_setzero_ps();
207         fiy2             = _mm_setzero_ps();
208         fiz2             = _mm_setzero_ps();
209         fix3             = _mm_setzero_ps();
210         fiy3             = _mm_setzero_ps();
211         fiz3             = _mm_setzero_ps();
212
213         /* Reset potential sums */
214         velecsum         = _mm_setzero_ps();
215         vvdwsum          = _mm_setzero_ps();
216
217         /* Start inner kernel loop */
218         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
219         {
220
221             /* Get j neighbor index, and coordinate index */
222             jnrA             = jjnr[jidx];
223             jnrB             = jjnr[jidx+1];
224             jnrC             = jjnr[jidx+2];
225             jnrD             = jjnr[jidx+3];
226             j_coord_offsetA  = DIM*jnrA;
227             j_coord_offsetB  = DIM*jnrB;
228             j_coord_offsetC  = DIM*jnrC;
229             j_coord_offsetD  = DIM*jnrD;
230
231             /* load j atom coordinates */
232             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
233                                               x+j_coord_offsetC,x+j_coord_offsetD,
234                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
235                                               &jy2,&jz2,&jx3,&jy3,&jz3);
236
237             /* Calculate displacement vector */
238             dx00             = _mm_sub_ps(ix0,jx0);
239             dy00             = _mm_sub_ps(iy0,jy0);
240             dz00             = _mm_sub_ps(iz0,jz0);
241             dx11             = _mm_sub_ps(ix1,jx1);
242             dy11             = _mm_sub_ps(iy1,jy1);
243             dz11             = _mm_sub_ps(iz1,jz1);
244             dx12             = _mm_sub_ps(ix1,jx2);
245             dy12             = _mm_sub_ps(iy1,jy2);
246             dz12             = _mm_sub_ps(iz1,jz2);
247             dx13             = _mm_sub_ps(ix1,jx3);
248             dy13             = _mm_sub_ps(iy1,jy3);
249             dz13             = _mm_sub_ps(iz1,jz3);
250             dx21             = _mm_sub_ps(ix2,jx1);
251             dy21             = _mm_sub_ps(iy2,jy1);
252             dz21             = _mm_sub_ps(iz2,jz1);
253             dx22             = _mm_sub_ps(ix2,jx2);
254             dy22             = _mm_sub_ps(iy2,jy2);
255             dz22             = _mm_sub_ps(iz2,jz2);
256             dx23             = _mm_sub_ps(ix2,jx3);
257             dy23             = _mm_sub_ps(iy2,jy3);
258             dz23             = _mm_sub_ps(iz2,jz3);
259             dx31             = _mm_sub_ps(ix3,jx1);
260             dy31             = _mm_sub_ps(iy3,jy1);
261             dz31             = _mm_sub_ps(iz3,jz1);
262             dx32             = _mm_sub_ps(ix3,jx2);
263             dy32             = _mm_sub_ps(iy3,jy2);
264             dz32             = _mm_sub_ps(iz3,jz2);
265             dx33             = _mm_sub_ps(ix3,jx3);
266             dy33             = _mm_sub_ps(iy3,jy3);
267             dz33             = _mm_sub_ps(iz3,jz3);
268
269             /* Calculate squared distance and things based on it */
270             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
271             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
272             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
273             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
274             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
275             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
276             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
277             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
278             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
279             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
280
281             rinv00           = sse41_invsqrt_f(rsq00);
282             rinv11           = sse41_invsqrt_f(rsq11);
283             rinv12           = sse41_invsqrt_f(rsq12);
284             rinv13           = sse41_invsqrt_f(rsq13);
285             rinv21           = sse41_invsqrt_f(rsq21);
286             rinv22           = sse41_invsqrt_f(rsq22);
287             rinv23           = sse41_invsqrt_f(rsq23);
288             rinv31           = sse41_invsqrt_f(rsq31);
289             rinv32           = sse41_invsqrt_f(rsq32);
290             rinv33           = sse41_invsqrt_f(rsq33);
291
292             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
293             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
294             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
295             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
296             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
297             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
298             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
299             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
300             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
301
302             fjx0             = _mm_setzero_ps();
303             fjy0             = _mm_setzero_ps();
304             fjz0             = _mm_setzero_ps();
305             fjx1             = _mm_setzero_ps();
306             fjy1             = _mm_setzero_ps();
307             fjz1             = _mm_setzero_ps();
308             fjx2             = _mm_setzero_ps();
309             fjy2             = _mm_setzero_ps();
310             fjz2             = _mm_setzero_ps();
311             fjx3             = _mm_setzero_ps();
312             fjy3             = _mm_setzero_ps();
313             fjz3             = _mm_setzero_ps();
314
315             /**************************
316              * CALCULATE INTERACTIONS *
317              **************************/
318
319             r00              = _mm_mul_ps(rsq00,rinv00);
320
321             /* Calculate table index by multiplying r with table scale and truncate to integer */
322             rt               = _mm_mul_ps(r00,vftabscale);
323             vfitab           = _mm_cvttps_epi32(rt);
324             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
325             vfitab           = _mm_slli_epi32(vfitab,3);
326
327             /* CUBIC SPLINE TABLE DISPERSION */
328             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
329             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
330             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
331             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
332             _MM_TRANSPOSE4_PS(Y,F,G,H);
333             Heps             = _mm_mul_ps(vfeps,H);
334             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
335             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
336             vvdw6            = _mm_mul_ps(c6_00,VV);
337             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
338             fvdw6            = _mm_mul_ps(c6_00,FF);
339
340             /* CUBIC SPLINE TABLE REPULSION */
341             vfitab           = _mm_add_epi32(vfitab,ifour);
342             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
343             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
344             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
345             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
346             _MM_TRANSPOSE4_PS(Y,F,G,H);
347             Heps             = _mm_mul_ps(vfeps,H);
348             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
349             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
350             vvdw12           = _mm_mul_ps(c12_00,VV);
351             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
352             fvdw12           = _mm_mul_ps(c12_00,FF);
353             vvdw             = _mm_add_ps(vvdw12,vvdw6);
354             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
355
356             /* Update potential sum for this i atom from the interaction with this j atom. */
357             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
358
359             fscal            = fvdw;
360
361             /* Calculate temporary vectorial force */
362             tx               = _mm_mul_ps(fscal,dx00);
363             ty               = _mm_mul_ps(fscal,dy00);
364             tz               = _mm_mul_ps(fscal,dz00);
365
366             /* Update vectorial force */
367             fix0             = _mm_add_ps(fix0,tx);
368             fiy0             = _mm_add_ps(fiy0,ty);
369             fiz0             = _mm_add_ps(fiz0,tz);
370
371             fjx0             = _mm_add_ps(fjx0,tx);
372             fjy0             = _mm_add_ps(fjy0,ty);
373             fjz0             = _mm_add_ps(fjz0,tz);
374
375             /**************************
376              * CALCULATE INTERACTIONS *
377              **************************/
378
379             /* COULOMB ELECTROSTATICS */
380             velec            = _mm_mul_ps(qq11,rinv11);
381             felec            = _mm_mul_ps(velec,rinvsq11);
382
383             /* Update potential sum for this i atom from the interaction with this j atom. */
384             velecsum         = _mm_add_ps(velecsum,velec);
385
386             fscal            = felec;
387
388             /* Calculate temporary vectorial force */
389             tx               = _mm_mul_ps(fscal,dx11);
390             ty               = _mm_mul_ps(fscal,dy11);
391             tz               = _mm_mul_ps(fscal,dz11);
392
393             /* Update vectorial force */
394             fix1             = _mm_add_ps(fix1,tx);
395             fiy1             = _mm_add_ps(fiy1,ty);
396             fiz1             = _mm_add_ps(fiz1,tz);
397
398             fjx1             = _mm_add_ps(fjx1,tx);
399             fjy1             = _mm_add_ps(fjy1,ty);
400             fjz1             = _mm_add_ps(fjz1,tz);
401
402             /**************************
403              * CALCULATE INTERACTIONS *
404              **************************/
405
406             /* COULOMB ELECTROSTATICS */
407             velec            = _mm_mul_ps(qq12,rinv12);
408             felec            = _mm_mul_ps(velec,rinvsq12);
409
410             /* Update potential sum for this i atom from the interaction with this j atom. */
411             velecsum         = _mm_add_ps(velecsum,velec);
412
413             fscal            = felec;
414
415             /* Calculate temporary vectorial force */
416             tx               = _mm_mul_ps(fscal,dx12);
417             ty               = _mm_mul_ps(fscal,dy12);
418             tz               = _mm_mul_ps(fscal,dz12);
419
420             /* Update vectorial force */
421             fix1             = _mm_add_ps(fix1,tx);
422             fiy1             = _mm_add_ps(fiy1,ty);
423             fiz1             = _mm_add_ps(fiz1,tz);
424
425             fjx2             = _mm_add_ps(fjx2,tx);
426             fjy2             = _mm_add_ps(fjy2,ty);
427             fjz2             = _mm_add_ps(fjz2,tz);
428
429             /**************************
430              * CALCULATE INTERACTIONS *
431              **************************/
432
433             /* COULOMB ELECTROSTATICS */
434             velec            = _mm_mul_ps(qq13,rinv13);
435             felec            = _mm_mul_ps(velec,rinvsq13);
436
437             /* Update potential sum for this i atom from the interaction with this j atom. */
438             velecsum         = _mm_add_ps(velecsum,velec);
439
440             fscal            = felec;
441
442             /* Calculate temporary vectorial force */
443             tx               = _mm_mul_ps(fscal,dx13);
444             ty               = _mm_mul_ps(fscal,dy13);
445             tz               = _mm_mul_ps(fscal,dz13);
446
447             /* Update vectorial force */
448             fix1             = _mm_add_ps(fix1,tx);
449             fiy1             = _mm_add_ps(fiy1,ty);
450             fiz1             = _mm_add_ps(fiz1,tz);
451
452             fjx3             = _mm_add_ps(fjx3,tx);
453             fjy3             = _mm_add_ps(fjy3,ty);
454             fjz3             = _mm_add_ps(fjz3,tz);
455
456             /**************************
457              * CALCULATE INTERACTIONS *
458              **************************/
459
460             /* COULOMB ELECTROSTATICS */
461             velec            = _mm_mul_ps(qq21,rinv21);
462             felec            = _mm_mul_ps(velec,rinvsq21);
463
464             /* Update potential sum for this i atom from the interaction with this j atom. */
465             velecsum         = _mm_add_ps(velecsum,velec);
466
467             fscal            = felec;
468
469             /* Calculate temporary vectorial force */
470             tx               = _mm_mul_ps(fscal,dx21);
471             ty               = _mm_mul_ps(fscal,dy21);
472             tz               = _mm_mul_ps(fscal,dz21);
473
474             /* Update vectorial force */
475             fix2             = _mm_add_ps(fix2,tx);
476             fiy2             = _mm_add_ps(fiy2,ty);
477             fiz2             = _mm_add_ps(fiz2,tz);
478
479             fjx1             = _mm_add_ps(fjx1,tx);
480             fjy1             = _mm_add_ps(fjy1,ty);
481             fjz1             = _mm_add_ps(fjz1,tz);
482
483             /**************************
484              * CALCULATE INTERACTIONS *
485              **************************/
486
487             /* COULOMB ELECTROSTATICS */
488             velec            = _mm_mul_ps(qq22,rinv22);
489             felec            = _mm_mul_ps(velec,rinvsq22);
490
491             /* Update potential sum for this i atom from the interaction with this j atom. */
492             velecsum         = _mm_add_ps(velecsum,velec);
493
494             fscal            = felec;
495
496             /* Calculate temporary vectorial force */
497             tx               = _mm_mul_ps(fscal,dx22);
498             ty               = _mm_mul_ps(fscal,dy22);
499             tz               = _mm_mul_ps(fscal,dz22);
500
501             /* Update vectorial force */
502             fix2             = _mm_add_ps(fix2,tx);
503             fiy2             = _mm_add_ps(fiy2,ty);
504             fiz2             = _mm_add_ps(fiz2,tz);
505
506             fjx2             = _mm_add_ps(fjx2,tx);
507             fjy2             = _mm_add_ps(fjy2,ty);
508             fjz2             = _mm_add_ps(fjz2,tz);
509
510             /**************************
511              * CALCULATE INTERACTIONS *
512              **************************/
513
514             /* COULOMB ELECTROSTATICS */
515             velec            = _mm_mul_ps(qq23,rinv23);
516             felec            = _mm_mul_ps(velec,rinvsq23);
517
518             /* Update potential sum for this i atom from the interaction with this j atom. */
519             velecsum         = _mm_add_ps(velecsum,velec);
520
521             fscal            = felec;
522
523             /* Calculate temporary vectorial force */
524             tx               = _mm_mul_ps(fscal,dx23);
525             ty               = _mm_mul_ps(fscal,dy23);
526             tz               = _mm_mul_ps(fscal,dz23);
527
528             /* Update vectorial force */
529             fix2             = _mm_add_ps(fix2,tx);
530             fiy2             = _mm_add_ps(fiy2,ty);
531             fiz2             = _mm_add_ps(fiz2,tz);
532
533             fjx3             = _mm_add_ps(fjx3,tx);
534             fjy3             = _mm_add_ps(fjy3,ty);
535             fjz3             = _mm_add_ps(fjz3,tz);
536
537             /**************************
538              * CALCULATE INTERACTIONS *
539              **************************/
540
541             /* COULOMB ELECTROSTATICS */
542             velec            = _mm_mul_ps(qq31,rinv31);
543             felec            = _mm_mul_ps(velec,rinvsq31);
544
545             /* Update potential sum for this i atom from the interaction with this j atom. */
546             velecsum         = _mm_add_ps(velecsum,velec);
547
548             fscal            = felec;
549
550             /* Calculate temporary vectorial force */
551             tx               = _mm_mul_ps(fscal,dx31);
552             ty               = _mm_mul_ps(fscal,dy31);
553             tz               = _mm_mul_ps(fscal,dz31);
554
555             /* Update vectorial force */
556             fix3             = _mm_add_ps(fix3,tx);
557             fiy3             = _mm_add_ps(fiy3,ty);
558             fiz3             = _mm_add_ps(fiz3,tz);
559
560             fjx1             = _mm_add_ps(fjx1,tx);
561             fjy1             = _mm_add_ps(fjy1,ty);
562             fjz1             = _mm_add_ps(fjz1,tz);
563
564             /**************************
565              * CALCULATE INTERACTIONS *
566              **************************/
567
568             /* COULOMB ELECTROSTATICS */
569             velec            = _mm_mul_ps(qq32,rinv32);
570             felec            = _mm_mul_ps(velec,rinvsq32);
571
572             /* Update potential sum for this i atom from the interaction with this j atom. */
573             velecsum         = _mm_add_ps(velecsum,velec);
574
575             fscal            = felec;
576
577             /* Calculate temporary vectorial force */
578             tx               = _mm_mul_ps(fscal,dx32);
579             ty               = _mm_mul_ps(fscal,dy32);
580             tz               = _mm_mul_ps(fscal,dz32);
581
582             /* Update vectorial force */
583             fix3             = _mm_add_ps(fix3,tx);
584             fiy3             = _mm_add_ps(fiy3,ty);
585             fiz3             = _mm_add_ps(fiz3,tz);
586
587             fjx2             = _mm_add_ps(fjx2,tx);
588             fjy2             = _mm_add_ps(fjy2,ty);
589             fjz2             = _mm_add_ps(fjz2,tz);
590
591             /**************************
592              * CALCULATE INTERACTIONS *
593              **************************/
594
595             /* COULOMB ELECTROSTATICS */
596             velec            = _mm_mul_ps(qq33,rinv33);
597             felec            = _mm_mul_ps(velec,rinvsq33);
598
599             /* Update potential sum for this i atom from the interaction with this j atom. */
600             velecsum         = _mm_add_ps(velecsum,velec);
601
602             fscal            = felec;
603
604             /* Calculate temporary vectorial force */
605             tx               = _mm_mul_ps(fscal,dx33);
606             ty               = _mm_mul_ps(fscal,dy33);
607             tz               = _mm_mul_ps(fscal,dz33);
608
609             /* Update vectorial force */
610             fix3             = _mm_add_ps(fix3,tx);
611             fiy3             = _mm_add_ps(fiy3,ty);
612             fiz3             = _mm_add_ps(fiz3,tz);
613
614             fjx3             = _mm_add_ps(fjx3,tx);
615             fjy3             = _mm_add_ps(fjy3,ty);
616             fjz3             = _mm_add_ps(fjz3,tz);
617
618             fjptrA             = f+j_coord_offsetA;
619             fjptrB             = f+j_coord_offsetB;
620             fjptrC             = f+j_coord_offsetC;
621             fjptrD             = f+j_coord_offsetD;
622
623             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
624                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
625                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
626
627             /* Inner loop uses 311 flops */
628         }
629
630         if(jidx<j_index_end)
631         {
632
633             /* Get j neighbor index, and coordinate index */
634             jnrlistA         = jjnr[jidx];
635             jnrlistB         = jjnr[jidx+1];
636             jnrlistC         = jjnr[jidx+2];
637             jnrlistD         = jjnr[jidx+3];
638             /* Sign of each element will be negative for non-real atoms.
639              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
640              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
641              */
642             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
643             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
644             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
645             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
646             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
647             j_coord_offsetA  = DIM*jnrA;
648             j_coord_offsetB  = DIM*jnrB;
649             j_coord_offsetC  = DIM*jnrC;
650             j_coord_offsetD  = DIM*jnrD;
651
652             /* load j atom coordinates */
653             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
654                                               x+j_coord_offsetC,x+j_coord_offsetD,
655                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
656                                               &jy2,&jz2,&jx3,&jy3,&jz3);
657
658             /* Calculate displacement vector */
659             dx00             = _mm_sub_ps(ix0,jx0);
660             dy00             = _mm_sub_ps(iy0,jy0);
661             dz00             = _mm_sub_ps(iz0,jz0);
662             dx11             = _mm_sub_ps(ix1,jx1);
663             dy11             = _mm_sub_ps(iy1,jy1);
664             dz11             = _mm_sub_ps(iz1,jz1);
665             dx12             = _mm_sub_ps(ix1,jx2);
666             dy12             = _mm_sub_ps(iy1,jy2);
667             dz12             = _mm_sub_ps(iz1,jz2);
668             dx13             = _mm_sub_ps(ix1,jx3);
669             dy13             = _mm_sub_ps(iy1,jy3);
670             dz13             = _mm_sub_ps(iz1,jz3);
671             dx21             = _mm_sub_ps(ix2,jx1);
672             dy21             = _mm_sub_ps(iy2,jy1);
673             dz21             = _mm_sub_ps(iz2,jz1);
674             dx22             = _mm_sub_ps(ix2,jx2);
675             dy22             = _mm_sub_ps(iy2,jy2);
676             dz22             = _mm_sub_ps(iz2,jz2);
677             dx23             = _mm_sub_ps(ix2,jx3);
678             dy23             = _mm_sub_ps(iy2,jy3);
679             dz23             = _mm_sub_ps(iz2,jz3);
680             dx31             = _mm_sub_ps(ix3,jx1);
681             dy31             = _mm_sub_ps(iy3,jy1);
682             dz31             = _mm_sub_ps(iz3,jz1);
683             dx32             = _mm_sub_ps(ix3,jx2);
684             dy32             = _mm_sub_ps(iy3,jy2);
685             dz32             = _mm_sub_ps(iz3,jz2);
686             dx33             = _mm_sub_ps(ix3,jx3);
687             dy33             = _mm_sub_ps(iy3,jy3);
688             dz33             = _mm_sub_ps(iz3,jz3);
689
690             /* Calculate squared distance and things based on it */
691             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
692             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
693             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
694             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
695             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
696             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
697             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
698             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
699             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
700             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
701
702             rinv00           = sse41_invsqrt_f(rsq00);
703             rinv11           = sse41_invsqrt_f(rsq11);
704             rinv12           = sse41_invsqrt_f(rsq12);
705             rinv13           = sse41_invsqrt_f(rsq13);
706             rinv21           = sse41_invsqrt_f(rsq21);
707             rinv22           = sse41_invsqrt_f(rsq22);
708             rinv23           = sse41_invsqrt_f(rsq23);
709             rinv31           = sse41_invsqrt_f(rsq31);
710             rinv32           = sse41_invsqrt_f(rsq32);
711             rinv33           = sse41_invsqrt_f(rsq33);
712
713             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
714             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
715             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
716             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
717             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
718             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
719             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
720             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
721             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
722
723             fjx0             = _mm_setzero_ps();
724             fjy0             = _mm_setzero_ps();
725             fjz0             = _mm_setzero_ps();
726             fjx1             = _mm_setzero_ps();
727             fjy1             = _mm_setzero_ps();
728             fjz1             = _mm_setzero_ps();
729             fjx2             = _mm_setzero_ps();
730             fjy2             = _mm_setzero_ps();
731             fjz2             = _mm_setzero_ps();
732             fjx3             = _mm_setzero_ps();
733             fjy3             = _mm_setzero_ps();
734             fjz3             = _mm_setzero_ps();
735
736             /**************************
737              * CALCULATE INTERACTIONS *
738              **************************/
739
740             r00              = _mm_mul_ps(rsq00,rinv00);
741             r00              = _mm_andnot_ps(dummy_mask,r00);
742
743             /* Calculate table index by multiplying r with table scale and truncate to integer */
744             rt               = _mm_mul_ps(r00,vftabscale);
745             vfitab           = _mm_cvttps_epi32(rt);
746             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
747             vfitab           = _mm_slli_epi32(vfitab,3);
748
749             /* CUBIC SPLINE TABLE DISPERSION */
750             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
751             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
752             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
753             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
754             _MM_TRANSPOSE4_PS(Y,F,G,H);
755             Heps             = _mm_mul_ps(vfeps,H);
756             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
757             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
758             vvdw6            = _mm_mul_ps(c6_00,VV);
759             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
760             fvdw6            = _mm_mul_ps(c6_00,FF);
761
762             /* CUBIC SPLINE TABLE REPULSION */
763             vfitab           = _mm_add_epi32(vfitab,ifour);
764             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
765             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
766             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
767             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
768             _MM_TRANSPOSE4_PS(Y,F,G,H);
769             Heps             = _mm_mul_ps(vfeps,H);
770             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
771             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
772             vvdw12           = _mm_mul_ps(c12_00,VV);
773             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
774             fvdw12           = _mm_mul_ps(c12_00,FF);
775             vvdw             = _mm_add_ps(vvdw12,vvdw6);
776             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
777
778             /* Update potential sum for this i atom from the interaction with this j atom. */
779             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
780             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
781
782             fscal            = fvdw;
783
784             fscal            = _mm_andnot_ps(dummy_mask,fscal);
785
786             /* Calculate temporary vectorial force */
787             tx               = _mm_mul_ps(fscal,dx00);
788             ty               = _mm_mul_ps(fscal,dy00);
789             tz               = _mm_mul_ps(fscal,dz00);
790
791             /* Update vectorial force */
792             fix0             = _mm_add_ps(fix0,tx);
793             fiy0             = _mm_add_ps(fiy0,ty);
794             fiz0             = _mm_add_ps(fiz0,tz);
795
796             fjx0             = _mm_add_ps(fjx0,tx);
797             fjy0             = _mm_add_ps(fjy0,ty);
798             fjz0             = _mm_add_ps(fjz0,tz);
799
800             /**************************
801              * CALCULATE INTERACTIONS *
802              **************************/
803
804             /* COULOMB ELECTROSTATICS */
805             velec            = _mm_mul_ps(qq11,rinv11);
806             felec            = _mm_mul_ps(velec,rinvsq11);
807
808             /* Update potential sum for this i atom from the interaction with this j atom. */
809             velec            = _mm_andnot_ps(dummy_mask,velec);
810             velecsum         = _mm_add_ps(velecsum,velec);
811
812             fscal            = felec;
813
814             fscal            = _mm_andnot_ps(dummy_mask,fscal);
815
816             /* Calculate temporary vectorial force */
817             tx               = _mm_mul_ps(fscal,dx11);
818             ty               = _mm_mul_ps(fscal,dy11);
819             tz               = _mm_mul_ps(fscal,dz11);
820
821             /* Update vectorial force */
822             fix1             = _mm_add_ps(fix1,tx);
823             fiy1             = _mm_add_ps(fiy1,ty);
824             fiz1             = _mm_add_ps(fiz1,tz);
825
826             fjx1             = _mm_add_ps(fjx1,tx);
827             fjy1             = _mm_add_ps(fjy1,ty);
828             fjz1             = _mm_add_ps(fjz1,tz);
829
830             /**************************
831              * CALCULATE INTERACTIONS *
832              **************************/
833
834             /* COULOMB ELECTROSTATICS */
835             velec            = _mm_mul_ps(qq12,rinv12);
836             felec            = _mm_mul_ps(velec,rinvsq12);
837
838             /* Update potential sum for this i atom from the interaction with this j atom. */
839             velec            = _mm_andnot_ps(dummy_mask,velec);
840             velecsum         = _mm_add_ps(velecsum,velec);
841
842             fscal            = felec;
843
844             fscal            = _mm_andnot_ps(dummy_mask,fscal);
845
846             /* Calculate temporary vectorial force */
847             tx               = _mm_mul_ps(fscal,dx12);
848             ty               = _mm_mul_ps(fscal,dy12);
849             tz               = _mm_mul_ps(fscal,dz12);
850
851             /* Update vectorial force */
852             fix1             = _mm_add_ps(fix1,tx);
853             fiy1             = _mm_add_ps(fiy1,ty);
854             fiz1             = _mm_add_ps(fiz1,tz);
855
856             fjx2             = _mm_add_ps(fjx2,tx);
857             fjy2             = _mm_add_ps(fjy2,ty);
858             fjz2             = _mm_add_ps(fjz2,tz);
859
860             /**************************
861              * CALCULATE INTERACTIONS *
862              **************************/
863
864             /* COULOMB ELECTROSTATICS */
865             velec            = _mm_mul_ps(qq13,rinv13);
866             felec            = _mm_mul_ps(velec,rinvsq13);
867
868             /* Update potential sum for this i atom from the interaction with this j atom. */
869             velec            = _mm_andnot_ps(dummy_mask,velec);
870             velecsum         = _mm_add_ps(velecsum,velec);
871
872             fscal            = felec;
873
874             fscal            = _mm_andnot_ps(dummy_mask,fscal);
875
876             /* Calculate temporary vectorial force */
877             tx               = _mm_mul_ps(fscal,dx13);
878             ty               = _mm_mul_ps(fscal,dy13);
879             tz               = _mm_mul_ps(fscal,dz13);
880
881             /* Update vectorial force */
882             fix1             = _mm_add_ps(fix1,tx);
883             fiy1             = _mm_add_ps(fiy1,ty);
884             fiz1             = _mm_add_ps(fiz1,tz);
885
886             fjx3             = _mm_add_ps(fjx3,tx);
887             fjy3             = _mm_add_ps(fjy3,ty);
888             fjz3             = _mm_add_ps(fjz3,tz);
889
890             /**************************
891              * CALCULATE INTERACTIONS *
892              **************************/
893
894             /* COULOMB ELECTROSTATICS */
895             velec            = _mm_mul_ps(qq21,rinv21);
896             felec            = _mm_mul_ps(velec,rinvsq21);
897
898             /* Update potential sum for this i atom from the interaction with this j atom. */
899             velec            = _mm_andnot_ps(dummy_mask,velec);
900             velecsum         = _mm_add_ps(velecsum,velec);
901
902             fscal            = felec;
903
904             fscal            = _mm_andnot_ps(dummy_mask,fscal);
905
906             /* Calculate temporary vectorial force */
907             tx               = _mm_mul_ps(fscal,dx21);
908             ty               = _mm_mul_ps(fscal,dy21);
909             tz               = _mm_mul_ps(fscal,dz21);
910
911             /* Update vectorial force */
912             fix2             = _mm_add_ps(fix2,tx);
913             fiy2             = _mm_add_ps(fiy2,ty);
914             fiz2             = _mm_add_ps(fiz2,tz);
915
916             fjx1             = _mm_add_ps(fjx1,tx);
917             fjy1             = _mm_add_ps(fjy1,ty);
918             fjz1             = _mm_add_ps(fjz1,tz);
919
920             /**************************
921              * CALCULATE INTERACTIONS *
922              **************************/
923
924             /* COULOMB ELECTROSTATICS */
925             velec            = _mm_mul_ps(qq22,rinv22);
926             felec            = _mm_mul_ps(velec,rinvsq22);
927
928             /* Update potential sum for this i atom from the interaction with this j atom. */
929             velec            = _mm_andnot_ps(dummy_mask,velec);
930             velecsum         = _mm_add_ps(velecsum,velec);
931
932             fscal            = felec;
933
934             fscal            = _mm_andnot_ps(dummy_mask,fscal);
935
936             /* Calculate temporary vectorial force */
937             tx               = _mm_mul_ps(fscal,dx22);
938             ty               = _mm_mul_ps(fscal,dy22);
939             tz               = _mm_mul_ps(fscal,dz22);
940
941             /* Update vectorial force */
942             fix2             = _mm_add_ps(fix2,tx);
943             fiy2             = _mm_add_ps(fiy2,ty);
944             fiz2             = _mm_add_ps(fiz2,tz);
945
946             fjx2             = _mm_add_ps(fjx2,tx);
947             fjy2             = _mm_add_ps(fjy2,ty);
948             fjz2             = _mm_add_ps(fjz2,tz);
949
950             /**************************
951              * CALCULATE INTERACTIONS *
952              **************************/
953
954             /* COULOMB ELECTROSTATICS */
955             velec            = _mm_mul_ps(qq23,rinv23);
956             felec            = _mm_mul_ps(velec,rinvsq23);
957
958             /* Update potential sum for this i atom from the interaction with this j atom. */
959             velec            = _mm_andnot_ps(dummy_mask,velec);
960             velecsum         = _mm_add_ps(velecsum,velec);
961
962             fscal            = felec;
963
964             fscal            = _mm_andnot_ps(dummy_mask,fscal);
965
966             /* Calculate temporary vectorial force */
967             tx               = _mm_mul_ps(fscal,dx23);
968             ty               = _mm_mul_ps(fscal,dy23);
969             tz               = _mm_mul_ps(fscal,dz23);
970
971             /* Update vectorial force */
972             fix2             = _mm_add_ps(fix2,tx);
973             fiy2             = _mm_add_ps(fiy2,ty);
974             fiz2             = _mm_add_ps(fiz2,tz);
975
976             fjx3             = _mm_add_ps(fjx3,tx);
977             fjy3             = _mm_add_ps(fjy3,ty);
978             fjz3             = _mm_add_ps(fjz3,tz);
979
980             /**************************
981              * CALCULATE INTERACTIONS *
982              **************************/
983
984             /* COULOMB ELECTROSTATICS */
985             velec            = _mm_mul_ps(qq31,rinv31);
986             felec            = _mm_mul_ps(velec,rinvsq31);
987
988             /* Update potential sum for this i atom from the interaction with this j atom. */
989             velec            = _mm_andnot_ps(dummy_mask,velec);
990             velecsum         = _mm_add_ps(velecsum,velec);
991
992             fscal            = felec;
993
994             fscal            = _mm_andnot_ps(dummy_mask,fscal);
995
996             /* Calculate temporary vectorial force */
997             tx               = _mm_mul_ps(fscal,dx31);
998             ty               = _mm_mul_ps(fscal,dy31);
999             tz               = _mm_mul_ps(fscal,dz31);
1000
1001             /* Update vectorial force */
1002             fix3             = _mm_add_ps(fix3,tx);
1003             fiy3             = _mm_add_ps(fiy3,ty);
1004             fiz3             = _mm_add_ps(fiz3,tz);
1005
1006             fjx1             = _mm_add_ps(fjx1,tx);
1007             fjy1             = _mm_add_ps(fjy1,ty);
1008             fjz1             = _mm_add_ps(fjz1,tz);
1009
1010             /**************************
1011              * CALCULATE INTERACTIONS *
1012              **************************/
1013
1014             /* COULOMB ELECTROSTATICS */
1015             velec            = _mm_mul_ps(qq32,rinv32);
1016             felec            = _mm_mul_ps(velec,rinvsq32);
1017
1018             /* Update potential sum for this i atom from the interaction with this j atom. */
1019             velec            = _mm_andnot_ps(dummy_mask,velec);
1020             velecsum         = _mm_add_ps(velecsum,velec);
1021
1022             fscal            = felec;
1023
1024             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1025
1026             /* Calculate temporary vectorial force */
1027             tx               = _mm_mul_ps(fscal,dx32);
1028             ty               = _mm_mul_ps(fscal,dy32);
1029             tz               = _mm_mul_ps(fscal,dz32);
1030
1031             /* Update vectorial force */
1032             fix3             = _mm_add_ps(fix3,tx);
1033             fiy3             = _mm_add_ps(fiy3,ty);
1034             fiz3             = _mm_add_ps(fiz3,tz);
1035
1036             fjx2             = _mm_add_ps(fjx2,tx);
1037             fjy2             = _mm_add_ps(fjy2,ty);
1038             fjz2             = _mm_add_ps(fjz2,tz);
1039
1040             /**************************
1041              * CALCULATE INTERACTIONS *
1042              **************************/
1043
1044             /* COULOMB ELECTROSTATICS */
1045             velec            = _mm_mul_ps(qq33,rinv33);
1046             felec            = _mm_mul_ps(velec,rinvsq33);
1047
1048             /* Update potential sum for this i atom from the interaction with this j atom. */
1049             velec            = _mm_andnot_ps(dummy_mask,velec);
1050             velecsum         = _mm_add_ps(velecsum,velec);
1051
1052             fscal            = felec;
1053
1054             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1055
1056             /* Calculate temporary vectorial force */
1057             tx               = _mm_mul_ps(fscal,dx33);
1058             ty               = _mm_mul_ps(fscal,dy33);
1059             tz               = _mm_mul_ps(fscal,dz33);
1060
1061             /* Update vectorial force */
1062             fix3             = _mm_add_ps(fix3,tx);
1063             fiy3             = _mm_add_ps(fiy3,ty);
1064             fiz3             = _mm_add_ps(fiz3,tz);
1065
1066             fjx3             = _mm_add_ps(fjx3,tx);
1067             fjy3             = _mm_add_ps(fjy3,ty);
1068             fjz3             = _mm_add_ps(fjz3,tz);
1069
1070             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1071             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1072             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1073             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1074
1075             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1076                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1077                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1078
1079             /* Inner loop uses 312 flops */
1080         }
1081
1082         /* End of innermost loop */
1083
1084         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1085                                               f+i_coord_offset,fshift+i_shift_offset);
1086
1087         ggid                        = gid[iidx];
1088         /* Update potential energies */
1089         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1090         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1091
1092         /* Increment number of inner iterations */
1093         inneriter                  += j_index_end - j_index_start;
1094
1095         /* Outer loop uses 26 flops */
1096     }
1097
1098     /* Increment number of outer iterations */
1099     outeriter        += nri;
1100
1101     /* Update outer/inner flops */
1102
1103     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*312);
1104 }
1105 /*
1106  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse4_1_single
1107  * Electrostatics interaction: Coulomb
1108  * VdW interaction:            CubicSplineTable
1109  * Geometry:                   Water4-Water4
1110  * Calculate force/pot:        Force
1111  */
1112 void
1113 nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse4_1_single
1114                     (t_nblist                    * gmx_restrict       nlist,
1115                      rvec                        * gmx_restrict          xx,
1116                      rvec                        * gmx_restrict          ff,
1117                      struct t_forcerec           * gmx_restrict          fr,
1118                      t_mdatoms                   * gmx_restrict     mdatoms,
1119                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1120                      t_nrnb                      * gmx_restrict        nrnb)
1121 {
1122     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1123      * just 0 for non-waters.
1124      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1125      * jnr indices corresponding to data put in the four positions in the SIMD register.
1126      */
1127     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1128     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1129     int              jnrA,jnrB,jnrC,jnrD;
1130     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1131     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1132     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1133     real             rcutoff_scalar;
1134     real             *shiftvec,*fshift,*x,*f;
1135     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1136     real             scratch[4*DIM];
1137     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1138     int              vdwioffset0;
1139     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1140     int              vdwioffset1;
1141     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1142     int              vdwioffset2;
1143     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1144     int              vdwioffset3;
1145     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1146     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1147     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1148     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1149     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1150     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1151     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1152     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1153     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1154     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1155     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1156     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1157     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1158     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1159     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1160     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1161     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1162     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1163     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1164     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1165     real             *charge;
1166     int              nvdwtype;
1167     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1168     int              *vdwtype;
1169     real             *vdwparam;
1170     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1171     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1172     __m128i          vfitab;
1173     __m128i          ifour       = _mm_set1_epi32(4);
1174     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1175     real             *vftab;
1176     __m128           dummy_mask,cutoff_mask;
1177     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1178     __m128           one     = _mm_set1_ps(1.0);
1179     __m128           two     = _mm_set1_ps(2.0);
1180     x                = xx[0];
1181     f                = ff[0];
1182
1183     nri              = nlist->nri;
1184     iinr             = nlist->iinr;
1185     jindex           = nlist->jindex;
1186     jjnr             = nlist->jjnr;
1187     shiftidx         = nlist->shift;
1188     gid              = nlist->gid;
1189     shiftvec         = fr->shift_vec[0];
1190     fshift           = fr->fshift[0];
1191     facel            = _mm_set1_ps(fr->ic->epsfac);
1192     charge           = mdatoms->chargeA;
1193     nvdwtype         = fr->ntype;
1194     vdwparam         = fr->nbfp;
1195     vdwtype          = mdatoms->typeA;
1196
1197     vftab            = kernel_data->table_vdw->data;
1198     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
1199
1200     /* Setup water-specific parameters */
1201     inr              = nlist->iinr[0];
1202     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1203     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1204     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1205     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1206
1207     jq1              = _mm_set1_ps(charge[inr+1]);
1208     jq2              = _mm_set1_ps(charge[inr+2]);
1209     jq3              = _mm_set1_ps(charge[inr+3]);
1210     vdwjidx0A        = 2*vdwtype[inr+0];
1211     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1212     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1213     qq11             = _mm_mul_ps(iq1,jq1);
1214     qq12             = _mm_mul_ps(iq1,jq2);
1215     qq13             = _mm_mul_ps(iq1,jq3);
1216     qq21             = _mm_mul_ps(iq2,jq1);
1217     qq22             = _mm_mul_ps(iq2,jq2);
1218     qq23             = _mm_mul_ps(iq2,jq3);
1219     qq31             = _mm_mul_ps(iq3,jq1);
1220     qq32             = _mm_mul_ps(iq3,jq2);
1221     qq33             = _mm_mul_ps(iq3,jq3);
1222
1223     /* Avoid stupid compiler warnings */
1224     jnrA = jnrB = jnrC = jnrD = 0;
1225     j_coord_offsetA = 0;
1226     j_coord_offsetB = 0;
1227     j_coord_offsetC = 0;
1228     j_coord_offsetD = 0;
1229
1230     outeriter        = 0;
1231     inneriter        = 0;
1232
1233     for(iidx=0;iidx<4*DIM;iidx++)
1234     {
1235         scratch[iidx] = 0.0;
1236     }
1237
1238     /* Start outer loop over neighborlists */
1239     for(iidx=0; iidx<nri; iidx++)
1240     {
1241         /* Load shift vector for this list */
1242         i_shift_offset   = DIM*shiftidx[iidx];
1243
1244         /* Load limits for loop over neighbors */
1245         j_index_start    = jindex[iidx];
1246         j_index_end      = jindex[iidx+1];
1247
1248         /* Get outer coordinate index */
1249         inr              = iinr[iidx];
1250         i_coord_offset   = DIM*inr;
1251
1252         /* Load i particle coords and add shift vector */
1253         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1254                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1255
1256         fix0             = _mm_setzero_ps();
1257         fiy0             = _mm_setzero_ps();
1258         fiz0             = _mm_setzero_ps();
1259         fix1             = _mm_setzero_ps();
1260         fiy1             = _mm_setzero_ps();
1261         fiz1             = _mm_setzero_ps();
1262         fix2             = _mm_setzero_ps();
1263         fiy2             = _mm_setzero_ps();
1264         fiz2             = _mm_setzero_ps();
1265         fix3             = _mm_setzero_ps();
1266         fiy3             = _mm_setzero_ps();
1267         fiz3             = _mm_setzero_ps();
1268
1269         /* Start inner kernel loop */
1270         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1271         {
1272
1273             /* Get j neighbor index, and coordinate index */
1274             jnrA             = jjnr[jidx];
1275             jnrB             = jjnr[jidx+1];
1276             jnrC             = jjnr[jidx+2];
1277             jnrD             = jjnr[jidx+3];
1278             j_coord_offsetA  = DIM*jnrA;
1279             j_coord_offsetB  = DIM*jnrB;
1280             j_coord_offsetC  = DIM*jnrC;
1281             j_coord_offsetD  = DIM*jnrD;
1282
1283             /* load j atom coordinates */
1284             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1285                                               x+j_coord_offsetC,x+j_coord_offsetD,
1286                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1287                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1288
1289             /* Calculate displacement vector */
1290             dx00             = _mm_sub_ps(ix0,jx0);
1291             dy00             = _mm_sub_ps(iy0,jy0);
1292             dz00             = _mm_sub_ps(iz0,jz0);
1293             dx11             = _mm_sub_ps(ix1,jx1);
1294             dy11             = _mm_sub_ps(iy1,jy1);
1295             dz11             = _mm_sub_ps(iz1,jz1);
1296             dx12             = _mm_sub_ps(ix1,jx2);
1297             dy12             = _mm_sub_ps(iy1,jy2);
1298             dz12             = _mm_sub_ps(iz1,jz2);
1299             dx13             = _mm_sub_ps(ix1,jx3);
1300             dy13             = _mm_sub_ps(iy1,jy3);
1301             dz13             = _mm_sub_ps(iz1,jz3);
1302             dx21             = _mm_sub_ps(ix2,jx1);
1303             dy21             = _mm_sub_ps(iy2,jy1);
1304             dz21             = _mm_sub_ps(iz2,jz1);
1305             dx22             = _mm_sub_ps(ix2,jx2);
1306             dy22             = _mm_sub_ps(iy2,jy2);
1307             dz22             = _mm_sub_ps(iz2,jz2);
1308             dx23             = _mm_sub_ps(ix2,jx3);
1309             dy23             = _mm_sub_ps(iy2,jy3);
1310             dz23             = _mm_sub_ps(iz2,jz3);
1311             dx31             = _mm_sub_ps(ix3,jx1);
1312             dy31             = _mm_sub_ps(iy3,jy1);
1313             dz31             = _mm_sub_ps(iz3,jz1);
1314             dx32             = _mm_sub_ps(ix3,jx2);
1315             dy32             = _mm_sub_ps(iy3,jy2);
1316             dz32             = _mm_sub_ps(iz3,jz2);
1317             dx33             = _mm_sub_ps(ix3,jx3);
1318             dy33             = _mm_sub_ps(iy3,jy3);
1319             dz33             = _mm_sub_ps(iz3,jz3);
1320
1321             /* Calculate squared distance and things based on it */
1322             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1323             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1324             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1325             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1326             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1327             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1328             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1329             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1330             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1331             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1332
1333             rinv00           = sse41_invsqrt_f(rsq00);
1334             rinv11           = sse41_invsqrt_f(rsq11);
1335             rinv12           = sse41_invsqrt_f(rsq12);
1336             rinv13           = sse41_invsqrt_f(rsq13);
1337             rinv21           = sse41_invsqrt_f(rsq21);
1338             rinv22           = sse41_invsqrt_f(rsq22);
1339             rinv23           = sse41_invsqrt_f(rsq23);
1340             rinv31           = sse41_invsqrt_f(rsq31);
1341             rinv32           = sse41_invsqrt_f(rsq32);
1342             rinv33           = sse41_invsqrt_f(rsq33);
1343
1344             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1345             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1346             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1347             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1348             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1349             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1350             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1351             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1352             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1353
1354             fjx0             = _mm_setzero_ps();
1355             fjy0             = _mm_setzero_ps();
1356             fjz0             = _mm_setzero_ps();
1357             fjx1             = _mm_setzero_ps();
1358             fjy1             = _mm_setzero_ps();
1359             fjz1             = _mm_setzero_ps();
1360             fjx2             = _mm_setzero_ps();
1361             fjy2             = _mm_setzero_ps();
1362             fjz2             = _mm_setzero_ps();
1363             fjx3             = _mm_setzero_ps();
1364             fjy3             = _mm_setzero_ps();
1365             fjz3             = _mm_setzero_ps();
1366
1367             /**************************
1368              * CALCULATE INTERACTIONS *
1369              **************************/
1370
1371             r00              = _mm_mul_ps(rsq00,rinv00);
1372
1373             /* Calculate table index by multiplying r with table scale and truncate to integer */
1374             rt               = _mm_mul_ps(r00,vftabscale);
1375             vfitab           = _mm_cvttps_epi32(rt);
1376             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1377             vfitab           = _mm_slli_epi32(vfitab,3);
1378
1379             /* CUBIC SPLINE TABLE DISPERSION */
1380             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1381             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1382             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1383             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1384             _MM_TRANSPOSE4_PS(Y,F,G,H);
1385             Heps             = _mm_mul_ps(vfeps,H);
1386             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1387             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1388             fvdw6            = _mm_mul_ps(c6_00,FF);
1389
1390             /* CUBIC SPLINE TABLE REPULSION */
1391             vfitab           = _mm_add_epi32(vfitab,ifour);
1392             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1393             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1394             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1395             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1396             _MM_TRANSPOSE4_PS(Y,F,G,H);
1397             Heps             = _mm_mul_ps(vfeps,H);
1398             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1399             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1400             fvdw12           = _mm_mul_ps(c12_00,FF);
1401             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1402
1403             fscal            = fvdw;
1404
1405             /* Calculate temporary vectorial force */
1406             tx               = _mm_mul_ps(fscal,dx00);
1407             ty               = _mm_mul_ps(fscal,dy00);
1408             tz               = _mm_mul_ps(fscal,dz00);
1409
1410             /* Update vectorial force */
1411             fix0             = _mm_add_ps(fix0,tx);
1412             fiy0             = _mm_add_ps(fiy0,ty);
1413             fiz0             = _mm_add_ps(fiz0,tz);
1414
1415             fjx0             = _mm_add_ps(fjx0,tx);
1416             fjy0             = _mm_add_ps(fjy0,ty);
1417             fjz0             = _mm_add_ps(fjz0,tz);
1418
1419             /**************************
1420              * CALCULATE INTERACTIONS *
1421              **************************/
1422
1423             /* COULOMB ELECTROSTATICS */
1424             velec            = _mm_mul_ps(qq11,rinv11);
1425             felec            = _mm_mul_ps(velec,rinvsq11);
1426
1427             fscal            = felec;
1428
1429             /* Calculate temporary vectorial force */
1430             tx               = _mm_mul_ps(fscal,dx11);
1431             ty               = _mm_mul_ps(fscal,dy11);
1432             tz               = _mm_mul_ps(fscal,dz11);
1433
1434             /* Update vectorial force */
1435             fix1             = _mm_add_ps(fix1,tx);
1436             fiy1             = _mm_add_ps(fiy1,ty);
1437             fiz1             = _mm_add_ps(fiz1,tz);
1438
1439             fjx1             = _mm_add_ps(fjx1,tx);
1440             fjy1             = _mm_add_ps(fjy1,ty);
1441             fjz1             = _mm_add_ps(fjz1,tz);
1442
1443             /**************************
1444              * CALCULATE INTERACTIONS *
1445              **************************/
1446
1447             /* COULOMB ELECTROSTATICS */
1448             velec            = _mm_mul_ps(qq12,rinv12);
1449             felec            = _mm_mul_ps(velec,rinvsq12);
1450
1451             fscal            = felec;
1452
1453             /* Calculate temporary vectorial force */
1454             tx               = _mm_mul_ps(fscal,dx12);
1455             ty               = _mm_mul_ps(fscal,dy12);
1456             tz               = _mm_mul_ps(fscal,dz12);
1457
1458             /* Update vectorial force */
1459             fix1             = _mm_add_ps(fix1,tx);
1460             fiy1             = _mm_add_ps(fiy1,ty);
1461             fiz1             = _mm_add_ps(fiz1,tz);
1462
1463             fjx2             = _mm_add_ps(fjx2,tx);
1464             fjy2             = _mm_add_ps(fjy2,ty);
1465             fjz2             = _mm_add_ps(fjz2,tz);
1466
1467             /**************************
1468              * CALCULATE INTERACTIONS *
1469              **************************/
1470
1471             /* COULOMB ELECTROSTATICS */
1472             velec            = _mm_mul_ps(qq13,rinv13);
1473             felec            = _mm_mul_ps(velec,rinvsq13);
1474
1475             fscal            = felec;
1476
1477             /* Calculate temporary vectorial force */
1478             tx               = _mm_mul_ps(fscal,dx13);
1479             ty               = _mm_mul_ps(fscal,dy13);
1480             tz               = _mm_mul_ps(fscal,dz13);
1481
1482             /* Update vectorial force */
1483             fix1             = _mm_add_ps(fix1,tx);
1484             fiy1             = _mm_add_ps(fiy1,ty);
1485             fiz1             = _mm_add_ps(fiz1,tz);
1486
1487             fjx3             = _mm_add_ps(fjx3,tx);
1488             fjy3             = _mm_add_ps(fjy3,ty);
1489             fjz3             = _mm_add_ps(fjz3,tz);
1490
1491             /**************************
1492              * CALCULATE INTERACTIONS *
1493              **************************/
1494
1495             /* COULOMB ELECTROSTATICS */
1496             velec            = _mm_mul_ps(qq21,rinv21);
1497             felec            = _mm_mul_ps(velec,rinvsq21);
1498
1499             fscal            = felec;
1500
1501             /* Calculate temporary vectorial force */
1502             tx               = _mm_mul_ps(fscal,dx21);
1503             ty               = _mm_mul_ps(fscal,dy21);
1504             tz               = _mm_mul_ps(fscal,dz21);
1505
1506             /* Update vectorial force */
1507             fix2             = _mm_add_ps(fix2,tx);
1508             fiy2             = _mm_add_ps(fiy2,ty);
1509             fiz2             = _mm_add_ps(fiz2,tz);
1510
1511             fjx1             = _mm_add_ps(fjx1,tx);
1512             fjy1             = _mm_add_ps(fjy1,ty);
1513             fjz1             = _mm_add_ps(fjz1,tz);
1514
1515             /**************************
1516              * CALCULATE INTERACTIONS *
1517              **************************/
1518
1519             /* COULOMB ELECTROSTATICS */
1520             velec            = _mm_mul_ps(qq22,rinv22);
1521             felec            = _mm_mul_ps(velec,rinvsq22);
1522
1523             fscal            = felec;
1524
1525             /* Calculate temporary vectorial force */
1526             tx               = _mm_mul_ps(fscal,dx22);
1527             ty               = _mm_mul_ps(fscal,dy22);
1528             tz               = _mm_mul_ps(fscal,dz22);
1529
1530             /* Update vectorial force */
1531             fix2             = _mm_add_ps(fix2,tx);
1532             fiy2             = _mm_add_ps(fiy2,ty);
1533             fiz2             = _mm_add_ps(fiz2,tz);
1534
1535             fjx2             = _mm_add_ps(fjx2,tx);
1536             fjy2             = _mm_add_ps(fjy2,ty);
1537             fjz2             = _mm_add_ps(fjz2,tz);
1538
1539             /**************************
1540              * CALCULATE INTERACTIONS *
1541              **************************/
1542
1543             /* COULOMB ELECTROSTATICS */
1544             velec            = _mm_mul_ps(qq23,rinv23);
1545             felec            = _mm_mul_ps(velec,rinvsq23);
1546
1547             fscal            = felec;
1548
1549             /* Calculate temporary vectorial force */
1550             tx               = _mm_mul_ps(fscal,dx23);
1551             ty               = _mm_mul_ps(fscal,dy23);
1552             tz               = _mm_mul_ps(fscal,dz23);
1553
1554             /* Update vectorial force */
1555             fix2             = _mm_add_ps(fix2,tx);
1556             fiy2             = _mm_add_ps(fiy2,ty);
1557             fiz2             = _mm_add_ps(fiz2,tz);
1558
1559             fjx3             = _mm_add_ps(fjx3,tx);
1560             fjy3             = _mm_add_ps(fjy3,ty);
1561             fjz3             = _mm_add_ps(fjz3,tz);
1562
1563             /**************************
1564              * CALCULATE INTERACTIONS *
1565              **************************/
1566
1567             /* COULOMB ELECTROSTATICS */
1568             velec            = _mm_mul_ps(qq31,rinv31);
1569             felec            = _mm_mul_ps(velec,rinvsq31);
1570
1571             fscal            = felec;
1572
1573             /* Calculate temporary vectorial force */
1574             tx               = _mm_mul_ps(fscal,dx31);
1575             ty               = _mm_mul_ps(fscal,dy31);
1576             tz               = _mm_mul_ps(fscal,dz31);
1577
1578             /* Update vectorial force */
1579             fix3             = _mm_add_ps(fix3,tx);
1580             fiy3             = _mm_add_ps(fiy3,ty);
1581             fiz3             = _mm_add_ps(fiz3,tz);
1582
1583             fjx1             = _mm_add_ps(fjx1,tx);
1584             fjy1             = _mm_add_ps(fjy1,ty);
1585             fjz1             = _mm_add_ps(fjz1,tz);
1586
1587             /**************************
1588              * CALCULATE INTERACTIONS *
1589              **************************/
1590
1591             /* COULOMB ELECTROSTATICS */
1592             velec            = _mm_mul_ps(qq32,rinv32);
1593             felec            = _mm_mul_ps(velec,rinvsq32);
1594
1595             fscal            = felec;
1596
1597             /* Calculate temporary vectorial force */
1598             tx               = _mm_mul_ps(fscal,dx32);
1599             ty               = _mm_mul_ps(fscal,dy32);
1600             tz               = _mm_mul_ps(fscal,dz32);
1601
1602             /* Update vectorial force */
1603             fix3             = _mm_add_ps(fix3,tx);
1604             fiy3             = _mm_add_ps(fiy3,ty);
1605             fiz3             = _mm_add_ps(fiz3,tz);
1606
1607             fjx2             = _mm_add_ps(fjx2,tx);
1608             fjy2             = _mm_add_ps(fjy2,ty);
1609             fjz2             = _mm_add_ps(fjz2,tz);
1610
1611             /**************************
1612              * CALCULATE INTERACTIONS *
1613              **************************/
1614
1615             /* COULOMB ELECTROSTATICS */
1616             velec            = _mm_mul_ps(qq33,rinv33);
1617             felec            = _mm_mul_ps(velec,rinvsq33);
1618
1619             fscal            = felec;
1620
1621             /* Calculate temporary vectorial force */
1622             tx               = _mm_mul_ps(fscal,dx33);
1623             ty               = _mm_mul_ps(fscal,dy33);
1624             tz               = _mm_mul_ps(fscal,dz33);
1625
1626             /* Update vectorial force */
1627             fix3             = _mm_add_ps(fix3,tx);
1628             fiy3             = _mm_add_ps(fiy3,ty);
1629             fiz3             = _mm_add_ps(fiz3,tz);
1630
1631             fjx3             = _mm_add_ps(fjx3,tx);
1632             fjy3             = _mm_add_ps(fjy3,ty);
1633             fjz3             = _mm_add_ps(fjz3,tz);
1634
1635             fjptrA             = f+j_coord_offsetA;
1636             fjptrB             = f+j_coord_offsetB;
1637             fjptrC             = f+j_coord_offsetC;
1638             fjptrD             = f+j_coord_offsetD;
1639
1640             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1641                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1642                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1643
1644             /* Inner loop uses 294 flops */
1645         }
1646
1647         if(jidx<j_index_end)
1648         {
1649
1650             /* Get j neighbor index, and coordinate index */
1651             jnrlistA         = jjnr[jidx];
1652             jnrlistB         = jjnr[jidx+1];
1653             jnrlistC         = jjnr[jidx+2];
1654             jnrlistD         = jjnr[jidx+3];
1655             /* Sign of each element will be negative for non-real atoms.
1656              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1657              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1658              */
1659             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1660             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1661             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1662             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1663             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1664             j_coord_offsetA  = DIM*jnrA;
1665             j_coord_offsetB  = DIM*jnrB;
1666             j_coord_offsetC  = DIM*jnrC;
1667             j_coord_offsetD  = DIM*jnrD;
1668
1669             /* load j atom coordinates */
1670             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1671                                               x+j_coord_offsetC,x+j_coord_offsetD,
1672                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1673                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1674
1675             /* Calculate displacement vector */
1676             dx00             = _mm_sub_ps(ix0,jx0);
1677             dy00             = _mm_sub_ps(iy0,jy0);
1678             dz00             = _mm_sub_ps(iz0,jz0);
1679             dx11             = _mm_sub_ps(ix1,jx1);
1680             dy11             = _mm_sub_ps(iy1,jy1);
1681             dz11             = _mm_sub_ps(iz1,jz1);
1682             dx12             = _mm_sub_ps(ix1,jx2);
1683             dy12             = _mm_sub_ps(iy1,jy2);
1684             dz12             = _mm_sub_ps(iz1,jz2);
1685             dx13             = _mm_sub_ps(ix1,jx3);
1686             dy13             = _mm_sub_ps(iy1,jy3);
1687             dz13             = _mm_sub_ps(iz1,jz3);
1688             dx21             = _mm_sub_ps(ix2,jx1);
1689             dy21             = _mm_sub_ps(iy2,jy1);
1690             dz21             = _mm_sub_ps(iz2,jz1);
1691             dx22             = _mm_sub_ps(ix2,jx2);
1692             dy22             = _mm_sub_ps(iy2,jy2);
1693             dz22             = _mm_sub_ps(iz2,jz2);
1694             dx23             = _mm_sub_ps(ix2,jx3);
1695             dy23             = _mm_sub_ps(iy2,jy3);
1696             dz23             = _mm_sub_ps(iz2,jz3);
1697             dx31             = _mm_sub_ps(ix3,jx1);
1698             dy31             = _mm_sub_ps(iy3,jy1);
1699             dz31             = _mm_sub_ps(iz3,jz1);
1700             dx32             = _mm_sub_ps(ix3,jx2);
1701             dy32             = _mm_sub_ps(iy3,jy2);
1702             dz32             = _mm_sub_ps(iz3,jz2);
1703             dx33             = _mm_sub_ps(ix3,jx3);
1704             dy33             = _mm_sub_ps(iy3,jy3);
1705             dz33             = _mm_sub_ps(iz3,jz3);
1706
1707             /* Calculate squared distance and things based on it */
1708             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1709             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1710             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1711             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1712             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1713             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1714             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1715             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1716             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1717             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1718
1719             rinv00           = sse41_invsqrt_f(rsq00);
1720             rinv11           = sse41_invsqrt_f(rsq11);
1721             rinv12           = sse41_invsqrt_f(rsq12);
1722             rinv13           = sse41_invsqrt_f(rsq13);
1723             rinv21           = sse41_invsqrt_f(rsq21);
1724             rinv22           = sse41_invsqrt_f(rsq22);
1725             rinv23           = sse41_invsqrt_f(rsq23);
1726             rinv31           = sse41_invsqrt_f(rsq31);
1727             rinv32           = sse41_invsqrt_f(rsq32);
1728             rinv33           = sse41_invsqrt_f(rsq33);
1729
1730             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1731             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1732             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1733             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1734             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1735             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1736             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1737             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1738             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1739
1740             fjx0             = _mm_setzero_ps();
1741             fjy0             = _mm_setzero_ps();
1742             fjz0             = _mm_setzero_ps();
1743             fjx1             = _mm_setzero_ps();
1744             fjy1             = _mm_setzero_ps();
1745             fjz1             = _mm_setzero_ps();
1746             fjx2             = _mm_setzero_ps();
1747             fjy2             = _mm_setzero_ps();
1748             fjz2             = _mm_setzero_ps();
1749             fjx3             = _mm_setzero_ps();
1750             fjy3             = _mm_setzero_ps();
1751             fjz3             = _mm_setzero_ps();
1752
1753             /**************************
1754              * CALCULATE INTERACTIONS *
1755              **************************/
1756
1757             r00              = _mm_mul_ps(rsq00,rinv00);
1758             r00              = _mm_andnot_ps(dummy_mask,r00);
1759
1760             /* Calculate table index by multiplying r with table scale and truncate to integer */
1761             rt               = _mm_mul_ps(r00,vftabscale);
1762             vfitab           = _mm_cvttps_epi32(rt);
1763             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1764             vfitab           = _mm_slli_epi32(vfitab,3);
1765
1766             /* CUBIC SPLINE TABLE DISPERSION */
1767             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1768             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1769             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1770             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1771             _MM_TRANSPOSE4_PS(Y,F,G,H);
1772             Heps             = _mm_mul_ps(vfeps,H);
1773             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1774             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1775             fvdw6            = _mm_mul_ps(c6_00,FF);
1776
1777             /* CUBIC SPLINE TABLE REPULSION */
1778             vfitab           = _mm_add_epi32(vfitab,ifour);
1779             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1780             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1781             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1782             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1783             _MM_TRANSPOSE4_PS(Y,F,G,H);
1784             Heps             = _mm_mul_ps(vfeps,H);
1785             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1786             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1787             fvdw12           = _mm_mul_ps(c12_00,FF);
1788             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1789
1790             fscal            = fvdw;
1791
1792             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1793
1794             /* Calculate temporary vectorial force */
1795             tx               = _mm_mul_ps(fscal,dx00);
1796             ty               = _mm_mul_ps(fscal,dy00);
1797             tz               = _mm_mul_ps(fscal,dz00);
1798
1799             /* Update vectorial force */
1800             fix0             = _mm_add_ps(fix0,tx);
1801             fiy0             = _mm_add_ps(fiy0,ty);
1802             fiz0             = _mm_add_ps(fiz0,tz);
1803
1804             fjx0             = _mm_add_ps(fjx0,tx);
1805             fjy0             = _mm_add_ps(fjy0,ty);
1806             fjz0             = _mm_add_ps(fjz0,tz);
1807
1808             /**************************
1809              * CALCULATE INTERACTIONS *
1810              **************************/
1811
1812             /* COULOMB ELECTROSTATICS */
1813             velec            = _mm_mul_ps(qq11,rinv11);
1814             felec            = _mm_mul_ps(velec,rinvsq11);
1815
1816             fscal            = felec;
1817
1818             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1819
1820             /* Calculate temporary vectorial force */
1821             tx               = _mm_mul_ps(fscal,dx11);
1822             ty               = _mm_mul_ps(fscal,dy11);
1823             tz               = _mm_mul_ps(fscal,dz11);
1824
1825             /* Update vectorial force */
1826             fix1             = _mm_add_ps(fix1,tx);
1827             fiy1             = _mm_add_ps(fiy1,ty);
1828             fiz1             = _mm_add_ps(fiz1,tz);
1829
1830             fjx1             = _mm_add_ps(fjx1,tx);
1831             fjy1             = _mm_add_ps(fjy1,ty);
1832             fjz1             = _mm_add_ps(fjz1,tz);
1833
1834             /**************************
1835              * CALCULATE INTERACTIONS *
1836              **************************/
1837
1838             /* COULOMB ELECTROSTATICS */
1839             velec            = _mm_mul_ps(qq12,rinv12);
1840             felec            = _mm_mul_ps(velec,rinvsq12);
1841
1842             fscal            = felec;
1843
1844             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1845
1846             /* Calculate temporary vectorial force */
1847             tx               = _mm_mul_ps(fscal,dx12);
1848             ty               = _mm_mul_ps(fscal,dy12);
1849             tz               = _mm_mul_ps(fscal,dz12);
1850
1851             /* Update vectorial force */
1852             fix1             = _mm_add_ps(fix1,tx);
1853             fiy1             = _mm_add_ps(fiy1,ty);
1854             fiz1             = _mm_add_ps(fiz1,tz);
1855
1856             fjx2             = _mm_add_ps(fjx2,tx);
1857             fjy2             = _mm_add_ps(fjy2,ty);
1858             fjz2             = _mm_add_ps(fjz2,tz);
1859
1860             /**************************
1861              * CALCULATE INTERACTIONS *
1862              **************************/
1863
1864             /* COULOMB ELECTROSTATICS */
1865             velec            = _mm_mul_ps(qq13,rinv13);
1866             felec            = _mm_mul_ps(velec,rinvsq13);
1867
1868             fscal            = felec;
1869
1870             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1871
1872             /* Calculate temporary vectorial force */
1873             tx               = _mm_mul_ps(fscal,dx13);
1874             ty               = _mm_mul_ps(fscal,dy13);
1875             tz               = _mm_mul_ps(fscal,dz13);
1876
1877             /* Update vectorial force */
1878             fix1             = _mm_add_ps(fix1,tx);
1879             fiy1             = _mm_add_ps(fiy1,ty);
1880             fiz1             = _mm_add_ps(fiz1,tz);
1881
1882             fjx3             = _mm_add_ps(fjx3,tx);
1883             fjy3             = _mm_add_ps(fjy3,ty);
1884             fjz3             = _mm_add_ps(fjz3,tz);
1885
1886             /**************************
1887              * CALCULATE INTERACTIONS *
1888              **************************/
1889
1890             /* COULOMB ELECTROSTATICS */
1891             velec            = _mm_mul_ps(qq21,rinv21);
1892             felec            = _mm_mul_ps(velec,rinvsq21);
1893
1894             fscal            = felec;
1895
1896             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1897
1898             /* Calculate temporary vectorial force */
1899             tx               = _mm_mul_ps(fscal,dx21);
1900             ty               = _mm_mul_ps(fscal,dy21);
1901             tz               = _mm_mul_ps(fscal,dz21);
1902
1903             /* Update vectorial force */
1904             fix2             = _mm_add_ps(fix2,tx);
1905             fiy2             = _mm_add_ps(fiy2,ty);
1906             fiz2             = _mm_add_ps(fiz2,tz);
1907
1908             fjx1             = _mm_add_ps(fjx1,tx);
1909             fjy1             = _mm_add_ps(fjy1,ty);
1910             fjz1             = _mm_add_ps(fjz1,tz);
1911
1912             /**************************
1913              * CALCULATE INTERACTIONS *
1914              **************************/
1915
1916             /* COULOMB ELECTROSTATICS */
1917             velec            = _mm_mul_ps(qq22,rinv22);
1918             felec            = _mm_mul_ps(velec,rinvsq22);
1919
1920             fscal            = felec;
1921
1922             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1923
1924             /* Calculate temporary vectorial force */
1925             tx               = _mm_mul_ps(fscal,dx22);
1926             ty               = _mm_mul_ps(fscal,dy22);
1927             tz               = _mm_mul_ps(fscal,dz22);
1928
1929             /* Update vectorial force */
1930             fix2             = _mm_add_ps(fix2,tx);
1931             fiy2             = _mm_add_ps(fiy2,ty);
1932             fiz2             = _mm_add_ps(fiz2,tz);
1933
1934             fjx2             = _mm_add_ps(fjx2,tx);
1935             fjy2             = _mm_add_ps(fjy2,ty);
1936             fjz2             = _mm_add_ps(fjz2,tz);
1937
1938             /**************************
1939              * CALCULATE INTERACTIONS *
1940              **************************/
1941
1942             /* COULOMB ELECTROSTATICS */
1943             velec            = _mm_mul_ps(qq23,rinv23);
1944             felec            = _mm_mul_ps(velec,rinvsq23);
1945
1946             fscal            = felec;
1947
1948             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1949
1950             /* Calculate temporary vectorial force */
1951             tx               = _mm_mul_ps(fscal,dx23);
1952             ty               = _mm_mul_ps(fscal,dy23);
1953             tz               = _mm_mul_ps(fscal,dz23);
1954
1955             /* Update vectorial force */
1956             fix2             = _mm_add_ps(fix2,tx);
1957             fiy2             = _mm_add_ps(fiy2,ty);
1958             fiz2             = _mm_add_ps(fiz2,tz);
1959
1960             fjx3             = _mm_add_ps(fjx3,tx);
1961             fjy3             = _mm_add_ps(fjy3,ty);
1962             fjz3             = _mm_add_ps(fjz3,tz);
1963
1964             /**************************
1965              * CALCULATE INTERACTIONS *
1966              **************************/
1967
1968             /* COULOMB ELECTROSTATICS */
1969             velec            = _mm_mul_ps(qq31,rinv31);
1970             felec            = _mm_mul_ps(velec,rinvsq31);
1971
1972             fscal            = felec;
1973
1974             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1975
1976             /* Calculate temporary vectorial force */
1977             tx               = _mm_mul_ps(fscal,dx31);
1978             ty               = _mm_mul_ps(fscal,dy31);
1979             tz               = _mm_mul_ps(fscal,dz31);
1980
1981             /* Update vectorial force */
1982             fix3             = _mm_add_ps(fix3,tx);
1983             fiy3             = _mm_add_ps(fiy3,ty);
1984             fiz3             = _mm_add_ps(fiz3,tz);
1985
1986             fjx1             = _mm_add_ps(fjx1,tx);
1987             fjy1             = _mm_add_ps(fjy1,ty);
1988             fjz1             = _mm_add_ps(fjz1,tz);
1989
1990             /**************************
1991              * CALCULATE INTERACTIONS *
1992              **************************/
1993
1994             /* COULOMB ELECTROSTATICS */
1995             velec            = _mm_mul_ps(qq32,rinv32);
1996             felec            = _mm_mul_ps(velec,rinvsq32);
1997
1998             fscal            = felec;
1999
2000             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2001
2002             /* Calculate temporary vectorial force */
2003             tx               = _mm_mul_ps(fscal,dx32);
2004             ty               = _mm_mul_ps(fscal,dy32);
2005             tz               = _mm_mul_ps(fscal,dz32);
2006
2007             /* Update vectorial force */
2008             fix3             = _mm_add_ps(fix3,tx);
2009             fiy3             = _mm_add_ps(fiy3,ty);
2010             fiz3             = _mm_add_ps(fiz3,tz);
2011
2012             fjx2             = _mm_add_ps(fjx2,tx);
2013             fjy2             = _mm_add_ps(fjy2,ty);
2014             fjz2             = _mm_add_ps(fjz2,tz);
2015
2016             /**************************
2017              * CALCULATE INTERACTIONS *
2018              **************************/
2019
2020             /* COULOMB ELECTROSTATICS */
2021             velec            = _mm_mul_ps(qq33,rinv33);
2022             felec            = _mm_mul_ps(velec,rinvsq33);
2023
2024             fscal            = felec;
2025
2026             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2027
2028             /* Calculate temporary vectorial force */
2029             tx               = _mm_mul_ps(fscal,dx33);
2030             ty               = _mm_mul_ps(fscal,dy33);
2031             tz               = _mm_mul_ps(fscal,dz33);
2032
2033             /* Update vectorial force */
2034             fix3             = _mm_add_ps(fix3,tx);
2035             fiy3             = _mm_add_ps(fiy3,ty);
2036             fiz3             = _mm_add_ps(fiz3,tz);
2037
2038             fjx3             = _mm_add_ps(fjx3,tx);
2039             fjy3             = _mm_add_ps(fjy3,ty);
2040             fjz3             = _mm_add_ps(fjz3,tz);
2041
2042             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2043             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2044             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2045             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2046
2047             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2048                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2049                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2050
2051             /* Inner loop uses 295 flops */
2052         }
2053
2054         /* End of innermost loop */
2055
2056         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2057                                               f+i_coord_offset,fshift+i_shift_offset);
2058
2059         /* Increment number of inner iterations */
2060         inneriter                  += j_index_end - j_index_start;
2061
2062         /* Outer loop uses 24 flops */
2063     }
2064
2065     /* Increment number of outer iterations */
2066     outeriter        += nri;
2067
2068     /* Update outer/inner flops */
2069
2070     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*295);
2071 }