Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_sse4_1_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_single
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            LJEwald
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwioffset3;
89     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
97     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
109     real             *charge;
110     int              nvdwtype;
111     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112     int              *vdwtype;
113     real             *vdwparam;
114     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
115     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
116     __m128           c6grid_00;
117     __m128           c6grid_11;
118     __m128           c6grid_12;
119     __m128           c6grid_13;
120     __m128           c6grid_21;
121     __m128           c6grid_22;
122     __m128           c6grid_23;
123     __m128           c6grid_31;
124     __m128           c6grid_32;
125     __m128           c6grid_33;
126     __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
127     real             *vdwgridparam;
128     __m128           one_half  = _mm_set1_ps(0.5);
129     __m128           minus_one = _mm_set1_ps(-1.0);
130     __m128i          ewitab;
131     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
132     real             *ewtab;
133     __m128           dummy_mask,cutoff_mask;
134     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
135     __m128           one     = _mm_set1_ps(1.0);
136     __m128           two     = _mm_set1_ps(2.0);
137     x                = xx[0];
138     f                = ff[0];
139
140     nri              = nlist->nri;
141     iinr             = nlist->iinr;
142     jindex           = nlist->jindex;
143     jjnr             = nlist->jjnr;
144     shiftidx         = nlist->shift;
145     gid              = nlist->gid;
146     shiftvec         = fr->shift_vec[0];
147     fshift           = fr->fshift[0];
148     facel            = _mm_set1_ps(fr->ic->epsfac);
149     charge           = mdatoms->chargeA;
150     nvdwtype         = fr->ntype;
151     vdwparam         = fr->nbfp;
152     vdwtype          = mdatoms->typeA;
153     vdwgridparam     = fr->ljpme_c6grid;
154     sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
155     ewclj            = _mm_set1_ps(fr->ic->ewaldcoeff_lj);
156     ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
157
158     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
159     ewtab            = fr->ic->tabq_coul_FDV0;
160     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
161     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
162
163     /* Setup water-specific parameters */
164     inr              = nlist->iinr[0];
165     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
166     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
167     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
168     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
169
170     jq1              = _mm_set1_ps(charge[inr+1]);
171     jq2              = _mm_set1_ps(charge[inr+2]);
172     jq3              = _mm_set1_ps(charge[inr+3]);
173     vdwjidx0A        = 2*vdwtype[inr+0];
174     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
175     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
176     c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
177     qq11             = _mm_mul_ps(iq1,jq1);
178     qq12             = _mm_mul_ps(iq1,jq2);
179     qq13             = _mm_mul_ps(iq1,jq3);
180     qq21             = _mm_mul_ps(iq2,jq1);
181     qq22             = _mm_mul_ps(iq2,jq2);
182     qq23             = _mm_mul_ps(iq2,jq3);
183     qq31             = _mm_mul_ps(iq3,jq1);
184     qq32             = _mm_mul_ps(iq3,jq2);
185     qq33             = _mm_mul_ps(iq3,jq3);
186
187     /* Avoid stupid compiler warnings */
188     jnrA = jnrB = jnrC = jnrD = 0;
189     j_coord_offsetA = 0;
190     j_coord_offsetB = 0;
191     j_coord_offsetC = 0;
192     j_coord_offsetD = 0;
193
194     outeriter        = 0;
195     inneriter        = 0;
196
197     for(iidx=0;iidx<4*DIM;iidx++)
198     {
199         scratch[iidx] = 0.0;
200     }
201
202     /* Start outer loop over neighborlists */
203     for(iidx=0; iidx<nri; iidx++)
204     {
205         /* Load shift vector for this list */
206         i_shift_offset   = DIM*shiftidx[iidx];
207
208         /* Load limits for loop over neighbors */
209         j_index_start    = jindex[iidx];
210         j_index_end      = jindex[iidx+1];
211
212         /* Get outer coordinate index */
213         inr              = iinr[iidx];
214         i_coord_offset   = DIM*inr;
215
216         /* Load i particle coords and add shift vector */
217         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
218                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
219
220         fix0             = _mm_setzero_ps();
221         fiy0             = _mm_setzero_ps();
222         fiz0             = _mm_setzero_ps();
223         fix1             = _mm_setzero_ps();
224         fiy1             = _mm_setzero_ps();
225         fiz1             = _mm_setzero_ps();
226         fix2             = _mm_setzero_ps();
227         fiy2             = _mm_setzero_ps();
228         fiz2             = _mm_setzero_ps();
229         fix3             = _mm_setzero_ps();
230         fiy3             = _mm_setzero_ps();
231         fiz3             = _mm_setzero_ps();
232
233         /* Reset potential sums */
234         velecsum         = _mm_setzero_ps();
235         vvdwsum          = _mm_setzero_ps();
236
237         /* Start inner kernel loop */
238         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
239         {
240
241             /* Get j neighbor index, and coordinate index */
242             jnrA             = jjnr[jidx];
243             jnrB             = jjnr[jidx+1];
244             jnrC             = jjnr[jidx+2];
245             jnrD             = jjnr[jidx+3];
246             j_coord_offsetA  = DIM*jnrA;
247             j_coord_offsetB  = DIM*jnrB;
248             j_coord_offsetC  = DIM*jnrC;
249             j_coord_offsetD  = DIM*jnrD;
250
251             /* load j atom coordinates */
252             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
253                                               x+j_coord_offsetC,x+j_coord_offsetD,
254                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
255                                               &jy2,&jz2,&jx3,&jy3,&jz3);
256
257             /* Calculate displacement vector */
258             dx00             = _mm_sub_ps(ix0,jx0);
259             dy00             = _mm_sub_ps(iy0,jy0);
260             dz00             = _mm_sub_ps(iz0,jz0);
261             dx11             = _mm_sub_ps(ix1,jx1);
262             dy11             = _mm_sub_ps(iy1,jy1);
263             dz11             = _mm_sub_ps(iz1,jz1);
264             dx12             = _mm_sub_ps(ix1,jx2);
265             dy12             = _mm_sub_ps(iy1,jy2);
266             dz12             = _mm_sub_ps(iz1,jz2);
267             dx13             = _mm_sub_ps(ix1,jx3);
268             dy13             = _mm_sub_ps(iy1,jy3);
269             dz13             = _mm_sub_ps(iz1,jz3);
270             dx21             = _mm_sub_ps(ix2,jx1);
271             dy21             = _mm_sub_ps(iy2,jy1);
272             dz21             = _mm_sub_ps(iz2,jz1);
273             dx22             = _mm_sub_ps(ix2,jx2);
274             dy22             = _mm_sub_ps(iy2,jy2);
275             dz22             = _mm_sub_ps(iz2,jz2);
276             dx23             = _mm_sub_ps(ix2,jx3);
277             dy23             = _mm_sub_ps(iy2,jy3);
278             dz23             = _mm_sub_ps(iz2,jz3);
279             dx31             = _mm_sub_ps(ix3,jx1);
280             dy31             = _mm_sub_ps(iy3,jy1);
281             dz31             = _mm_sub_ps(iz3,jz1);
282             dx32             = _mm_sub_ps(ix3,jx2);
283             dy32             = _mm_sub_ps(iy3,jy2);
284             dz32             = _mm_sub_ps(iz3,jz2);
285             dx33             = _mm_sub_ps(ix3,jx3);
286             dy33             = _mm_sub_ps(iy3,jy3);
287             dz33             = _mm_sub_ps(iz3,jz3);
288
289             /* Calculate squared distance and things based on it */
290             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
291             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
292             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
293             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
294             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
295             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
296             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
297             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
298             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
299             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
300
301             rinv00           = sse41_invsqrt_f(rsq00);
302             rinv11           = sse41_invsqrt_f(rsq11);
303             rinv12           = sse41_invsqrt_f(rsq12);
304             rinv13           = sse41_invsqrt_f(rsq13);
305             rinv21           = sse41_invsqrt_f(rsq21);
306             rinv22           = sse41_invsqrt_f(rsq22);
307             rinv23           = sse41_invsqrt_f(rsq23);
308             rinv31           = sse41_invsqrt_f(rsq31);
309             rinv32           = sse41_invsqrt_f(rsq32);
310             rinv33           = sse41_invsqrt_f(rsq33);
311
312             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
313             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
314             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
315             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
316             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
317             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
318             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
319             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
320             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
321             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
322
323             fjx0             = _mm_setzero_ps();
324             fjy0             = _mm_setzero_ps();
325             fjz0             = _mm_setzero_ps();
326             fjx1             = _mm_setzero_ps();
327             fjy1             = _mm_setzero_ps();
328             fjz1             = _mm_setzero_ps();
329             fjx2             = _mm_setzero_ps();
330             fjy2             = _mm_setzero_ps();
331             fjz2             = _mm_setzero_ps();
332             fjx3             = _mm_setzero_ps();
333             fjy3             = _mm_setzero_ps();
334             fjz3             = _mm_setzero_ps();
335
336             /**************************
337              * CALCULATE INTERACTIONS *
338              **************************/
339
340             r00              = _mm_mul_ps(rsq00,rinv00);
341
342             /* Analytical LJ-PME */
343             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
344             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
345             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
346             exponent         = sse41_exp_f(ewcljrsq);
347             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
348             poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
349             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
350             vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
351             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
352             vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
353             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
354             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
355
356             /* Update potential sum for this i atom from the interaction with this j atom. */
357             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
358
359             fscal            = fvdw;
360
361             /* Calculate temporary vectorial force */
362             tx               = _mm_mul_ps(fscal,dx00);
363             ty               = _mm_mul_ps(fscal,dy00);
364             tz               = _mm_mul_ps(fscal,dz00);
365
366             /* Update vectorial force */
367             fix0             = _mm_add_ps(fix0,tx);
368             fiy0             = _mm_add_ps(fiy0,ty);
369             fiz0             = _mm_add_ps(fiz0,tz);
370
371             fjx0             = _mm_add_ps(fjx0,tx);
372             fjy0             = _mm_add_ps(fjy0,ty);
373             fjz0             = _mm_add_ps(fjz0,tz);
374
375             /**************************
376              * CALCULATE INTERACTIONS *
377              **************************/
378
379             r11              = _mm_mul_ps(rsq11,rinv11);
380
381             /* EWALD ELECTROSTATICS */
382
383             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
384             ewrt             = _mm_mul_ps(r11,ewtabscale);
385             ewitab           = _mm_cvttps_epi32(ewrt);
386             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
387             ewitab           = _mm_slli_epi32(ewitab,2);
388             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
389             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
390             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
391             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
392             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
393             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
394             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
395             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
396             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
397
398             /* Update potential sum for this i atom from the interaction with this j atom. */
399             velecsum         = _mm_add_ps(velecsum,velec);
400
401             fscal            = felec;
402
403             /* Calculate temporary vectorial force */
404             tx               = _mm_mul_ps(fscal,dx11);
405             ty               = _mm_mul_ps(fscal,dy11);
406             tz               = _mm_mul_ps(fscal,dz11);
407
408             /* Update vectorial force */
409             fix1             = _mm_add_ps(fix1,tx);
410             fiy1             = _mm_add_ps(fiy1,ty);
411             fiz1             = _mm_add_ps(fiz1,tz);
412
413             fjx1             = _mm_add_ps(fjx1,tx);
414             fjy1             = _mm_add_ps(fjy1,ty);
415             fjz1             = _mm_add_ps(fjz1,tz);
416
417             /**************************
418              * CALCULATE INTERACTIONS *
419              **************************/
420
421             r12              = _mm_mul_ps(rsq12,rinv12);
422
423             /* EWALD ELECTROSTATICS */
424
425             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
426             ewrt             = _mm_mul_ps(r12,ewtabscale);
427             ewitab           = _mm_cvttps_epi32(ewrt);
428             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
429             ewitab           = _mm_slli_epi32(ewitab,2);
430             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
431             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
432             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
433             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
434             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
435             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
436             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
437             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
438             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
439
440             /* Update potential sum for this i atom from the interaction with this j atom. */
441             velecsum         = _mm_add_ps(velecsum,velec);
442
443             fscal            = felec;
444
445             /* Calculate temporary vectorial force */
446             tx               = _mm_mul_ps(fscal,dx12);
447             ty               = _mm_mul_ps(fscal,dy12);
448             tz               = _mm_mul_ps(fscal,dz12);
449
450             /* Update vectorial force */
451             fix1             = _mm_add_ps(fix1,tx);
452             fiy1             = _mm_add_ps(fiy1,ty);
453             fiz1             = _mm_add_ps(fiz1,tz);
454
455             fjx2             = _mm_add_ps(fjx2,tx);
456             fjy2             = _mm_add_ps(fjy2,ty);
457             fjz2             = _mm_add_ps(fjz2,tz);
458
459             /**************************
460              * CALCULATE INTERACTIONS *
461              **************************/
462
463             r13              = _mm_mul_ps(rsq13,rinv13);
464
465             /* EWALD ELECTROSTATICS */
466
467             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
468             ewrt             = _mm_mul_ps(r13,ewtabscale);
469             ewitab           = _mm_cvttps_epi32(ewrt);
470             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
471             ewitab           = _mm_slli_epi32(ewitab,2);
472             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
473             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
474             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
475             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
476             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
477             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
478             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
479             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
480             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
481
482             /* Update potential sum for this i atom from the interaction with this j atom. */
483             velecsum         = _mm_add_ps(velecsum,velec);
484
485             fscal            = felec;
486
487             /* Calculate temporary vectorial force */
488             tx               = _mm_mul_ps(fscal,dx13);
489             ty               = _mm_mul_ps(fscal,dy13);
490             tz               = _mm_mul_ps(fscal,dz13);
491
492             /* Update vectorial force */
493             fix1             = _mm_add_ps(fix1,tx);
494             fiy1             = _mm_add_ps(fiy1,ty);
495             fiz1             = _mm_add_ps(fiz1,tz);
496
497             fjx3             = _mm_add_ps(fjx3,tx);
498             fjy3             = _mm_add_ps(fjy3,ty);
499             fjz3             = _mm_add_ps(fjz3,tz);
500
501             /**************************
502              * CALCULATE INTERACTIONS *
503              **************************/
504
505             r21              = _mm_mul_ps(rsq21,rinv21);
506
507             /* EWALD ELECTROSTATICS */
508
509             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
510             ewrt             = _mm_mul_ps(r21,ewtabscale);
511             ewitab           = _mm_cvttps_epi32(ewrt);
512             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
513             ewitab           = _mm_slli_epi32(ewitab,2);
514             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
515             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
516             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
517             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
518             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
519             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
520             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
521             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
522             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
523
524             /* Update potential sum for this i atom from the interaction with this j atom. */
525             velecsum         = _mm_add_ps(velecsum,velec);
526
527             fscal            = felec;
528
529             /* Calculate temporary vectorial force */
530             tx               = _mm_mul_ps(fscal,dx21);
531             ty               = _mm_mul_ps(fscal,dy21);
532             tz               = _mm_mul_ps(fscal,dz21);
533
534             /* Update vectorial force */
535             fix2             = _mm_add_ps(fix2,tx);
536             fiy2             = _mm_add_ps(fiy2,ty);
537             fiz2             = _mm_add_ps(fiz2,tz);
538
539             fjx1             = _mm_add_ps(fjx1,tx);
540             fjy1             = _mm_add_ps(fjy1,ty);
541             fjz1             = _mm_add_ps(fjz1,tz);
542
543             /**************************
544              * CALCULATE INTERACTIONS *
545              **************************/
546
547             r22              = _mm_mul_ps(rsq22,rinv22);
548
549             /* EWALD ELECTROSTATICS */
550
551             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
552             ewrt             = _mm_mul_ps(r22,ewtabscale);
553             ewitab           = _mm_cvttps_epi32(ewrt);
554             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
555             ewitab           = _mm_slli_epi32(ewitab,2);
556             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
557             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
558             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
559             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
560             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
561             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
562             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
563             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
564             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
565
566             /* Update potential sum for this i atom from the interaction with this j atom. */
567             velecsum         = _mm_add_ps(velecsum,velec);
568
569             fscal            = felec;
570
571             /* Calculate temporary vectorial force */
572             tx               = _mm_mul_ps(fscal,dx22);
573             ty               = _mm_mul_ps(fscal,dy22);
574             tz               = _mm_mul_ps(fscal,dz22);
575
576             /* Update vectorial force */
577             fix2             = _mm_add_ps(fix2,tx);
578             fiy2             = _mm_add_ps(fiy2,ty);
579             fiz2             = _mm_add_ps(fiz2,tz);
580
581             fjx2             = _mm_add_ps(fjx2,tx);
582             fjy2             = _mm_add_ps(fjy2,ty);
583             fjz2             = _mm_add_ps(fjz2,tz);
584
585             /**************************
586              * CALCULATE INTERACTIONS *
587              **************************/
588
589             r23              = _mm_mul_ps(rsq23,rinv23);
590
591             /* EWALD ELECTROSTATICS */
592
593             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
594             ewrt             = _mm_mul_ps(r23,ewtabscale);
595             ewitab           = _mm_cvttps_epi32(ewrt);
596             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
597             ewitab           = _mm_slli_epi32(ewitab,2);
598             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
599             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
600             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
601             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
602             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
603             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
604             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
605             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
606             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
607
608             /* Update potential sum for this i atom from the interaction with this j atom. */
609             velecsum         = _mm_add_ps(velecsum,velec);
610
611             fscal            = felec;
612
613             /* Calculate temporary vectorial force */
614             tx               = _mm_mul_ps(fscal,dx23);
615             ty               = _mm_mul_ps(fscal,dy23);
616             tz               = _mm_mul_ps(fscal,dz23);
617
618             /* Update vectorial force */
619             fix2             = _mm_add_ps(fix2,tx);
620             fiy2             = _mm_add_ps(fiy2,ty);
621             fiz2             = _mm_add_ps(fiz2,tz);
622
623             fjx3             = _mm_add_ps(fjx3,tx);
624             fjy3             = _mm_add_ps(fjy3,ty);
625             fjz3             = _mm_add_ps(fjz3,tz);
626
627             /**************************
628              * CALCULATE INTERACTIONS *
629              **************************/
630
631             r31              = _mm_mul_ps(rsq31,rinv31);
632
633             /* EWALD ELECTROSTATICS */
634
635             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
636             ewrt             = _mm_mul_ps(r31,ewtabscale);
637             ewitab           = _mm_cvttps_epi32(ewrt);
638             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
639             ewitab           = _mm_slli_epi32(ewitab,2);
640             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
641             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
642             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
643             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
644             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
645             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
646             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
647             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
648             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
649
650             /* Update potential sum for this i atom from the interaction with this j atom. */
651             velecsum         = _mm_add_ps(velecsum,velec);
652
653             fscal            = felec;
654
655             /* Calculate temporary vectorial force */
656             tx               = _mm_mul_ps(fscal,dx31);
657             ty               = _mm_mul_ps(fscal,dy31);
658             tz               = _mm_mul_ps(fscal,dz31);
659
660             /* Update vectorial force */
661             fix3             = _mm_add_ps(fix3,tx);
662             fiy3             = _mm_add_ps(fiy3,ty);
663             fiz3             = _mm_add_ps(fiz3,tz);
664
665             fjx1             = _mm_add_ps(fjx1,tx);
666             fjy1             = _mm_add_ps(fjy1,ty);
667             fjz1             = _mm_add_ps(fjz1,tz);
668
669             /**************************
670              * CALCULATE INTERACTIONS *
671              **************************/
672
673             r32              = _mm_mul_ps(rsq32,rinv32);
674
675             /* EWALD ELECTROSTATICS */
676
677             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
678             ewrt             = _mm_mul_ps(r32,ewtabscale);
679             ewitab           = _mm_cvttps_epi32(ewrt);
680             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
681             ewitab           = _mm_slli_epi32(ewitab,2);
682             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
683             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
684             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
685             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
686             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
687             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
688             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
689             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
690             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
691
692             /* Update potential sum for this i atom from the interaction with this j atom. */
693             velecsum         = _mm_add_ps(velecsum,velec);
694
695             fscal            = felec;
696
697             /* Calculate temporary vectorial force */
698             tx               = _mm_mul_ps(fscal,dx32);
699             ty               = _mm_mul_ps(fscal,dy32);
700             tz               = _mm_mul_ps(fscal,dz32);
701
702             /* Update vectorial force */
703             fix3             = _mm_add_ps(fix3,tx);
704             fiy3             = _mm_add_ps(fiy3,ty);
705             fiz3             = _mm_add_ps(fiz3,tz);
706
707             fjx2             = _mm_add_ps(fjx2,tx);
708             fjy2             = _mm_add_ps(fjy2,ty);
709             fjz2             = _mm_add_ps(fjz2,tz);
710
711             /**************************
712              * CALCULATE INTERACTIONS *
713              **************************/
714
715             r33              = _mm_mul_ps(rsq33,rinv33);
716
717             /* EWALD ELECTROSTATICS */
718
719             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
720             ewrt             = _mm_mul_ps(r33,ewtabscale);
721             ewitab           = _mm_cvttps_epi32(ewrt);
722             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
723             ewitab           = _mm_slli_epi32(ewitab,2);
724             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
725             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
726             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
727             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
728             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
729             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
730             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
731             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
732             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
733
734             /* Update potential sum for this i atom from the interaction with this j atom. */
735             velecsum         = _mm_add_ps(velecsum,velec);
736
737             fscal            = felec;
738
739             /* Calculate temporary vectorial force */
740             tx               = _mm_mul_ps(fscal,dx33);
741             ty               = _mm_mul_ps(fscal,dy33);
742             tz               = _mm_mul_ps(fscal,dz33);
743
744             /* Update vectorial force */
745             fix3             = _mm_add_ps(fix3,tx);
746             fiy3             = _mm_add_ps(fiy3,ty);
747             fiz3             = _mm_add_ps(fiz3,tz);
748
749             fjx3             = _mm_add_ps(fjx3,tx);
750             fjy3             = _mm_add_ps(fjy3,ty);
751             fjz3             = _mm_add_ps(fjz3,tz);
752
753             fjptrA             = f+j_coord_offsetA;
754             fjptrB             = f+j_coord_offsetB;
755             fjptrC             = f+j_coord_offsetC;
756             fjptrD             = f+j_coord_offsetD;
757
758             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
759                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
760                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
761
762             /* Inner loop uses 423 flops */
763         }
764
765         if(jidx<j_index_end)
766         {
767
768             /* Get j neighbor index, and coordinate index */
769             jnrlistA         = jjnr[jidx];
770             jnrlistB         = jjnr[jidx+1];
771             jnrlistC         = jjnr[jidx+2];
772             jnrlistD         = jjnr[jidx+3];
773             /* Sign of each element will be negative for non-real atoms.
774              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
775              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
776              */
777             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
778             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
779             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
780             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
781             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
782             j_coord_offsetA  = DIM*jnrA;
783             j_coord_offsetB  = DIM*jnrB;
784             j_coord_offsetC  = DIM*jnrC;
785             j_coord_offsetD  = DIM*jnrD;
786
787             /* load j atom coordinates */
788             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
789                                               x+j_coord_offsetC,x+j_coord_offsetD,
790                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
791                                               &jy2,&jz2,&jx3,&jy3,&jz3);
792
793             /* Calculate displacement vector */
794             dx00             = _mm_sub_ps(ix0,jx0);
795             dy00             = _mm_sub_ps(iy0,jy0);
796             dz00             = _mm_sub_ps(iz0,jz0);
797             dx11             = _mm_sub_ps(ix1,jx1);
798             dy11             = _mm_sub_ps(iy1,jy1);
799             dz11             = _mm_sub_ps(iz1,jz1);
800             dx12             = _mm_sub_ps(ix1,jx2);
801             dy12             = _mm_sub_ps(iy1,jy2);
802             dz12             = _mm_sub_ps(iz1,jz2);
803             dx13             = _mm_sub_ps(ix1,jx3);
804             dy13             = _mm_sub_ps(iy1,jy3);
805             dz13             = _mm_sub_ps(iz1,jz3);
806             dx21             = _mm_sub_ps(ix2,jx1);
807             dy21             = _mm_sub_ps(iy2,jy1);
808             dz21             = _mm_sub_ps(iz2,jz1);
809             dx22             = _mm_sub_ps(ix2,jx2);
810             dy22             = _mm_sub_ps(iy2,jy2);
811             dz22             = _mm_sub_ps(iz2,jz2);
812             dx23             = _mm_sub_ps(ix2,jx3);
813             dy23             = _mm_sub_ps(iy2,jy3);
814             dz23             = _mm_sub_ps(iz2,jz3);
815             dx31             = _mm_sub_ps(ix3,jx1);
816             dy31             = _mm_sub_ps(iy3,jy1);
817             dz31             = _mm_sub_ps(iz3,jz1);
818             dx32             = _mm_sub_ps(ix3,jx2);
819             dy32             = _mm_sub_ps(iy3,jy2);
820             dz32             = _mm_sub_ps(iz3,jz2);
821             dx33             = _mm_sub_ps(ix3,jx3);
822             dy33             = _mm_sub_ps(iy3,jy3);
823             dz33             = _mm_sub_ps(iz3,jz3);
824
825             /* Calculate squared distance and things based on it */
826             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
827             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
828             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
829             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
830             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
831             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
832             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
833             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
834             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
835             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
836
837             rinv00           = sse41_invsqrt_f(rsq00);
838             rinv11           = sse41_invsqrt_f(rsq11);
839             rinv12           = sse41_invsqrt_f(rsq12);
840             rinv13           = sse41_invsqrt_f(rsq13);
841             rinv21           = sse41_invsqrt_f(rsq21);
842             rinv22           = sse41_invsqrt_f(rsq22);
843             rinv23           = sse41_invsqrt_f(rsq23);
844             rinv31           = sse41_invsqrt_f(rsq31);
845             rinv32           = sse41_invsqrt_f(rsq32);
846             rinv33           = sse41_invsqrt_f(rsq33);
847
848             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
849             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
850             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
851             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
852             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
853             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
854             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
855             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
856             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
857             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
858
859             fjx0             = _mm_setzero_ps();
860             fjy0             = _mm_setzero_ps();
861             fjz0             = _mm_setzero_ps();
862             fjx1             = _mm_setzero_ps();
863             fjy1             = _mm_setzero_ps();
864             fjz1             = _mm_setzero_ps();
865             fjx2             = _mm_setzero_ps();
866             fjy2             = _mm_setzero_ps();
867             fjz2             = _mm_setzero_ps();
868             fjx3             = _mm_setzero_ps();
869             fjy3             = _mm_setzero_ps();
870             fjz3             = _mm_setzero_ps();
871
872             /**************************
873              * CALCULATE INTERACTIONS *
874              **************************/
875
876             r00              = _mm_mul_ps(rsq00,rinv00);
877             r00              = _mm_andnot_ps(dummy_mask,r00);
878
879             /* Analytical LJ-PME */
880             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
881             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
882             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
883             exponent         = sse41_exp_f(ewcljrsq);
884             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
885             poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
886             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
887             vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
888             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
889             vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
890             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
891             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
892
893             /* Update potential sum for this i atom from the interaction with this j atom. */
894             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
895             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
896
897             fscal            = fvdw;
898
899             fscal            = _mm_andnot_ps(dummy_mask,fscal);
900
901             /* Calculate temporary vectorial force */
902             tx               = _mm_mul_ps(fscal,dx00);
903             ty               = _mm_mul_ps(fscal,dy00);
904             tz               = _mm_mul_ps(fscal,dz00);
905
906             /* Update vectorial force */
907             fix0             = _mm_add_ps(fix0,tx);
908             fiy0             = _mm_add_ps(fiy0,ty);
909             fiz0             = _mm_add_ps(fiz0,tz);
910
911             fjx0             = _mm_add_ps(fjx0,tx);
912             fjy0             = _mm_add_ps(fjy0,ty);
913             fjz0             = _mm_add_ps(fjz0,tz);
914
915             /**************************
916              * CALCULATE INTERACTIONS *
917              **************************/
918
919             r11              = _mm_mul_ps(rsq11,rinv11);
920             r11              = _mm_andnot_ps(dummy_mask,r11);
921
922             /* EWALD ELECTROSTATICS */
923
924             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
925             ewrt             = _mm_mul_ps(r11,ewtabscale);
926             ewitab           = _mm_cvttps_epi32(ewrt);
927             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
928             ewitab           = _mm_slli_epi32(ewitab,2);
929             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
930             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
931             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
932             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
933             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
934             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
935             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
936             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
937             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
938
939             /* Update potential sum for this i atom from the interaction with this j atom. */
940             velec            = _mm_andnot_ps(dummy_mask,velec);
941             velecsum         = _mm_add_ps(velecsum,velec);
942
943             fscal            = felec;
944
945             fscal            = _mm_andnot_ps(dummy_mask,fscal);
946
947             /* Calculate temporary vectorial force */
948             tx               = _mm_mul_ps(fscal,dx11);
949             ty               = _mm_mul_ps(fscal,dy11);
950             tz               = _mm_mul_ps(fscal,dz11);
951
952             /* Update vectorial force */
953             fix1             = _mm_add_ps(fix1,tx);
954             fiy1             = _mm_add_ps(fiy1,ty);
955             fiz1             = _mm_add_ps(fiz1,tz);
956
957             fjx1             = _mm_add_ps(fjx1,tx);
958             fjy1             = _mm_add_ps(fjy1,ty);
959             fjz1             = _mm_add_ps(fjz1,tz);
960
961             /**************************
962              * CALCULATE INTERACTIONS *
963              **************************/
964
965             r12              = _mm_mul_ps(rsq12,rinv12);
966             r12              = _mm_andnot_ps(dummy_mask,r12);
967
968             /* EWALD ELECTROSTATICS */
969
970             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
971             ewrt             = _mm_mul_ps(r12,ewtabscale);
972             ewitab           = _mm_cvttps_epi32(ewrt);
973             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
974             ewitab           = _mm_slli_epi32(ewitab,2);
975             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
976             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
977             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
978             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
979             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
980             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
981             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
982             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
983             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
984
985             /* Update potential sum for this i atom from the interaction with this j atom. */
986             velec            = _mm_andnot_ps(dummy_mask,velec);
987             velecsum         = _mm_add_ps(velecsum,velec);
988
989             fscal            = felec;
990
991             fscal            = _mm_andnot_ps(dummy_mask,fscal);
992
993             /* Calculate temporary vectorial force */
994             tx               = _mm_mul_ps(fscal,dx12);
995             ty               = _mm_mul_ps(fscal,dy12);
996             tz               = _mm_mul_ps(fscal,dz12);
997
998             /* Update vectorial force */
999             fix1             = _mm_add_ps(fix1,tx);
1000             fiy1             = _mm_add_ps(fiy1,ty);
1001             fiz1             = _mm_add_ps(fiz1,tz);
1002
1003             fjx2             = _mm_add_ps(fjx2,tx);
1004             fjy2             = _mm_add_ps(fjy2,ty);
1005             fjz2             = _mm_add_ps(fjz2,tz);
1006
1007             /**************************
1008              * CALCULATE INTERACTIONS *
1009              **************************/
1010
1011             r13              = _mm_mul_ps(rsq13,rinv13);
1012             r13              = _mm_andnot_ps(dummy_mask,r13);
1013
1014             /* EWALD ELECTROSTATICS */
1015
1016             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1017             ewrt             = _mm_mul_ps(r13,ewtabscale);
1018             ewitab           = _mm_cvttps_epi32(ewrt);
1019             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1020             ewitab           = _mm_slli_epi32(ewitab,2);
1021             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1022             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1023             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1024             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1025             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1026             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1027             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1028             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1029             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1030
1031             /* Update potential sum for this i atom from the interaction with this j atom. */
1032             velec            = _mm_andnot_ps(dummy_mask,velec);
1033             velecsum         = _mm_add_ps(velecsum,velec);
1034
1035             fscal            = felec;
1036
1037             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1038
1039             /* Calculate temporary vectorial force */
1040             tx               = _mm_mul_ps(fscal,dx13);
1041             ty               = _mm_mul_ps(fscal,dy13);
1042             tz               = _mm_mul_ps(fscal,dz13);
1043
1044             /* Update vectorial force */
1045             fix1             = _mm_add_ps(fix1,tx);
1046             fiy1             = _mm_add_ps(fiy1,ty);
1047             fiz1             = _mm_add_ps(fiz1,tz);
1048
1049             fjx3             = _mm_add_ps(fjx3,tx);
1050             fjy3             = _mm_add_ps(fjy3,ty);
1051             fjz3             = _mm_add_ps(fjz3,tz);
1052
1053             /**************************
1054              * CALCULATE INTERACTIONS *
1055              **************************/
1056
1057             r21              = _mm_mul_ps(rsq21,rinv21);
1058             r21              = _mm_andnot_ps(dummy_mask,r21);
1059
1060             /* EWALD ELECTROSTATICS */
1061
1062             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1063             ewrt             = _mm_mul_ps(r21,ewtabscale);
1064             ewitab           = _mm_cvttps_epi32(ewrt);
1065             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1066             ewitab           = _mm_slli_epi32(ewitab,2);
1067             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1068             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1069             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1070             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1071             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1072             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1073             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1074             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1075             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1076
1077             /* Update potential sum for this i atom from the interaction with this j atom. */
1078             velec            = _mm_andnot_ps(dummy_mask,velec);
1079             velecsum         = _mm_add_ps(velecsum,velec);
1080
1081             fscal            = felec;
1082
1083             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1084
1085             /* Calculate temporary vectorial force */
1086             tx               = _mm_mul_ps(fscal,dx21);
1087             ty               = _mm_mul_ps(fscal,dy21);
1088             tz               = _mm_mul_ps(fscal,dz21);
1089
1090             /* Update vectorial force */
1091             fix2             = _mm_add_ps(fix2,tx);
1092             fiy2             = _mm_add_ps(fiy2,ty);
1093             fiz2             = _mm_add_ps(fiz2,tz);
1094
1095             fjx1             = _mm_add_ps(fjx1,tx);
1096             fjy1             = _mm_add_ps(fjy1,ty);
1097             fjz1             = _mm_add_ps(fjz1,tz);
1098
1099             /**************************
1100              * CALCULATE INTERACTIONS *
1101              **************************/
1102
1103             r22              = _mm_mul_ps(rsq22,rinv22);
1104             r22              = _mm_andnot_ps(dummy_mask,r22);
1105
1106             /* EWALD ELECTROSTATICS */
1107
1108             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1109             ewrt             = _mm_mul_ps(r22,ewtabscale);
1110             ewitab           = _mm_cvttps_epi32(ewrt);
1111             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1112             ewitab           = _mm_slli_epi32(ewitab,2);
1113             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1114             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1115             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1116             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1117             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1118             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1119             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1120             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1121             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1122
1123             /* Update potential sum for this i atom from the interaction with this j atom. */
1124             velec            = _mm_andnot_ps(dummy_mask,velec);
1125             velecsum         = _mm_add_ps(velecsum,velec);
1126
1127             fscal            = felec;
1128
1129             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1130
1131             /* Calculate temporary vectorial force */
1132             tx               = _mm_mul_ps(fscal,dx22);
1133             ty               = _mm_mul_ps(fscal,dy22);
1134             tz               = _mm_mul_ps(fscal,dz22);
1135
1136             /* Update vectorial force */
1137             fix2             = _mm_add_ps(fix2,tx);
1138             fiy2             = _mm_add_ps(fiy2,ty);
1139             fiz2             = _mm_add_ps(fiz2,tz);
1140
1141             fjx2             = _mm_add_ps(fjx2,tx);
1142             fjy2             = _mm_add_ps(fjy2,ty);
1143             fjz2             = _mm_add_ps(fjz2,tz);
1144
1145             /**************************
1146              * CALCULATE INTERACTIONS *
1147              **************************/
1148
1149             r23              = _mm_mul_ps(rsq23,rinv23);
1150             r23              = _mm_andnot_ps(dummy_mask,r23);
1151
1152             /* EWALD ELECTROSTATICS */
1153
1154             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1155             ewrt             = _mm_mul_ps(r23,ewtabscale);
1156             ewitab           = _mm_cvttps_epi32(ewrt);
1157             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1158             ewitab           = _mm_slli_epi32(ewitab,2);
1159             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1160             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1161             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1162             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1163             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1164             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1165             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1166             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1167             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1168
1169             /* Update potential sum for this i atom from the interaction with this j atom. */
1170             velec            = _mm_andnot_ps(dummy_mask,velec);
1171             velecsum         = _mm_add_ps(velecsum,velec);
1172
1173             fscal            = felec;
1174
1175             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1176
1177             /* Calculate temporary vectorial force */
1178             tx               = _mm_mul_ps(fscal,dx23);
1179             ty               = _mm_mul_ps(fscal,dy23);
1180             tz               = _mm_mul_ps(fscal,dz23);
1181
1182             /* Update vectorial force */
1183             fix2             = _mm_add_ps(fix2,tx);
1184             fiy2             = _mm_add_ps(fiy2,ty);
1185             fiz2             = _mm_add_ps(fiz2,tz);
1186
1187             fjx3             = _mm_add_ps(fjx3,tx);
1188             fjy3             = _mm_add_ps(fjy3,ty);
1189             fjz3             = _mm_add_ps(fjz3,tz);
1190
1191             /**************************
1192              * CALCULATE INTERACTIONS *
1193              **************************/
1194
1195             r31              = _mm_mul_ps(rsq31,rinv31);
1196             r31              = _mm_andnot_ps(dummy_mask,r31);
1197
1198             /* EWALD ELECTROSTATICS */
1199
1200             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1201             ewrt             = _mm_mul_ps(r31,ewtabscale);
1202             ewitab           = _mm_cvttps_epi32(ewrt);
1203             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1204             ewitab           = _mm_slli_epi32(ewitab,2);
1205             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1206             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1207             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1208             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1209             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1210             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1211             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1212             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1213             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1214
1215             /* Update potential sum for this i atom from the interaction with this j atom. */
1216             velec            = _mm_andnot_ps(dummy_mask,velec);
1217             velecsum         = _mm_add_ps(velecsum,velec);
1218
1219             fscal            = felec;
1220
1221             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1222
1223             /* Calculate temporary vectorial force */
1224             tx               = _mm_mul_ps(fscal,dx31);
1225             ty               = _mm_mul_ps(fscal,dy31);
1226             tz               = _mm_mul_ps(fscal,dz31);
1227
1228             /* Update vectorial force */
1229             fix3             = _mm_add_ps(fix3,tx);
1230             fiy3             = _mm_add_ps(fiy3,ty);
1231             fiz3             = _mm_add_ps(fiz3,tz);
1232
1233             fjx1             = _mm_add_ps(fjx1,tx);
1234             fjy1             = _mm_add_ps(fjy1,ty);
1235             fjz1             = _mm_add_ps(fjz1,tz);
1236
1237             /**************************
1238              * CALCULATE INTERACTIONS *
1239              **************************/
1240
1241             r32              = _mm_mul_ps(rsq32,rinv32);
1242             r32              = _mm_andnot_ps(dummy_mask,r32);
1243
1244             /* EWALD ELECTROSTATICS */
1245
1246             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1247             ewrt             = _mm_mul_ps(r32,ewtabscale);
1248             ewitab           = _mm_cvttps_epi32(ewrt);
1249             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1250             ewitab           = _mm_slli_epi32(ewitab,2);
1251             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1252             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1253             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1254             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1255             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1256             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1257             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1258             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1259             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1260
1261             /* Update potential sum for this i atom from the interaction with this j atom. */
1262             velec            = _mm_andnot_ps(dummy_mask,velec);
1263             velecsum         = _mm_add_ps(velecsum,velec);
1264
1265             fscal            = felec;
1266
1267             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1268
1269             /* Calculate temporary vectorial force */
1270             tx               = _mm_mul_ps(fscal,dx32);
1271             ty               = _mm_mul_ps(fscal,dy32);
1272             tz               = _mm_mul_ps(fscal,dz32);
1273
1274             /* Update vectorial force */
1275             fix3             = _mm_add_ps(fix3,tx);
1276             fiy3             = _mm_add_ps(fiy3,ty);
1277             fiz3             = _mm_add_ps(fiz3,tz);
1278
1279             fjx2             = _mm_add_ps(fjx2,tx);
1280             fjy2             = _mm_add_ps(fjy2,ty);
1281             fjz2             = _mm_add_ps(fjz2,tz);
1282
1283             /**************************
1284              * CALCULATE INTERACTIONS *
1285              **************************/
1286
1287             r33              = _mm_mul_ps(rsq33,rinv33);
1288             r33              = _mm_andnot_ps(dummy_mask,r33);
1289
1290             /* EWALD ELECTROSTATICS */
1291
1292             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1293             ewrt             = _mm_mul_ps(r33,ewtabscale);
1294             ewitab           = _mm_cvttps_epi32(ewrt);
1295             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1296             ewitab           = _mm_slli_epi32(ewitab,2);
1297             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1298             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1299             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1300             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1301             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1302             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1303             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1304             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1305             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1306
1307             /* Update potential sum for this i atom from the interaction with this j atom. */
1308             velec            = _mm_andnot_ps(dummy_mask,velec);
1309             velecsum         = _mm_add_ps(velecsum,velec);
1310
1311             fscal            = felec;
1312
1313             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1314
1315             /* Calculate temporary vectorial force */
1316             tx               = _mm_mul_ps(fscal,dx33);
1317             ty               = _mm_mul_ps(fscal,dy33);
1318             tz               = _mm_mul_ps(fscal,dz33);
1319
1320             /* Update vectorial force */
1321             fix3             = _mm_add_ps(fix3,tx);
1322             fiy3             = _mm_add_ps(fiy3,ty);
1323             fiz3             = _mm_add_ps(fiz3,tz);
1324
1325             fjx3             = _mm_add_ps(fjx3,tx);
1326             fjy3             = _mm_add_ps(fjy3,ty);
1327             fjz3             = _mm_add_ps(fjz3,tz);
1328
1329             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1330             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1331             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1332             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1333
1334             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1335                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1336                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1337
1338             /* Inner loop uses 433 flops */
1339         }
1340
1341         /* End of innermost loop */
1342
1343         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1344                                               f+i_coord_offset,fshift+i_shift_offset);
1345
1346         ggid                        = gid[iidx];
1347         /* Update potential energies */
1348         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1349         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1350
1351         /* Increment number of inner iterations */
1352         inneriter                  += j_index_end - j_index_start;
1353
1354         /* Outer loop uses 26 flops */
1355     }
1356
1357     /* Increment number of outer iterations */
1358     outeriter        += nri;
1359
1360     /* Update outer/inner flops */
1361
1362     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*433);
1363 }
1364 /*
1365  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_single
1366  * Electrostatics interaction: Ewald
1367  * VdW interaction:            LJEwald
1368  * Geometry:                   Water4-Water4
1369  * Calculate force/pot:        Force
1370  */
1371 void
1372 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_single
1373                     (t_nblist                    * gmx_restrict       nlist,
1374                      rvec                        * gmx_restrict          xx,
1375                      rvec                        * gmx_restrict          ff,
1376                      struct t_forcerec           * gmx_restrict          fr,
1377                      t_mdatoms                   * gmx_restrict     mdatoms,
1378                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1379                      t_nrnb                      * gmx_restrict        nrnb)
1380 {
1381     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1382      * just 0 for non-waters.
1383      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1384      * jnr indices corresponding to data put in the four positions in the SIMD register.
1385      */
1386     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1387     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1388     int              jnrA,jnrB,jnrC,jnrD;
1389     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1390     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1391     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1392     real             rcutoff_scalar;
1393     real             *shiftvec,*fshift,*x,*f;
1394     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1395     real             scratch[4*DIM];
1396     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1397     int              vdwioffset0;
1398     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1399     int              vdwioffset1;
1400     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1401     int              vdwioffset2;
1402     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1403     int              vdwioffset3;
1404     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1405     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1406     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1407     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1408     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1409     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1410     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1411     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1412     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1413     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1414     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1415     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1416     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1417     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1418     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1419     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1420     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1421     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1422     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1423     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1424     real             *charge;
1425     int              nvdwtype;
1426     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1427     int              *vdwtype;
1428     real             *vdwparam;
1429     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1430     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1431     __m128           c6grid_00;
1432     __m128           c6grid_11;
1433     __m128           c6grid_12;
1434     __m128           c6grid_13;
1435     __m128           c6grid_21;
1436     __m128           c6grid_22;
1437     __m128           c6grid_23;
1438     __m128           c6grid_31;
1439     __m128           c6grid_32;
1440     __m128           c6grid_33;
1441     __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1442     real             *vdwgridparam;
1443     __m128           one_half  = _mm_set1_ps(0.5);
1444     __m128           minus_one = _mm_set1_ps(-1.0);
1445     __m128i          ewitab;
1446     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1447     real             *ewtab;
1448     __m128           dummy_mask,cutoff_mask;
1449     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1450     __m128           one     = _mm_set1_ps(1.0);
1451     __m128           two     = _mm_set1_ps(2.0);
1452     x                = xx[0];
1453     f                = ff[0];
1454
1455     nri              = nlist->nri;
1456     iinr             = nlist->iinr;
1457     jindex           = nlist->jindex;
1458     jjnr             = nlist->jjnr;
1459     shiftidx         = nlist->shift;
1460     gid              = nlist->gid;
1461     shiftvec         = fr->shift_vec[0];
1462     fshift           = fr->fshift[0];
1463     facel            = _mm_set1_ps(fr->ic->epsfac);
1464     charge           = mdatoms->chargeA;
1465     nvdwtype         = fr->ntype;
1466     vdwparam         = fr->nbfp;
1467     vdwtype          = mdatoms->typeA;
1468     vdwgridparam     = fr->ljpme_c6grid;
1469     sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
1470     ewclj            = _mm_set1_ps(fr->ic->ewaldcoeff_lj);
1471     ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
1472
1473     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1474     ewtab            = fr->ic->tabq_coul_F;
1475     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1476     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1477
1478     /* Setup water-specific parameters */
1479     inr              = nlist->iinr[0];
1480     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1481     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1482     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1483     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1484
1485     jq1              = _mm_set1_ps(charge[inr+1]);
1486     jq2              = _mm_set1_ps(charge[inr+2]);
1487     jq3              = _mm_set1_ps(charge[inr+3]);
1488     vdwjidx0A        = 2*vdwtype[inr+0];
1489     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1490     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1491     c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
1492     qq11             = _mm_mul_ps(iq1,jq1);
1493     qq12             = _mm_mul_ps(iq1,jq2);
1494     qq13             = _mm_mul_ps(iq1,jq3);
1495     qq21             = _mm_mul_ps(iq2,jq1);
1496     qq22             = _mm_mul_ps(iq2,jq2);
1497     qq23             = _mm_mul_ps(iq2,jq3);
1498     qq31             = _mm_mul_ps(iq3,jq1);
1499     qq32             = _mm_mul_ps(iq3,jq2);
1500     qq33             = _mm_mul_ps(iq3,jq3);
1501
1502     /* Avoid stupid compiler warnings */
1503     jnrA = jnrB = jnrC = jnrD = 0;
1504     j_coord_offsetA = 0;
1505     j_coord_offsetB = 0;
1506     j_coord_offsetC = 0;
1507     j_coord_offsetD = 0;
1508
1509     outeriter        = 0;
1510     inneriter        = 0;
1511
1512     for(iidx=0;iidx<4*DIM;iidx++)
1513     {
1514         scratch[iidx] = 0.0;
1515     }
1516
1517     /* Start outer loop over neighborlists */
1518     for(iidx=0; iidx<nri; iidx++)
1519     {
1520         /* Load shift vector for this list */
1521         i_shift_offset   = DIM*shiftidx[iidx];
1522
1523         /* Load limits for loop over neighbors */
1524         j_index_start    = jindex[iidx];
1525         j_index_end      = jindex[iidx+1];
1526
1527         /* Get outer coordinate index */
1528         inr              = iinr[iidx];
1529         i_coord_offset   = DIM*inr;
1530
1531         /* Load i particle coords and add shift vector */
1532         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1533                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1534
1535         fix0             = _mm_setzero_ps();
1536         fiy0             = _mm_setzero_ps();
1537         fiz0             = _mm_setzero_ps();
1538         fix1             = _mm_setzero_ps();
1539         fiy1             = _mm_setzero_ps();
1540         fiz1             = _mm_setzero_ps();
1541         fix2             = _mm_setzero_ps();
1542         fiy2             = _mm_setzero_ps();
1543         fiz2             = _mm_setzero_ps();
1544         fix3             = _mm_setzero_ps();
1545         fiy3             = _mm_setzero_ps();
1546         fiz3             = _mm_setzero_ps();
1547
1548         /* Start inner kernel loop */
1549         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1550         {
1551
1552             /* Get j neighbor index, and coordinate index */
1553             jnrA             = jjnr[jidx];
1554             jnrB             = jjnr[jidx+1];
1555             jnrC             = jjnr[jidx+2];
1556             jnrD             = jjnr[jidx+3];
1557             j_coord_offsetA  = DIM*jnrA;
1558             j_coord_offsetB  = DIM*jnrB;
1559             j_coord_offsetC  = DIM*jnrC;
1560             j_coord_offsetD  = DIM*jnrD;
1561
1562             /* load j atom coordinates */
1563             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1564                                               x+j_coord_offsetC,x+j_coord_offsetD,
1565                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1566                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1567
1568             /* Calculate displacement vector */
1569             dx00             = _mm_sub_ps(ix0,jx0);
1570             dy00             = _mm_sub_ps(iy0,jy0);
1571             dz00             = _mm_sub_ps(iz0,jz0);
1572             dx11             = _mm_sub_ps(ix1,jx1);
1573             dy11             = _mm_sub_ps(iy1,jy1);
1574             dz11             = _mm_sub_ps(iz1,jz1);
1575             dx12             = _mm_sub_ps(ix1,jx2);
1576             dy12             = _mm_sub_ps(iy1,jy2);
1577             dz12             = _mm_sub_ps(iz1,jz2);
1578             dx13             = _mm_sub_ps(ix1,jx3);
1579             dy13             = _mm_sub_ps(iy1,jy3);
1580             dz13             = _mm_sub_ps(iz1,jz3);
1581             dx21             = _mm_sub_ps(ix2,jx1);
1582             dy21             = _mm_sub_ps(iy2,jy1);
1583             dz21             = _mm_sub_ps(iz2,jz1);
1584             dx22             = _mm_sub_ps(ix2,jx2);
1585             dy22             = _mm_sub_ps(iy2,jy2);
1586             dz22             = _mm_sub_ps(iz2,jz2);
1587             dx23             = _mm_sub_ps(ix2,jx3);
1588             dy23             = _mm_sub_ps(iy2,jy3);
1589             dz23             = _mm_sub_ps(iz2,jz3);
1590             dx31             = _mm_sub_ps(ix3,jx1);
1591             dy31             = _mm_sub_ps(iy3,jy1);
1592             dz31             = _mm_sub_ps(iz3,jz1);
1593             dx32             = _mm_sub_ps(ix3,jx2);
1594             dy32             = _mm_sub_ps(iy3,jy2);
1595             dz32             = _mm_sub_ps(iz3,jz2);
1596             dx33             = _mm_sub_ps(ix3,jx3);
1597             dy33             = _mm_sub_ps(iy3,jy3);
1598             dz33             = _mm_sub_ps(iz3,jz3);
1599
1600             /* Calculate squared distance and things based on it */
1601             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1602             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1603             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1604             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1605             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1606             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1607             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1608             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1609             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1610             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1611
1612             rinv00           = sse41_invsqrt_f(rsq00);
1613             rinv11           = sse41_invsqrt_f(rsq11);
1614             rinv12           = sse41_invsqrt_f(rsq12);
1615             rinv13           = sse41_invsqrt_f(rsq13);
1616             rinv21           = sse41_invsqrt_f(rsq21);
1617             rinv22           = sse41_invsqrt_f(rsq22);
1618             rinv23           = sse41_invsqrt_f(rsq23);
1619             rinv31           = sse41_invsqrt_f(rsq31);
1620             rinv32           = sse41_invsqrt_f(rsq32);
1621             rinv33           = sse41_invsqrt_f(rsq33);
1622
1623             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1624             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1625             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1626             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1627             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1628             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1629             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1630             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1631             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1632             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1633
1634             fjx0             = _mm_setzero_ps();
1635             fjy0             = _mm_setzero_ps();
1636             fjz0             = _mm_setzero_ps();
1637             fjx1             = _mm_setzero_ps();
1638             fjy1             = _mm_setzero_ps();
1639             fjz1             = _mm_setzero_ps();
1640             fjx2             = _mm_setzero_ps();
1641             fjy2             = _mm_setzero_ps();
1642             fjz2             = _mm_setzero_ps();
1643             fjx3             = _mm_setzero_ps();
1644             fjy3             = _mm_setzero_ps();
1645             fjz3             = _mm_setzero_ps();
1646
1647             /**************************
1648              * CALCULATE INTERACTIONS *
1649              **************************/
1650
1651             r00              = _mm_mul_ps(rsq00,rinv00);
1652
1653             /* Analytical LJ-PME */
1654             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1655             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
1656             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1657             exponent         = sse41_exp_f(ewcljrsq);
1658             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1659             poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1660             /* f6A = 6 * C6grid * (1 - poly) */
1661             f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1662             /* f6B = C6grid * exponent * beta^6 */
1663             f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1664             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1665             fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1666
1667             fscal            = fvdw;
1668
1669             /* Calculate temporary vectorial force */
1670             tx               = _mm_mul_ps(fscal,dx00);
1671             ty               = _mm_mul_ps(fscal,dy00);
1672             tz               = _mm_mul_ps(fscal,dz00);
1673
1674             /* Update vectorial force */
1675             fix0             = _mm_add_ps(fix0,tx);
1676             fiy0             = _mm_add_ps(fiy0,ty);
1677             fiz0             = _mm_add_ps(fiz0,tz);
1678
1679             fjx0             = _mm_add_ps(fjx0,tx);
1680             fjy0             = _mm_add_ps(fjy0,ty);
1681             fjz0             = _mm_add_ps(fjz0,tz);
1682
1683             /**************************
1684              * CALCULATE INTERACTIONS *
1685              **************************/
1686
1687             r11              = _mm_mul_ps(rsq11,rinv11);
1688
1689             /* EWALD ELECTROSTATICS */
1690
1691             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1692             ewrt             = _mm_mul_ps(r11,ewtabscale);
1693             ewitab           = _mm_cvttps_epi32(ewrt);
1694             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1695             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1696                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1697                                          &ewtabF,&ewtabFn);
1698             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1699             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1700
1701             fscal            = felec;
1702
1703             /* Calculate temporary vectorial force */
1704             tx               = _mm_mul_ps(fscal,dx11);
1705             ty               = _mm_mul_ps(fscal,dy11);
1706             tz               = _mm_mul_ps(fscal,dz11);
1707
1708             /* Update vectorial force */
1709             fix1             = _mm_add_ps(fix1,tx);
1710             fiy1             = _mm_add_ps(fiy1,ty);
1711             fiz1             = _mm_add_ps(fiz1,tz);
1712
1713             fjx1             = _mm_add_ps(fjx1,tx);
1714             fjy1             = _mm_add_ps(fjy1,ty);
1715             fjz1             = _mm_add_ps(fjz1,tz);
1716
1717             /**************************
1718              * CALCULATE INTERACTIONS *
1719              **************************/
1720
1721             r12              = _mm_mul_ps(rsq12,rinv12);
1722
1723             /* EWALD ELECTROSTATICS */
1724
1725             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1726             ewrt             = _mm_mul_ps(r12,ewtabscale);
1727             ewitab           = _mm_cvttps_epi32(ewrt);
1728             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1729             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1730                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1731                                          &ewtabF,&ewtabFn);
1732             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1733             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1734
1735             fscal            = felec;
1736
1737             /* Calculate temporary vectorial force */
1738             tx               = _mm_mul_ps(fscal,dx12);
1739             ty               = _mm_mul_ps(fscal,dy12);
1740             tz               = _mm_mul_ps(fscal,dz12);
1741
1742             /* Update vectorial force */
1743             fix1             = _mm_add_ps(fix1,tx);
1744             fiy1             = _mm_add_ps(fiy1,ty);
1745             fiz1             = _mm_add_ps(fiz1,tz);
1746
1747             fjx2             = _mm_add_ps(fjx2,tx);
1748             fjy2             = _mm_add_ps(fjy2,ty);
1749             fjz2             = _mm_add_ps(fjz2,tz);
1750
1751             /**************************
1752              * CALCULATE INTERACTIONS *
1753              **************************/
1754
1755             r13              = _mm_mul_ps(rsq13,rinv13);
1756
1757             /* EWALD ELECTROSTATICS */
1758
1759             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1760             ewrt             = _mm_mul_ps(r13,ewtabscale);
1761             ewitab           = _mm_cvttps_epi32(ewrt);
1762             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1763             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1764                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1765                                          &ewtabF,&ewtabFn);
1766             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1767             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1768
1769             fscal            = felec;
1770
1771             /* Calculate temporary vectorial force */
1772             tx               = _mm_mul_ps(fscal,dx13);
1773             ty               = _mm_mul_ps(fscal,dy13);
1774             tz               = _mm_mul_ps(fscal,dz13);
1775
1776             /* Update vectorial force */
1777             fix1             = _mm_add_ps(fix1,tx);
1778             fiy1             = _mm_add_ps(fiy1,ty);
1779             fiz1             = _mm_add_ps(fiz1,tz);
1780
1781             fjx3             = _mm_add_ps(fjx3,tx);
1782             fjy3             = _mm_add_ps(fjy3,ty);
1783             fjz3             = _mm_add_ps(fjz3,tz);
1784
1785             /**************************
1786              * CALCULATE INTERACTIONS *
1787              **************************/
1788
1789             r21              = _mm_mul_ps(rsq21,rinv21);
1790
1791             /* EWALD ELECTROSTATICS */
1792
1793             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1794             ewrt             = _mm_mul_ps(r21,ewtabscale);
1795             ewitab           = _mm_cvttps_epi32(ewrt);
1796             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1797             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1798                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1799                                          &ewtabF,&ewtabFn);
1800             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1801             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1802
1803             fscal            = felec;
1804
1805             /* Calculate temporary vectorial force */
1806             tx               = _mm_mul_ps(fscal,dx21);
1807             ty               = _mm_mul_ps(fscal,dy21);
1808             tz               = _mm_mul_ps(fscal,dz21);
1809
1810             /* Update vectorial force */
1811             fix2             = _mm_add_ps(fix2,tx);
1812             fiy2             = _mm_add_ps(fiy2,ty);
1813             fiz2             = _mm_add_ps(fiz2,tz);
1814
1815             fjx1             = _mm_add_ps(fjx1,tx);
1816             fjy1             = _mm_add_ps(fjy1,ty);
1817             fjz1             = _mm_add_ps(fjz1,tz);
1818
1819             /**************************
1820              * CALCULATE INTERACTIONS *
1821              **************************/
1822
1823             r22              = _mm_mul_ps(rsq22,rinv22);
1824
1825             /* EWALD ELECTROSTATICS */
1826
1827             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1828             ewrt             = _mm_mul_ps(r22,ewtabscale);
1829             ewitab           = _mm_cvttps_epi32(ewrt);
1830             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1831             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1832                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1833                                          &ewtabF,&ewtabFn);
1834             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1835             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1836
1837             fscal            = felec;
1838
1839             /* Calculate temporary vectorial force */
1840             tx               = _mm_mul_ps(fscal,dx22);
1841             ty               = _mm_mul_ps(fscal,dy22);
1842             tz               = _mm_mul_ps(fscal,dz22);
1843
1844             /* Update vectorial force */
1845             fix2             = _mm_add_ps(fix2,tx);
1846             fiy2             = _mm_add_ps(fiy2,ty);
1847             fiz2             = _mm_add_ps(fiz2,tz);
1848
1849             fjx2             = _mm_add_ps(fjx2,tx);
1850             fjy2             = _mm_add_ps(fjy2,ty);
1851             fjz2             = _mm_add_ps(fjz2,tz);
1852
1853             /**************************
1854              * CALCULATE INTERACTIONS *
1855              **************************/
1856
1857             r23              = _mm_mul_ps(rsq23,rinv23);
1858
1859             /* EWALD ELECTROSTATICS */
1860
1861             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1862             ewrt             = _mm_mul_ps(r23,ewtabscale);
1863             ewitab           = _mm_cvttps_epi32(ewrt);
1864             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1865             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1866                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1867                                          &ewtabF,&ewtabFn);
1868             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1869             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1870
1871             fscal            = felec;
1872
1873             /* Calculate temporary vectorial force */
1874             tx               = _mm_mul_ps(fscal,dx23);
1875             ty               = _mm_mul_ps(fscal,dy23);
1876             tz               = _mm_mul_ps(fscal,dz23);
1877
1878             /* Update vectorial force */
1879             fix2             = _mm_add_ps(fix2,tx);
1880             fiy2             = _mm_add_ps(fiy2,ty);
1881             fiz2             = _mm_add_ps(fiz2,tz);
1882
1883             fjx3             = _mm_add_ps(fjx3,tx);
1884             fjy3             = _mm_add_ps(fjy3,ty);
1885             fjz3             = _mm_add_ps(fjz3,tz);
1886
1887             /**************************
1888              * CALCULATE INTERACTIONS *
1889              **************************/
1890
1891             r31              = _mm_mul_ps(rsq31,rinv31);
1892
1893             /* EWALD ELECTROSTATICS */
1894
1895             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1896             ewrt             = _mm_mul_ps(r31,ewtabscale);
1897             ewitab           = _mm_cvttps_epi32(ewrt);
1898             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1899             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1900                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1901                                          &ewtabF,&ewtabFn);
1902             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1903             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1904
1905             fscal            = felec;
1906
1907             /* Calculate temporary vectorial force */
1908             tx               = _mm_mul_ps(fscal,dx31);
1909             ty               = _mm_mul_ps(fscal,dy31);
1910             tz               = _mm_mul_ps(fscal,dz31);
1911
1912             /* Update vectorial force */
1913             fix3             = _mm_add_ps(fix3,tx);
1914             fiy3             = _mm_add_ps(fiy3,ty);
1915             fiz3             = _mm_add_ps(fiz3,tz);
1916
1917             fjx1             = _mm_add_ps(fjx1,tx);
1918             fjy1             = _mm_add_ps(fjy1,ty);
1919             fjz1             = _mm_add_ps(fjz1,tz);
1920
1921             /**************************
1922              * CALCULATE INTERACTIONS *
1923              **************************/
1924
1925             r32              = _mm_mul_ps(rsq32,rinv32);
1926
1927             /* EWALD ELECTROSTATICS */
1928
1929             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1930             ewrt             = _mm_mul_ps(r32,ewtabscale);
1931             ewitab           = _mm_cvttps_epi32(ewrt);
1932             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1933             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1934                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1935                                          &ewtabF,&ewtabFn);
1936             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1937             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1938
1939             fscal            = felec;
1940
1941             /* Calculate temporary vectorial force */
1942             tx               = _mm_mul_ps(fscal,dx32);
1943             ty               = _mm_mul_ps(fscal,dy32);
1944             tz               = _mm_mul_ps(fscal,dz32);
1945
1946             /* Update vectorial force */
1947             fix3             = _mm_add_ps(fix3,tx);
1948             fiy3             = _mm_add_ps(fiy3,ty);
1949             fiz3             = _mm_add_ps(fiz3,tz);
1950
1951             fjx2             = _mm_add_ps(fjx2,tx);
1952             fjy2             = _mm_add_ps(fjy2,ty);
1953             fjz2             = _mm_add_ps(fjz2,tz);
1954
1955             /**************************
1956              * CALCULATE INTERACTIONS *
1957              **************************/
1958
1959             r33              = _mm_mul_ps(rsq33,rinv33);
1960
1961             /* EWALD ELECTROSTATICS */
1962
1963             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1964             ewrt             = _mm_mul_ps(r33,ewtabscale);
1965             ewitab           = _mm_cvttps_epi32(ewrt);
1966             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1967             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1968                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1969                                          &ewtabF,&ewtabFn);
1970             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1971             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1972
1973             fscal            = felec;
1974
1975             /* Calculate temporary vectorial force */
1976             tx               = _mm_mul_ps(fscal,dx33);
1977             ty               = _mm_mul_ps(fscal,dy33);
1978             tz               = _mm_mul_ps(fscal,dz33);
1979
1980             /* Update vectorial force */
1981             fix3             = _mm_add_ps(fix3,tx);
1982             fiy3             = _mm_add_ps(fiy3,ty);
1983             fiz3             = _mm_add_ps(fiz3,tz);
1984
1985             fjx3             = _mm_add_ps(fjx3,tx);
1986             fjy3             = _mm_add_ps(fjy3,ty);
1987             fjz3             = _mm_add_ps(fjz3,tz);
1988
1989             fjptrA             = f+j_coord_offsetA;
1990             fjptrB             = f+j_coord_offsetB;
1991             fjptrC             = f+j_coord_offsetC;
1992             fjptrD             = f+j_coord_offsetD;
1993
1994             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1995                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1996                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1997
1998             /* Inner loop uses 373 flops */
1999         }
2000
2001         if(jidx<j_index_end)
2002         {
2003
2004             /* Get j neighbor index, and coordinate index */
2005             jnrlistA         = jjnr[jidx];
2006             jnrlistB         = jjnr[jidx+1];
2007             jnrlistC         = jjnr[jidx+2];
2008             jnrlistD         = jjnr[jidx+3];
2009             /* Sign of each element will be negative for non-real atoms.
2010              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2011              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2012              */
2013             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2014             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2015             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2016             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2017             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2018             j_coord_offsetA  = DIM*jnrA;
2019             j_coord_offsetB  = DIM*jnrB;
2020             j_coord_offsetC  = DIM*jnrC;
2021             j_coord_offsetD  = DIM*jnrD;
2022
2023             /* load j atom coordinates */
2024             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2025                                               x+j_coord_offsetC,x+j_coord_offsetD,
2026                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2027                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2028
2029             /* Calculate displacement vector */
2030             dx00             = _mm_sub_ps(ix0,jx0);
2031             dy00             = _mm_sub_ps(iy0,jy0);
2032             dz00             = _mm_sub_ps(iz0,jz0);
2033             dx11             = _mm_sub_ps(ix1,jx1);
2034             dy11             = _mm_sub_ps(iy1,jy1);
2035             dz11             = _mm_sub_ps(iz1,jz1);
2036             dx12             = _mm_sub_ps(ix1,jx2);
2037             dy12             = _mm_sub_ps(iy1,jy2);
2038             dz12             = _mm_sub_ps(iz1,jz2);
2039             dx13             = _mm_sub_ps(ix1,jx3);
2040             dy13             = _mm_sub_ps(iy1,jy3);
2041             dz13             = _mm_sub_ps(iz1,jz3);
2042             dx21             = _mm_sub_ps(ix2,jx1);
2043             dy21             = _mm_sub_ps(iy2,jy1);
2044             dz21             = _mm_sub_ps(iz2,jz1);
2045             dx22             = _mm_sub_ps(ix2,jx2);
2046             dy22             = _mm_sub_ps(iy2,jy2);
2047             dz22             = _mm_sub_ps(iz2,jz2);
2048             dx23             = _mm_sub_ps(ix2,jx3);
2049             dy23             = _mm_sub_ps(iy2,jy3);
2050             dz23             = _mm_sub_ps(iz2,jz3);
2051             dx31             = _mm_sub_ps(ix3,jx1);
2052             dy31             = _mm_sub_ps(iy3,jy1);
2053             dz31             = _mm_sub_ps(iz3,jz1);
2054             dx32             = _mm_sub_ps(ix3,jx2);
2055             dy32             = _mm_sub_ps(iy3,jy2);
2056             dz32             = _mm_sub_ps(iz3,jz2);
2057             dx33             = _mm_sub_ps(ix3,jx3);
2058             dy33             = _mm_sub_ps(iy3,jy3);
2059             dz33             = _mm_sub_ps(iz3,jz3);
2060
2061             /* Calculate squared distance and things based on it */
2062             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2063             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2064             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2065             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2066             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2067             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2068             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2069             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2070             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2071             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2072
2073             rinv00           = sse41_invsqrt_f(rsq00);
2074             rinv11           = sse41_invsqrt_f(rsq11);
2075             rinv12           = sse41_invsqrt_f(rsq12);
2076             rinv13           = sse41_invsqrt_f(rsq13);
2077             rinv21           = sse41_invsqrt_f(rsq21);
2078             rinv22           = sse41_invsqrt_f(rsq22);
2079             rinv23           = sse41_invsqrt_f(rsq23);
2080             rinv31           = sse41_invsqrt_f(rsq31);
2081             rinv32           = sse41_invsqrt_f(rsq32);
2082             rinv33           = sse41_invsqrt_f(rsq33);
2083
2084             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2085             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2086             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2087             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2088             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2089             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2090             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2091             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2092             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2093             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2094
2095             fjx0             = _mm_setzero_ps();
2096             fjy0             = _mm_setzero_ps();
2097             fjz0             = _mm_setzero_ps();
2098             fjx1             = _mm_setzero_ps();
2099             fjy1             = _mm_setzero_ps();
2100             fjz1             = _mm_setzero_ps();
2101             fjx2             = _mm_setzero_ps();
2102             fjy2             = _mm_setzero_ps();
2103             fjz2             = _mm_setzero_ps();
2104             fjx3             = _mm_setzero_ps();
2105             fjy3             = _mm_setzero_ps();
2106             fjz3             = _mm_setzero_ps();
2107
2108             /**************************
2109              * CALCULATE INTERACTIONS *
2110              **************************/
2111
2112             r00              = _mm_mul_ps(rsq00,rinv00);
2113             r00              = _mm_andnot_ps(dummy_mask,r00);
2114
2115             /* Analytical LJ-PME */
2116             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2117             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
2118             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
2119             exponent         = sse41_exp_f(ewcljrsq);
2120             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2121             poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
2122             /* f6A = 6 * C6grid * (1 - poly) */
2123             f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
2124             /* f6B = C6grid * exponent * beta^6 */
2125             f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
2126             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2127             fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2128
2129             fscal            = fvdw;
2130
2131             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2132
2133             /* Calculate temporary vectorial force */
2134             tx               = _mm_mul_ps(fscal,dx00);
2135             ty               = _mm_mul_ps(fscal,dy00);
2136             tz               = _mm_mul_ps(fscal,dz00);
2137
2138             /* Update vectorial force */
2139             fix0             = _mm_add_ps(fix0,tx);
2140             fiy0             = _mm_add_ps(fiy0,ty);
2141             fiz0             = _mm_add_ps(fiz0,tz);
2142
2143             fjx0             = _mm_add_ps(fjx0,tx);
2144             fjy0             = _mm_add_ps(fjy0,ty);
2145             fjz0             = _mm_add_ps(fjz0,tz);
2146
2147             /**************************
2148              * CALCULATE INTERACTIONS *
2149              **************************/
2150
2151             r11              = _mm_mul_ps(rsq11,rinv11);
2152             r11              = _mm_andnot_ps(dummy_mask,r11);
2153
2154             /* EWALD ELECTROSTATICS */
2155
2156             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2157             ewrt             = _mm_mul_ps(r11,ewtabscale);
2158             ewitab           = _mm_cvttps_epi32(ewrt);
2159             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2160             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2161                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2162                                          &ewtabF,&ewtabFn);
2163             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2164             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2165
2166             fscal            = felec;
2167
2168             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2169
2170             /* Calculate temporary vectorial force */
2171             tx               = _mm_mul_ps(fscal,dx11);
2172             ty               = _mm_mul_ps(fscal,dy11);
2173             tz               = _mm_mul_ps(fscal,dz11);
2174
2175             /* Update vectorial force */
2176             fix1             = _mm_add_ps(fix1,tx);
2177             fiy1             = _mm_add_ps(fiy1,ty);
2178             fiz1             = _mm_add_ps(fiz1,tz);
2179
2180             fjx1             = _mm_add_ps(fjx1,tx);
2181             fjy1             = _mm_add_ps(fjy1,ty);
2182             fjz1             = _mm_add_ps(fjz1,tz);
2183
2184             /**************************
2185              * CALCULATE INTERACTIONS *
2186              **************************/
2187
2188             r12              = _mm_mul_ps(rsq12,rinv12);
2189             r12              = _mm_andnot_ps(dummy_mask,r12);
2190
2191             /* EWALD ELECTROSTATICS */
2192
2193             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2194             ewrt             = _mm_mul_ps(r12,ewtabscale);
2195             ewitab           = _mm_cvttps_epi32(ewrt);
2196             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2197             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2198                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2199                                          &ewtabF,&ewtabFn);
2200             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2201             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2202
2203             fscal            = felec;
2204
2205             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2206
2207             /* Calculate temporary vectorial force */
2208             tx               = _mm_mul_ps(fscal,dx12);
2209             ty               = _mm_mul_ps(fscal,dy12);
2210             tz               = _mm_mul_ps(fscal,dz12);
2211
2212             /* Update vectorial force */
2213             fix1             = _mm_add_ps(fix1,tx);
2214             fiy1             = _mm_add_ps(fiy1,ty);
2215             fiz1             = _mm_add_ps(fiz1,tz);
2216
2217             fjx2             = _mm_add_ps(fjx2,tx);
2218             fjy2             = _mm_add_ps(fjy2,ty);
2219             fjz2             = _mm_add_ps(fjz2,tz);
2220
2221             /**************************
2222              * CALCULATE INTERACTIONS *
2223              **************************/
2224
2225             r13              = _mm_mul_ps(rsq13,rinv13);
2226             r13              = _mm_andnot_ps(dummy_mask,r13);
2227
2228             /* EWALD ELECTROSTATICS */
2229
2230             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2231             ewrt             = _mm_mul_ps(r13,ewtabscale);
2232             ewitab           = _mm_cvttps_epi32(ewrt);
2233             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2234             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2235                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2236                                          &ewtabF,&ewtabFn);
2237             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2238             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2239
2240             fscal            = felec;
2241
2242             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2243
2244             /* Calculate temporary vectorial force */
2245             tx               = _mm_mul_ps(fscal,dx13);
2246             ty               = _mm_mul_ps(fscal,dy13);
2247             tz               = _mm_mul_ps(fscal,dz13);
2248
2249             /* Update vectorial force */
2250             fix1             = _mm_add_ps(fix1,tx);
2251             fiy1             = _mm_add_ps(fiy1,ty);
2252             fiz1             = _mm_add_ps(fiz1,tz);
2253
2254             fjx3             = _mm_add_ps(fjx3,tx);
2255             fjy3             = _mm_add_ps(fjy3,ty);
2256             fjz3             = _mm_add_ps(fjz3,tz);
2257
2258             /**************************
2259              * CALCULATE INTERACTIONS *
2260              **************************/
2261
2262             r21              = _mm_mul_ps(rsq21,rinv21);
2263             r21              = _mm_andnot_ps(dummy_mask,r21);
2264
2265             /* EWALD ELECTROSTATICS */
2266
2267             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2268             ewrt             = _mm_mul_ps(r21,ewtabscale);
2269             ewitab           = _mm_cvttps_epi32(ewrt);
2270             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2271             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2272                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2273                                          &ewtabF,&ewtabFn);
2274             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2275             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2276
2277             fscal            = felec;
2278
2279             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2280
2281             /* Calculate temporary vectorial force */
2282             tx               = _mm_mul_ps(fscal,dx21);
2283             ty               = _mm_mul_ps(fscal,dy21);
2284             tz               = _mm_mul_ps(fscal,dz21);
2285
2286             /* Update vectorial force */
2287             fix2             = _mm_add_ps(fix2,tx);
2288             fiy2             = _mm_add_ps(fiy2,ty);
2289             fiz2             = _mm_add_ps(fiz2,tz);
2290
2291             fjx1             = _mm_add_ps(fjx1,tx);
2292             fjy1             = _mm_add_ps(fjy1,ty);
2293             fjz1             = _mm_add_ps(fjz1,tz);
2294
2295             /**************************
2296              * CALCULATE INTERACTIONS *
2297              **************************/
2298
2299             r22              = _mm_mul_ps(rsq22,rinv22);
2300             r22              = _mm_andnot_ps(dummy_mask,r22);
2301
2302             /* EWALD ELECTROSTATICS */
2303
2304             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2305             ewrt             = _mm_mul_ps(r22,ewtabscale);
2306             ewitab           = _mm_cvttps_epi32(ewrt);
2307             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2308             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2309                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2310                                          &ewtabF,&ewtabFn);
2311             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2312             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2313
2314             fscal            = felec;
2315
2316             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2317
2318             /* Calculate temporary vectorial force */
2319             tx               = _mm_mul_ps(fscal,dx22);
2320             ty               = _mm_mul_ps(fscal,dy22);
2321             tz               = _mm_mul_ps(fscal,dz22);
2322
2323             /* Update vectorial force */
2324             fix2             = _mm_add_ps(fix2,tx);
2325             fiy2             = _mm_add_ps(fiy2,ty);
2326             fiz2             = _mm_add_ps(fiz2,tz);
2327
2328             fjx2             = _mm_add_ps(fjx2,tx);
2329             fjy2             = _mm_add_ps(fjy2,ty);
2330             fjz2             = _mm_add_ps(fjz2,tz);
2331
2332             /**************************
2333              * CALCULATE INTERACTIONS *
2334              **************************/
2335
2336             r23              = _mm_mul_ps(rsq23,rinv23);
2337             r23              = _mm_andnot_ps(dummy_mask,r23);
2338
2339             /* EWALD ELECTROSTATICS */
2340
2341             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2342             ewrt             = _mm_mul_ps(r23,ewtabscale);
2343             ewitab           = _mm_cvttps_epi32(ewrt);
2344             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2345             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2346                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2347                                          &ewtabF,&ewtabFn);
2348             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2349             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2350
2351             fscal            = felec;
2352
2353             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2354
2355             /* Calculate temporary vectorial force */
2356             tx               = _mm_mul_ps(fscal,dx23);
2357             ty               = _mm_mul_ps(fscal,dy23);
2358             tz               = _mm_mul_ps(fscal,dz23);
2359
2360             /* Update vectorial force */
2361             fix2             = _mm_add_ps(fix2,tx);
2362             fiy2             = _mm_add_ps(fiy2,ty);
2363             fiz2             = _mm_add_ps(fiz2,tz);
2364
2365             fjx3             = _mm_add_ps(fjx3,tx);
2366             fjy3             = _mm_add_ps(fjy3,ty);
2367             fjz3             = _mm_add_ps(fjz3,tz);
2368
2369             /**************************
2370              * CALCULATE INTERACTIONS *
2371              **************************/
2372
2373             r31              = _mm_mul_ps(rsq31,rinv31);
2374             r31              = _mm_andnot_ps(dummy_mask,r31);
2375
2376             /* EWALD ELECTROSTATICS */
2377
2378             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2379             ewrt             = _mm_mul_ps(r31,ewtabscale);
2380             ewitab           = _mm_cvttps_epi32(ewrt);
2381             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2382             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2383                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2384                                          &ewtabF,&ewtabFn);
2385             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2386             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2387
2388             fscal            = felec;
2389
2390             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2391
2392             /* Calculate temporary vectorial force */
2393             tx               = _mm_mul_ps(fscal,dx31);
2394             ty               = _mm_mul_ps(fscal,dy31);
2395             tz               = _mm_mul_ps(fscal,dz31);
2396
2397             /* Update vectorial force */
2398             fix3             = _mm_add_ps(fix3,tx);
2399             fiy3             = _mm_add_ps(fiy3,ty);
2400             fiz3             = _mm_add_ps(fiz3,tz);
2401
2402             fjx1             = _mm_add_ps(fjx1,tx);
2403             fjy1             = _mm_add_ps(fjy1,ty);
2404             fjz1             = _mm_add_ps(fjz1,tz);
2405
2406             /**************************
2407              * CALCULATE INTERACTIONS *
2408              **************************/
2409
2410             r32              = _mm_mul_ps(rsq32,rinv32);
2411             r32              = _mm_andnot_ps(dummy_mask,r32);
2412
2413             /* EWALD ELECTROSTATICS */
2414
2415             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2416             ewrt             = _mm_mul_ps(r32,ewtabscale);
2417             ewitab           = _mm_cvttps_epi32(ewrt);
2418             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2419             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2420                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2421                                          &ewtabF,&ewtabFn);
2422             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2423             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2424
2425             fscal            = felec;
2426
2427             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2428
2429             /* Calculate temporary vectorial force */
2430             tx               = _mm_mul_ps(fscal,dx32);
2431             ty               = _mm_mul_ps(fscal,dy32);
2432             tz               = _mm_mul_ps(fscal,dz32);
2433
2434             /* Update vectorial force */
2435             fix3             = _mm_add_ps(fix3,tx);
2436             fiy3             = _mm_add_ps(fiy3,ty);
2437             fiz3             = _mm_add_ps(fiz3,tz);
2438
2439             fjx2             = _mm_add_ps(fjx2,tx);
2440             fjy2             = _mm_add_ps(fjy2,ty);
2441             fjz2             = _mm_add_ps(fjz2,tz);
2442
2443             /**************************
2444              * CALCULATE INTERACTIONS *
2445              **************************/
2446
2447             r33              = _mm_mul_ps(rsq33,rinv33);
2448             r33              = _mm_andnot_ps(dummy_mask,r33);
2449
2450             /* EWALD ELECTROSTATICS */
2451
2452             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2453             ewrt             = _mm_mul_ps(r33,ewtabscale);
2454             ewitab           = _mm_cvttps_epi32(ewrt);
2455             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2456             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2457                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2458                                          &ewtabF,&ewtabFn);
2459             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2460             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2461
2462             fscal            = felec;
2463
2464             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2465
2466             /* Calculate temporary vectorial force */
2467             tx               = _mm_mul_ps(fscal,dx33);
2468             ty               = _mm_mul_ps(fscal,dy33);
2469             tz               = _mm_mul_ps(fscal,dz33);
2470
2471             /* Update vectorial force */
2472             fix3             = _mm_add_ps(fix3,tx);
2473             fiy3             = _mm_add_ps(fiy3,ty);
2474             fiz3             = _mm_add_ps(fiz3,tz);
2475
2476             fjx3             = _mm_add_ps(fjx3,tx);
2477             fjy3             = _mm_add_ps(fjy3,ty);
2478             fjz3             = _mm_add_ps(fjz3,tz);
2479
2480             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2481             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2482             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2483             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2484
2485             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2486                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2487                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2488
2489             /* Inner loop uses 383 flops */
2490         }
2491
2492         /* End of innermost loop */
2493
2494         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2495                                               f+i_coord_offset,fshift+i_shift_offset);
2496
2497         /* Increment number of inner iterations */
2498         inneriter                  += j_index_end - j_index_start;
2499
2500         /* Outer loop uses 24 flops */
2501     }
2502
2503     /* Increment number of outer iterations */
2504     outeriter        += nri;
2505
2506     /* Update outer/inner flops */
2507
2508     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*383);
2509 }