Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_sse2_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse2_single
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            LJEwald
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse2_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwioffset3;
89     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
97     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
109     real             *charge;
110     int              nvdwtype;
111     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112     int              *vdwtype;
113     real             *vdwparam;
114     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
115     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
116     __m128           c6grid_00;
117     __m128           c6grid_11;
118     __m128           c6grid_12;
119     __m128           c6grid_13;
120     __m128           c6grid_21;
121     __m128           c6grid_22;
122     __m128           c6grid_23;
123     __m128           c6grid_31;
124     __m128           c6grid_32;
125     __m128           c6grid_33;
126     __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
127     real             *vdwgridparam;
128     __m128           one_half = _mm_set1_ps(0.5);
129     __m128           minus_one = _mm_set1_ps(-1.0);
130     __m128i          ewitab;
131     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
132     real             *ewtab;
133     __m128           dummy_mask,cutoff_mask;
134     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
135     __m128           one     = _mm_set1_ps(1.0);
136     __m128           two     = _mm_set1_ps(2.0);
137     x                = xx[0];
138     f                = ff[0];
139
140     nri              = nlist->nri;
141     iinr             = nlist->iinr;
142     jindex           = nlist->jindex;
143     jjnr             = nlist->jjnr;
144     shiftidx         = nlist->shift;
145     gid              = nlist->gid;
146     shiftvec         = fr->shift_vec[0];
147     fshift           = fr->fshift[0];
148     facel            = _mm_set1_ps(fr->ic->epsfac);
149     charge           = mdatoms->chargeA;
150     nvdwtype         = fr->ntype;
151     vdwparam         = fr->nbfp;
152     vdwtype          = mdatoms->typeA;
153     vdwgridparam     = fr->ljpme_c6grid;
154     sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
155     ewclj            = _mm_set1_ps(fr->ic->ewaldcoeff_lj);
156     ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
157
158     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
159     ewtab            = fr->ic->tabq_coul_FDV0;
160     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
161     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
162
163     /* Setup water-specific parameters */
164     inr              = nlist->iinr[0];
165     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
166     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
167     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
168     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
169
170     jq1              = _mm_set1_ps(charge[inr+1]);
171     jq2              = _mm_set1_ps(charge[inr+2]);
172     jq3              = _mm_set1_ps(charge[inr+3]);
173     vdwjidx0A        = 2*vdwtype[inr+0];
174     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
175     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
176     c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
177     qq11             = _mm_mul_ps(iq1,jq1);
178     qq12             = _mm_mul_ps(iq1,jq2);
179     qq13             = _mm_mul_ps(iq1,jq3);
180     qq21             = _mm_mul_ps(iq2,jq1);
181     qq22             = _mm_mul_ps(iq2,jq2);
182     qq23             = _mm_mul_ps(iq2,jq3);
183     qq31             = _mm_mul_ps(iq3,jq1);
184     qq32             = _mm_mul_ps(iq3,jq2);
185     qq33             = _mm_mul_ps(iq3,jq3);
186
187     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
188     rcutoff_scalar   = fr->ic->rcoulomb;
189     rcutoff          = _mm_set1_ps(rcutoff_scalar);
190     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
191
192     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
193     rvdw             = _mm_set1_ps(fr->ic->rvdw);
194
195     /* Avoid stupid compiler warnings */
196     jnrA = jnrB = jnrC = jnrD = 0;
197     j_coord_offsetA = 0;
198     j_coord_offsetB = 0;
199     j_coord_offsetC = 0;
200     j_coord_offsetD = 0;
201
202     outeriter        = 0;
203     inneriter        = 0;
204
205     for(iidx=0;iidx<4*DIM;iidx++)
206     {
207         scratch[iidx] = 0.0;
208     }  
209
210     /* Start outer loop over neighborlists */
211     for(iidx=0; iidx<nri; iidx++)
212     {
213         /* Load shift vector for this list */
214         i_shift_offset   = DIM*shiftidx[iidx];
215
216         /* Load limits for loop over neighbors */
217         j_index_start    = jindex[iidx];
218         j_index_end      = jindex[iidx+1];
219
220         /* Get outer coordinate index */
221         inr              = iinr[iidx];
222         i_coord_offset   = DIM*inr;
223
224         /* Load i particle coords and add shift vector */
225         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
226                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
227         
228         fix0             = _mm_setzero_ps();
229         fiy0             = _mm_setzero_ps();
230         fiz0             = _mm_setzero_ps();
231         fix1             = _mm_setzero_ps();
232         fiy1             = _mm_setzero_ps();
233         fiz1             = _mm_setzero_ps();
234         fix2             = _mm_setzero_ps();
235         fiy2             = _mm_setzero_ps();
236         fiz2             = _mm_setzero_ps();
237         fix3             = _mm_setzero_ps();
238         fiy3             = _mm_setzero_ps();
239         fiz3             = _mm_setzero_ps();
240
241         /* Reset potential sums */
242         velecsum         = _mm_setzero_ps();
243         vvdwsum          = _mm_setzero_ps();
244
245         /* Start inner kernel loop */
246         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
247         {
248
249             /* Get j neighbor index, and coordinate index */
250             jnrA             = jjnr[jidx];
251             jnrB             = jjnr[jidx+1];
252             jnrC             = jjnr[jidx+2];
253             jnrD             = jjnr[jidx+3];
254             j_coord_offsetA  = DIM*jnrA;
255             j_coord_offsetB  = DIM*jnrB;
256             j_coord_offsetC  = DIM*jnrC;
257             j_coord_offsetD  = DIM*jnrD;
258
259             /* load j atom coordinates */
260             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
261                                               x+j_coord_offsetC,x+j_coord_offsetD,
262                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
263                                               &jy2,&jz2,&jx3,&jy3,&jz3);
264
265             /* Calculate displacement vector */
266             dx00             = _mm_sub_ps(ix0,jx0);
267             dy00             = _mm_sub_ps(iy0,jy0);
268             dz00             = _mm_sub_ps(iz0,jz0);
269             dx11             = _mm_sub_ps(ix1,jx1);
270             dy11             = _mm_sub_ps(iy1,jy1);
271             dz11             = _mm_sub_ps(iz1,jz1);
272             dx12             = _mm_sub_ps(ix1,jx2);
273             dy12             = _mm_sub_ps(iy1,jy2);
274             dz12             = _mm_sub_ps(iz1,jz2);
275             dx13             = _mm_sub_ps(ix1,jx3);
276             dy13             = _mm_sub_ps(iy1,jy3);
277             dz13             = _mm_sub_ps(iz1,jz3);
278             dx21             = _mm_sub_ps(ix2,jx1);
279             dy21             = _mm_sub_ps(iy2,jy1);
280             dz21             = _mm_sub_ps(iz2,jz1);
281             dx22             = _mm_sub_ps(ix2,jx2);
282             dy22             = _mm_sub_ps(iy2,jy2);
283             dz22             = _mm_sub_ps(iz2,jz2);
284             dx23             = _mm_sub_ps(ix2,jx3);
285             dy23             = _mm_sub_ps(iy2,jy3);
286             dz23             = _mm_sub_ps(iz2,jz3);
287             dx31             = _mm_sub_ps(ix3,jx1);
288             dy31             = _mm_sub_ps(iy3,jy1);
289             dz31             = _mm_sub_ps(iz3,jz1);
290             dx32             = _mm_sub_ps(ix3,jx2);
291             dy32             = _mm_sub_ps(iy3,jy2);
292             dz32             = _mm_sub_ps(iz3,jz2);
293             dx33             = _mm_sub_ps(ix3,jx3);
294             dy33             = _mm_sub_ps(iy3,jy3);
295             dz33             = _mm_sub_ps(iz3,jz3);
296
297             /* Calculate squared distance and things based on it */
298             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
299             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
300             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
301             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
302             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
303             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
304             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
305             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
306             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
307             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
308
309             rinv00           = sse2_invsqrt_f(rsq00);
310             rinv11           = sse2_invsqrt_f(rsq11);
311             rinv12           = sse2_invsqrt_f(rsq12);
312             rinv13           = sse2_invsqrt_f(rsq13);
313             rinv21           = sse2_invsqrt_f(rsq21);
314             rinv22           = sse2_invsqrt_f(rsq22);
315             rinv23           = sse2_invsqrt_f(rsq23);
316             rinv31           = sse2_invsqrt_f(rsq31);
317             rinv32           = sse2_invsqrt_f(rsq32);
318             rinv33           = sse2_invsqrt_f(rsq33);
319
320             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
321             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
322             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
323             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
324             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
325             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
326             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
327             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
328             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
329             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
330
331             fjx0             = _mm_setzero_ps();
332             fjy0             = _mm_setzero_ps();
333             fjz0             = _mm_setzero_ps();
334             fjx1             = _mm_setzero_ps();
335             fjy1             = _mm_setzero_ps();
336             fjz1             = _mm_setzero_ps();
337             fjx2             = _mm_setzero_ps();
338             fjy2             = _mm_setzero_ps();
339             fjz2             = _mm_setzero_ps();
340             fjx3             = _mm_setzero_ps();
341             fjy3             = _mm_setzero_ps();
342             fjz3             = _mm_setzero_ps();
343
344             /**************************
345              * CALCULATE INTERACTIONS *
346              **************************/
347
348             if (gmx_mm_any_lt(rsq00,rcutoff2))
349             {
350
351             r00              = _mm_mul_ps(rsq00,rinv00);
352
353             /* Analytical LJ-PME */
354             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
355             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
356             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
357             exponent         = sse2_exp_f(ewcljrsq);
358             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
359             poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
360             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
361             vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
362             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
363             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
364                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
365             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
366             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
367
368             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
369
370             /* Update potential sum for this i atom from the interaction with this j atom. */
371             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
372             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
373
374             fscal            = fvdw;
375
376             fscal            = _mm_and_ps(fscal,cutoff_mask);
377
378             /* Calculate temporary vectorial force */
379             tx               = _mm_mul_ps(fscal,dx00);
380             ty               = _mm_mul_ps(fscal,dy00);
381             tz               = _mm_mul_ps(fscal,dz00);
382
383             /* Update vectorial force */
384             fix0             = _mm_add_ps(fix0,tx);
385             fiy0             = _mm_add_ps(fiy0,ty);
386             fiz0             = _mm_add_ps(fiz0,tz);
387
388             fjx0             = _mm_add_ps(fjx0,tx);
389             fjy0             = _mm_add_ps(fjy0,ty);
390             fjz0             = _mm_add_ps(fjz0,tz);
391             
392             }
393
394             /**************************
395              * CALCULATE INTERACTIONS *
396              **************************/
397
398             if (gmx_mm_any_lt(rsq11,rcutoff2))
399             {
400
401             r11              = _mm_mul_ps(rsq11,rinv11);
402
403             /* EWALD ELECTROSTATICS */
404
405             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
406             ewrt             = _mm_mul_ps(r11,ewtabscale);
407             ewitab           = _mm_cvttps_epi32(ewrt);
408             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
409             ewitab           = _mm_slli_epi32(ewitab,2);
410             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
411             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
412             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
413             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
414             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
415             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
416             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
417             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
418             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
419
420             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
421
422             /* Update potential sum for this i atom from the interaction with this j atom. */
423             velec            = _mm_and_ps(velec,cutoff_mask);
424             velecsum         = _mm_add_ps(velecsum,velec);
425
426             fscal            = felec;
427
428             fscal            = _mm_and_ps(fscal,cutoff_mask);
429
430             /* Calculate temporary vectorial force */
431             tx               = _mm_mul_ps(fscal,dx11);
432             ty               = _mm_mul_ps(fscal,dy11);
433             tz               = _mm_mul_ps(fscal,dz11);
434
435             /* Update vectorial force */
436             fix1             = _mm_add_ps(fix1,tx);
437             fiy1             = _mm_add_ps(fiy1,ty);
438             fiz1             = _mm_add_ps(fiz1,tz);
439
440             fjx1             = _mm_add_ps(fjx1,tx);
441             fjy1             = _mm_add_ps(fjy1,ty);
442             fjz1             = _mm_add_ps(fjz1,tz);
443             
444             }
445
446             /**************************
447              * CALCULATE INTERACTIONS *
448              **************************/
449
450             if (gmx_mm_any_lt(rsq12,rcutoff2))
451             {
452
453             r12              = _mm_mul_ps(rsq12,rinv12);
454
455             /* EWALD ELECTROSTATICS */
456
457             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
458             ewrt             = _mm_mul_ps(r12,ewtabscale);
459             ewitab           = _mm_cvttps_epi32(ewrt);
460             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
461             ewitab           = _mm_slli_epi32(ewitab,2);
462             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
463             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
464             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
465             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
466             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
467             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
468             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
469             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
470             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
471
472             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
473
474             /* Update potential sum for this i atom from the interaction with this j atom. */
475             velec            = _mm_and_ps(velec,cutoff_mask);
476             velecsum         = _mm_add_ps(velecsum,velec);
477
478             fscal            = felec;
479
480             fscal            = _mm_and_ps(fscal,cutoff_mask);
481
482             /* Calculate temporary vectorial force */
483             tx               = _mm_mul_ps(fscal,dx12);
484             ty               = _mm_mul_ps(fscal,dy12);
485             tz               = _mm_mul_ps(fscal,dz12);
486
487             /* Update vectorial force */
488             fix1             = _mm_add_ps(fix1,tx);
489             fiy1             = _mm_add_ps(fiy1,ty);
490             fiz1             = _mm_add_ps(fiz1,tz);
491
492             fjx2             = _mm_add_ps(fjx2,tx);
493             fjy2             = _mm_add_ps(fjy2,ty);
494             fjz2             = _mm_add_ps(fjz2,tz);
495             
496             }
497
498             /**************************
499              * CALCULATE INTERACTIONS *
500              **************************/
501
502             if (gmx_mm_any_lt(rsq13,rcutoff2))
503             {
504
505             r13              = _mm_mul_ps(rsq13,rinv13);
506
507             /* EWALD ELECTROSTATICS */
508
509             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
510             ewrt             = _mm_mul_ps(r13,ewtabscale);
511             ewitab           = _mm_cvttps_epi32(ewrt);
512             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
513             ewitab           = _mm_slli_epi32(ewitab,2);
514             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
515             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
516             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
517             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
518             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
519             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
520             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
521             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
522             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
523
524             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
525
526             /* Update potential sum for this i atom from the interaction with this j atom. */
527             velec            = _mm_and_ps(velec,cutoff_mask);
528             velecsum         = _mm_add_ps(velecsum,velec);
529
530             fscal            = felec;
531
532             fscal            = _mm_and_ps(fscal,cutoff_mask);
533
534             /* Calculate temporary vectorial force */
535             tx               = _mm_mul_ps(fscal,dx13);
536             ty               = _mm_mul_ps(fscal,dy13);
537             tz               = _mm_mul_ps(fscal,dz13);
538
539             /* Update vectorial force */
540             fix1             = _mm_add_ps(fix1,tx);
541             fiy1             = _mm_add_ps(fiy1,ty);
542             fiz1             = _mm_add_ps(fiz1,tz);
543
544             fjx3             = _mm_add_ps(fjx3,tx);
545             fjy3             = _mm_add_ps(fjy3,ty);
546             fjz3             = _mm_add_ps(fjz3,tz);
547             
548             }
549
550             /**************************
551              * CALCULATE INTERACTIONS *
552              **************************/
553
554             if (gmx_mm_any_lt(rsq21,rcutoff2))
555             {
556
557             r21              = _mm_mul_ps(rsq21,rinv21);
558
559             /* EWALD ELECTROSTATICS */
560
561             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
562             ewrt             = _mm_mul_ps(r21,ewtabscale);
563             ewitab           = _mm_cvttps_epi32(ewrt);
564             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
565             ewitab           = _mm_slli_epi32(ewitab,2);
566             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
567             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
568             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
569             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
570             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
571             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
572             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
573             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
574             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
575
576             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
577
578             /* Update potential sum for this i atom from the interaction with this j atom. */
579             velec            = _mm_and_ps(velec,cutoff_mask);
580             velecsum         = _mm_add_ps(velecsum,velec);
581
582             fscal            = felec;
583
584             fscal            = _mm_and_ps(fscal,cutoff_mask);
585
586             /* Calculate temporary vectorial force */
587             tx               = _mm_mul_ps(fscal,dx21);
588             ty               = _mm_mul_ps(fscal,dy21);
589             tz               = _mm_mul_ps(fscal,dz21);
590
591             /* Update vectorial force */
592             fix2             = _mm_add_ps(fix2,tx);
593             fiy2             = _mm_add_ps(fiy2,ty);
594             fiz2             = _mm_add_ps(fiz2,tz);
595
596             fjx1             = _mm_add_ps(fjx1,tx);
597             fjy1             = _mm_add_ps(fjy1,ty);
598             fjz1             = _mm_add_ps(fjz1,tz);
599             
600             }
601
602             /**************************
603              * CALCULATE INTERACTIONS *
604              **************************/
605
606             if (gmx_mm_any_lt(rsq22,rcutoff2))
607             {
608
609             r22              = _mm_mul_ps(rsq22,rinv22);
610
611             /* EWALD ELECTROSTATICS */
612
613             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
614             ewrt             = _mm_mul_ps(r22,ewtabscale);
615             ewitab           = _mm_cvttps_epi32(ewrt);
616             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
617             ewitab           = _mm_slli_epi32(ewitab,2);
618             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
619             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
620             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
621             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
622             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
623             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
624             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
625             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
626             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
627
628             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
629
630             /* Update potential sum for this i atom from the interaction with this j atom. */
631             velec            = _mm_and_ps(velec,cutoff_mask);
632             velecsum         = _mm_add_ps(velecsum,velec);
633
634             fscal            = felec;
635
636             fscal            = _mm_and_ps(fscal,cutoff_mask);
637
638             /* Calculate temporary vectorial force */
639             tx               = _mm_mul_ps(fscal,dx22);
640             ty               = _mm_mul_ps(fscal,dy22);
641             tz               = _mm_mul_ps(fscal,dz22);
642
643             /* Update vectorial force */
644             fix2             = _mm_add_ps(fix2,tx);
645             fiy2             = _mm_add_ps(fiy2,ty);
646             fiz2             = _mm_add_ps(fiz2,tz);
647
648             fjx2             = _mm_add_ps(fjx2,tx);
649             fjy2             = _mm_add_ps(fjy2,ty);
650             fjz2             = _mm_add_ps(fjz2,tz);
651             
652             }
653
654             /**************************
655              * CALCULATE INTERACTIONS *
656              **************************/
657
658             if (gmx_mm_any_lt(rsq23,rcutoff2))
659             {
660
661             r23              = _mm_mul_ps(rsq23,rinv23);
662
663             /* EWALD ELECTROSTATICS */
664
665             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
666             ewrt             = _mm_mul_ps(r23,ewtabscale);
667             ewitab           = _mm_cvttps_epi32(ewrt);
668             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
669             ewitab           = _mm_slli_epi32(ewitab,2);
670             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
671             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
672             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
673             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
674             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
675             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
676             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
677             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
678             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
679
680             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
681
682             /* Update potential sum for this i atom from the interaction with this j atom. */
683             velec            = _mm_and_ps(velec,cutoff_mask);
684             velecsum         = _mm_add_ps(velecsum,velec);
685
686             fscal            = felec;
687
688             fscal            = _mm_and_ps(fscal,cutoff_mask);
689
690             /* Calculate temporary vectorial force */
691             tx               = _mm_mul_ps(fscal,dx23);
692             ty               = _mm_mul_ps(fscal,dy23);
693             tz               = _mm_mul_ps(fscal,dz23);
694
695             /* Update vectorial force */
696             fix2             = _mm_add_ps(fix2,tx);
697             fiy2             = _mm_add_ps(fiy2,ty);
698             fiz2             = _mm_add_ps(fiz2,tz);
699
700             fjx3             = _mm_add_ps(fjx3,tx);
701             fjy3             = _mm_add_ps(fjy3,ty);
702             fjz3             = _mm_add_ps(fjz3,tz);
703             
704             }
705
706             /**************************
707              * CALCULATE INTERACTIONS *
708              **************************/
709
710             if (gmx_mm_any_lt(rsq31,rcutoff2))
711             {
712
713             r31              = _mm_mul_ps(rsq31,rinv31);
714
715             /* EWALD ELECTROSTATICS */
716
717             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
718             ewrt             = _mm_mul_ps(r31,ewtabscale);
719             ewitab           = _mm_cvttps_epi32(ewrt);
720             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
721             ewitab           = _mm_slli_epi32(ewitab,2);
722             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
723             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
724             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
725             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
726             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
727             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
728             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
729             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
730             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
731
732             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
733
734             /* Update potential sum for this i atom from the interaction with this j atom. */
735             velec            = _mm_and_ps(velec,cutoff_mask);
736             velecsum         = _mm_add_ps(velecsum,velec);
737
738             fscal            = felec;
739
740             fscal            = _mm_and_ps(fscal,cutoff_mask);
741
742             /* Calculate temporary vectorial force */
743             tx               = _mm_mul_ps(fscal,dx31);
744             ty               = _mm_mul_ps(fscal,dy31);
745             tz               = _mm_mul_ps(fscal,dz31);
746
747             /* Update vectorial force */
748             fix3             = _mm_add_ps(fix3,tx);
749             fiy3             = _mm_add_ps(fiy3,ty);
750             fiz3             = _mm_add_ps(fiz3,tz);
751
752             fjx1             = _mm_add_ps(fjx1,tx);
753             fjy1             = _mm_add_ps(fjy1,ty);
754             fjz1             = _mm_add_ps(fjz1,tz);
755             
756             }
757
758             /**************************
759              * CALCULATE INTERACTIONS *
760              **************************/
761
762             if (gmx_mm_any_lt(rsq32,rcutoff2))
763             {
764
765             r32              = _mm_mul_ps(rsq32,rinv32);
766
767             /* EWALD ELECTROSTATICS */
768
769             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
770             ewrt             = _mm_mul_ps(r32,ewtabscale);
771             ewitab           = _mm_cvttps_epi32(ewrt);
772             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
773             ewitab           = _mm_slli_epi32(ewitab,2);
774             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
775             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
776             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
777             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
778             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
779             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
780             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
781             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
782             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
783
784             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
785
786             /* Update potential sum for this i atom from the interaction with this j atom. */
787             velec            = _mm_and_ps(velec,cutoff_mask);
788             velecsum         = _mm_add_ps(velecsum,velec);
789
790             fscal            = felec;
791
792             fscal            = _mm_and_ps(fscal,cutoff_mask);
793
794             /* Calculate temporary vectorial force */
795             tx               = _mm_mul_ps(fscal,dx32);
796             ty               = _mm_mul_ps(fscal,dy32);
797             tz               = _mm_mul_ps(fscal,dz32);
798
799             /* Update vectorial force */
800             fix3             = _mm_add_ps(fix3,tx);
801             fiy3             = _mm_add_ps(fiy3,ty);
802             fiz3             = _mm_add_ps(fiz3,tz);
803
804             fjx2             = _mm_add_ps(fjx2,tx);
805             fjy2             = _mm_add_ps(fjy2,ty);
806             fjz2             = _mm_add_ps(fjz2,tz);
807             
808             }
809
810             /**************************
811              * CALCULATE INTERACTIONS *
812              **************************/
813
814             if (gmx_mm_any_lt(rsq33,rcutoff2))
815             {
816
817             r33              = _mm_mul_ps(rsq33,rinv33);
818
819             /* EWALD ELECTROSTATICS */
820
821             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
822             ewrt             = _mm_mul_ps(r33,ewtabscale);
823             ewitab           = _mm_cvttps_epi32(ewrt);
824             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
825             ewitab           = _mm_slli_epi32(ewitab,2);
826             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
827             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
828             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
829             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
830             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
831             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
832             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
833             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
834             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
835
836             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
837
838             /* Update potential sum for this i atom from the interaction with this j atom. */
839             velec            = _mm_and_ps(velec,cutoff_mask);
840             velecsum         = _mm_add_ps(velecsum,velec);
841
842             fscal            = felec;
843
844             fscal            = _mm_and_ps(fscal,cutoff_mask);
845
846             /* Calculate temporary vectorial force */
847             tx               = _mm_mul_ps(fscal,dx33);
848             ty               = _mm_mul_ps(fscal,dy33);
849             tz               = _mm_mul_ps(fscal,dz33);
850
851             /* Update vectorial force */
852             fix3             = _mm_add_ps(fix3,tx);
853             fiy3             = _mm_add_ps(fiy3,ty);
854             fiz3             = _mm_add_ps(fiz3,tz);
855
856             fjx3             = _mm_add_ps(fjx3,tx);
857             fjy3             = _mm_add_ps(fjy3,ty);
858             fjz3             = _mm_add_ps(fjz3,tz);
859             
860             }
861
862             fjptrA             = f+j_coord_offsetA;
863             fjptrB             = f+j_coord_offsetB;
864             fjptrC             = f+j_coord_offsetC;
865             fjptrD             = f+j_coord_offsetD;
866
867             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
868                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
869                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
870
871             /* Inner loop uses 479 flops */
872         }
873
874         if(jidx<j_index_end)
875         {
876
877             /* Get j neighbor index, and coordinate index */
878             jnrlistA         = jjnr[jidx];
879             jnrlistB         = jjnr[jidx+1];
880             jnrlistC         = jjnr[jidx+2];
881             jnrlistD         = jjnr[jidx+3];
882             /* Sign of each element will be negative for non-real atoms.
883              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
884              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
885              */
886             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
887             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
888             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
889             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
890             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
891             j_coord_offsetA  = DIM*jnrA;
892             j_coord_offsetB  = DIM*jnrB;
893             j_coord_offsetC  = DIM*jnrC;
894             j_coord_offsetD  = DIM*jnrD;
895
896             /* load j atom coordinates */
897             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
898                                               x+j_coord_offsetC,x+j_coord_offsetD,
899                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
900                                               &jy2,&jz2,&jx3,&jy3,&jz3);
901
902             /* Calculate displacement vector */
903             dx00             = _mm_sub_ps(ix0,jx0);
904             dy00             = _mm_sub_ps(iy0,jy0);
905             dz00             = _mm_sub_ps(iz0,jz0);
906             dx11             = _mm_sub_ps(ix1,jx1);
907             dy11             = _mm_sub_ps(iy1,jy1);
908             dz11             = _mm_sub_ps(iz1,jz1);
909             dx12             = _mm_sub_ps(ix1,jx2);
910             dy12             = _mm_sub_ps(iy1,jy2);
911             dz12             = _mm_sub_ps(iz1,jz2);
912             dx13             = _mm_sub_ps(ix1,jx3);
913             dy13             = _mm_sub_ps(iy1,jy3);
914             dz13             = _mm_sub_ps(iz1,jz3);
915             dx21             = _mm_sub_ps(ix2,jx1);
916             dy21             = _mm_sub_ps(iy2,jy1);
917             dz21             = _mm_sub_ps(iz2,jz1);
918             dx22             = _mm_sub_ps(ix2,jx2);
919             dy22             = _mm_sub_ps(iy2,jy2);
920             dz22             = _mm_sub_ps(iz2,jz2);
921             dx23             = _mm_sub_ps(ix2,jx3);
922             dy23             = _mm_sub_ps(iy2,jy3);
923             dz23             = _mm_sub_ps(iz2,jz3);
924             dx31             = _mm_sub_ps(ix3,jx1);
925             dy31             = _mm_sub_ps(iy3,jy1);
926             dz31             = _mm_sub_ps(iz3,jz1);
927             dx32             = _mm_sub_ps(ix3,jx2);
928             dy32             = _mm_sub_ps(iy3,jy2);
929             dz32             = _mm_sub_ps(iz3,jz2);
930             dx33             = _mm_sub_ps(ix3,jx3);
931             dy33             = _mm_sub_ps(iy3,jy3);
932             dz33             = _mm_sub_ps(iz3,jz3);
933
934             /* Calculate squared distance and things based on it */
935             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
936             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
937             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
938             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
939             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
940             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
941             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
942             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
943             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
944             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
945
946             rinv00           = sse2_invsqrt_f(rsq00);
947             rinv11           = sse2_invsqrt_f(rsq11);
948             rinv12           = sse2_invsqrt_f(rsq12);
949             rinv13           = sse2_invsqrt_f(rsq13);
950             rinv21           = sse2_invsqrt_f(rsq21);
951             rinv22           = sse2_invsqrt_f(rsq22);
952             rinv23           = sse2_invsqrt_f(rsq23);
953             rinv31           = sse2_invsqrt_f(rsq31);
954             rinv32           = sse2_invsqrt_f(rsq32);
955             rinv33           = sse2_invsqrt_f(rsq33);
956
957             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
958             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
959             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
960             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
961             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
962             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
963             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
964             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
965             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
966             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
967
968             fjx0             = _mm_setzero_ps();
969             fjy0             = _mm_setzero_ps();
970             fjz0             = _mm_setzero_ps();
971             fjx1             = _mm_setzero_ps();
972             fjy1             = _mm_setzero_ps();
973             fjz1             = _mm_setzero_ps();
974             fjx2             = _mm_setzero_ps();
975             fjy2             = _mm_setzero_ps();
976             fjz2             = _mm_setzero_ps();
977             fjx3             = _mm_setzero_ps();
978             fjy3             = _mm_setzero_ps();
979             fjz3             = _mm_setzero_ps();
980
981             /**************************
982              * CALCULATE INTERACTIONS *
983              **************************/
984
985             if (gmx_mm_any_lt(rsq00,rcutoff2))
986             {
987
988             r00              = _mm_mul_ps(rsq00,rinv00);
989             r00              = _mm_andnot_ps(dummy_mask,r00);
990
991             /* Analytical LJ-PME */
992             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
993             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
994             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
995             exponent         = sse2_exp_f(ewcljrsq);
996             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
997             poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
998             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
999             vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
1000             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1001             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
1002                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
1003             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
1004             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
1005
1006             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1007
1008             /* Update potential sum for this i atom from the interaction with this j atom. */
1009             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
1010             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
1011             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
1012
1013             fscal            = fvdw;
1014
1015             fscal            = _mm_and_ps(fscal,cutoff_mask);
1016
1017             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1018
1019             /* Calculate temporary vectorial force */
1020             tx               = _mm_mul_ps(fscal,dx00);
1021             ty               = _mm_mul_ps(fscal,dy00);
1022             tz               = _mm_mul_ps(fscal,dz00);
1023
1024             /* Update vectorial force */
1025             fix0             = _mm_add_ps(fix0,tx);
1026             fiy0             = _mm_add_ps(fiy0,ty);
1027             fiz0             = _mm_add_ps(fiz0,tz);
1028
1029             fjx0             = _mm_add_ps(fjx0,tx);
1030             fjy0             = _mm_add_ps(fjy0,ty);
1031             fjz0             = _mm_add_ps(fjz0,tz);
1032             
1033             }
1034
1035             /**************************
1036              * CALCULATE INTERACTIONS *
1037              **************************/
1038
1039             if (gmx_mm_any_lt(rsq11,rcutoff2))
1040             {
1041
1042             r11              = _mm_mul_ps(rsq11,rinv11);
1043             r11              = _mm_andnot_ps(dummy_mask,r11);
1044
1045             /* EWALD ELECTROSTATICS */
1046
1047             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1048             ewrt             = _mm_mul_ps(r11,ewtabscale);
1049             ewitab           = _mm_cvttps_epi32(ewrt);
1050             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1051             ewitab           = _mm_slli_epi32(ewitab,2);
1052             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1053             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1054             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1055             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1056             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1057             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1058             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1059             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1060             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1061
1062             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1063
1064             /* Update potential sum for this i atom from the interaction with this j atom. */
1065             velec            = _mm_and_ps(velec,cutoff_mask);
1066             velec            = _mm_andnot_ps(dummy_mask,velec);
1067             velecsum         = _mm_add_ps(velecsum,velec);
1068
1069             fscal            = felec;
1070
1071             fscal            = _mm_and_ps(fscal,cutoff_mask);
1072
1073             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1074
1075             /* Calculate temporary vectorial force */
1076             tx               = _mm_mul_ps(fscal,dx11);
1077             ty               = _mm_mul_ps(fscal,dy11);
1078             tz               = _mm_mul_ps(fscal,dz11);
1079
1080             /* Update vectorial force */
1081             fix1             = _mm_add_ps(fix1,tx);
1082             fiy1             = _mm_add_ps(fiy1,ty);
1083             fiz1             = _mm_add_ps(fiz1,tz);
1084
1085             fjx1             = _mm_add_ps(fjx1,tx);
1086             fjy1             = _mm_add_ps(fjy1,ty);
1087             fjz1             = _mm_add_ps(fjz1,tz);
1088             
1089             }
1090
1091             /**************************
1092              * CALCULATE INTERACTIONS *
1093              **************************/
1094
1095             if (gmx_mm_any_lt(rsq12,rcutoff2))
1096             {
1097
1098             r12              = _mm_mul_ps(rsq12,rinv12);
1099             r12              = _mm_andnot_ps(dummy_mask,r12);
1100
1101             /* EWALD ELECTROSTATICS */
1102
1103             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1104             ewrt             = _mm_mul_ps(r12,ewtabscale);
1105             ewitab           = _mm_cvttps_epi32(ewrt);
1106             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1107             ewitab           = _mm_slli_epi32(ewitab,2);
1108             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1109             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1110             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1111             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1112             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1113             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1114             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1115             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1116             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1117
1118             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1119
1120             /* Update potential sum for this i atom from the interaction with this j atom. */
1121             velec            = _mm_and_ps(velec,cutoff_mask);
1122             velec            = _mm_andnot_ps(dummy_mask,velec);
1123             velecsum         = _mm_add_ps(velecsum,velec);
1124
1125             fscal            = felec;
1126
1127             fscal            = _mm_and_ps(fscal,cutoff_mask);
1128
1129             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1130
1131             /* Calculate temporary vectorial force */
1132             tx               = _mm_mul_ps(fscal,dx12);
1133             ty               = _mm_mul_ps(fscal,dy12);
1134             tz               = _mm_mul_ps(fscal,dz12);
1135
1136             /* Update vectorial force */
1137             fix1             = _mm_add_ps(fix1,tx);
1138             fiy1             = _mm_add_ps(fiy1,ty);
1139             fiz1             = _mm_add_ps(fiz1,tz);
1140
1141             fjx2             = _mm_add_ps(fjx2,tx);
1142             fjy2             = _mm_add_ps(fjy2,ty);
1143             fjz2             = _mm_add_ps(fjz2,tz);
1144             
1145             }
1146
1147             /**************************
1148              * CALCULATE INTERACTIONS *
1149              **************************/
1150
1151             if (gmx_mm_any_lt(rsq13,rcutoff2))
1152             {
1153
1154             r13              = _mm_mul_ps(rsq13,rinv13);
1155             r13              = _mm_andnot_ps(dummy_mask,r13);
1156
1157             /* EWALD ELECTROSTATICS */
1158
1159             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1160             ewrt             = _mm_mul_ps(r13,ewtabscale);
1161             ewitab           = _mm_cvttps_epi32(ewrt);
1162             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1163             ewitab           = _mm_slli_epi32(ewitab,2);
1164             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1165             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1166             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1167             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1168             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1169             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1170             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1171             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
1172             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1173
1174             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1175
1176             /* Update potential sum for this i atom from the interaction with this j atom. */
1177             velec            = _mm_and_ps(velec,cutoff_mask);
1178             velec            = _mm_andnot_ps(dummy_mask,velec);
1179             velecsum         = _mm_add_ps(velecsum,velec);
1180
1181             fscal            = felec;
1182
1183             fscal            = _mm_and_ps(fscal,cutoff_mask);
1184
1185             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1186
1187             /* Calculate temporary vectorial force */
1188             tx               = _mm_mul_ps(fscal,dx13);
1189             ty               = _mm_mul_ps(fscal,dy13);
1190             tz               = _mm_mul_ps(fscal,dz13);
1191
1192             /* Update vectorial force */
1193             fix1             = _mm_add_ps(fix1,tx);
1194             fiy1             = _mm_add_ps(fiy1,ty);
1195             fiz1             = _mm_add_ps(fiz1,tz);
1196
1197             fjx3             = _mm_add_ps(fjx3,tx);
1198             fjy3             = _mm_add_ps(fjy3,ty);
1199             fjz3             = _mm_add_ps(fjz3,tz);
1200             
1201             }
1202
1203             /**************************
1204              * CALCULATE INTERACTIONS *
1205              **************************/
1206
1207             if (gmx_mm_any_lt(rsq21,rcutoff2))
1208             {
1209
1210             r21              = _mm_mul_ps(rsq21,rinv21);
1211             r21              = _mm_andnot_ps(dummy_mask,r21);
1212
1213             /* EWALD ELECTROSTATICS */
1214
1215             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1216             ewrt             = _mm_mul_ps(r21,ewtabscale);
1217             ewitab           = _mm_cvttps_epi32(ewrt);
1218             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1219             ewitab           = _mm_slli_epi32(ewitab,2);
1220             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1221             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1222             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1223             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1224             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1225             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1226             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1227             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1228             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1229
1230             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1231
1232             /* Update potential sum for this i atom from the interaction with this j atom. */
1233             velec            = _mm_and_ps(velec,cutoff_mask);
1234             velec            = _mm_andnot_ps(dummy_mask,velec);
1235             velecsum         = _mm_add_ps(velecsum,velec);
1236
1237             fscal            = felec;
1238
1239             fscal            = _mm_and_ps(fscal,cutoff_mask);
1240
1241             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1242
1243             /* Calculate temporary vectorial force */
1244             tx               = _mm_mul_ps(fscal,dx21);
1245             ty               = _mm_mul_ps(fscal,dy21);
1246             tz               = _mm_mul_ps(fscal,dz21);
1247
1248             /* Update vectorial force */
1249             fix2             = _mm_add_ps(fix2,tx);
1250             fiy2             = _mm_add_ps(fiy2,ty);
1251             fiz2             = _mm_add_ps(fiz2,tz);
1252
1253             fjx1             = _mm_add_ps(fjx1,tx);
1254             fjy1             = _mm_add_ps(fjy1,ty);
1255             fjz1             = _mm_add_ps(fjz1,tz);
1256             
1257             }
1258
1259             /**************************
1260              * CALCULATE INTERACTIONS *
1261              **************************/
1262
1263             if (gmx_mm_any_lt(rsq22,rcutoff2))
1264             {
1265
1266             r22              = _mm_mul_ps(rsq22,rinv22);
1267             r22              = _mm_andnot_ps(dummy_mask,r22);
1268
1269             /* EWALD ELECTROSTATICS */
1270
1271             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1272             ewrt             = _mm_mul_ps(r22,ewtabscale);
1273             ewitab           = _mm_cvttps_epi32(ewrt);
1274             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1275             ewitab           = _mm_slli_epi32(ewitab,2);
1276             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1277             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1278             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1279             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1280             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1281             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1282             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1283             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1284             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1285
1286             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1287
1288             /* Update potential sum for this i atom from the interaction with this j atom. */
1289             velec            = _mm_and_ps(velec,cutoff_mask);
1290             velec            = _mm_andnot_ps(dummy_mask,velec);
1291             velecsum         = _mm_add_ps(velecsum,velec);
1292
1293             fscal            = felec;
1294
1295             fscal            = _mm_and_ps(fscal,cutoff_mask);
1296
1297             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1298
1299             /* Calculate temporary vectorial force */
1300             tx               = _mm_mul_ps(fscal,dx22);
1301             ty               = _mm_mul_ps(fscal,dy22);
1302             tz               = _mm_mul_ps(fscal,dz22);
1303
1304             /* Update vectorial force */
1305             fix2             = _mm_add_ps(fix2,tx);
1306             fiy2             = _mm_add_ps(fiy2,ty);
1307             fiz2             = _mm_add_ps(fiz2,tz);
1308
1309             fjx2             = _mm_add_ps(fjx2,tx);
1310             fjy2             = _mm_add_ps(fjy2,ty);
1311             fjz2             = _mm_add_ps(fjz2,tz);
1312             
1313             }
1314
1315             /**************************
1316              * CALCULATE INTERACTIONS *
1317              **************************/
1318
1319             if (gmx_mm_any_lt(rsq23,rcutoff2))
1320             {
1321
1322             r23              = _mm_mul_ps(rsq23,rinv23);
1323             r23              = _mm_andnot_ps(dummy_mask,r23);
1324
1325             /* EWALD ELECTROSTATICS */
1326
1327             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1328             ewrt             = _mm_mul_ps(r23,ewtabscale);
1329             ewitab           = _mm_cvttps_epi32(ewrt);
1330             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1331             ewitab           = _mm_slli_epi32(ewitab,2);
1332             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1333             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1334             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1335             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1336             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1337             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1338             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1339             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
1340             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1341
1342             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1343
1344             /* Update potential sum for this i atom from the interaction with this j atom. */
1345             velec            = _mm_and_ps(velec,cutoff_mask);
1346             velec            = _mm_andnot_ps(dummy_mask,velec);
1347             velecsum         = _mm_add_ps(velecsum,velec);
1348
1349             fscal            = felec;
1350
1351             fscal            = _mm_and_ps(fscal,cutoff_mask);
1352
1353             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1354
1355             /* Calculate temporary vectorial force */
1356             tx               = _mm_mul_ps(fscal,dx23);
1357             ty               = _mm_mul_ps(fscal,dy23);
1358             tz               = _mm_mul_ps(fscal,dz23);
1359
1360             /* Update vectorial force */
1361             fix2             = _mm_add_ps(fix2,tx);
1362             fiy2             = _mm_add_ps(fiy2,ty);
1363             fiz2             = _mm_add_ps(fiz2,tz);
1364
1365             fjx3             = _mm_add_ps(fjx3,tx);
1366             fjy3             = _mm_add_ps(fjy3,ty);
1367             fjz3             = _mm_add_ps(fjz3,tz);
1368             
1369             }
1370
1371             /**************************
1372              * CALCULATE INTERACTIONS *
1373              **************************/
1374
1375             if (gmx_mm_any_lt(rsq31,rcutoff2))
1376             {
1377
1378             r31              = _mm_mul_ps(rsq31,rinv31);
1379             r31              = _mm_andnot_ps(dummy_mask,r31);
1380
1381             /* EWALD ELECTROSTATICS */
1382
1383             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1384             ewrt             = _mm_mul_ps(r31,ewtabscale);
1385             ewitab           = _mm_cvttps_epi32(ewrt);
1386             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1387             ewitab           = _mm_slli_epi32(ewitab,2);
1388             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1389             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1390             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1391             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1392             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1393             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1394             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1395             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
1396             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1397
1398             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1399
1400             /* Update potential sum for this i atom from the interaction with this j atom. */
1401             velec            = _mm_and_ps(velec,cutoff_mask);
1402             velec            = _mm_andnot_ps(dummy_mask,velec);
1403             velecsum         = _mm_add_ps(velecsum,velec);
1404
1405             fscal            = felec;
1406
1407             fscal            = _mm_and_ps(fscal,cutoff_mask);
1408
1409             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1410
1411             /* Calculate temporary vectorial force */
1412             tx               = _mm_mul_ps(fscal,dx31);
1413             ty               = _mm_mul_ps(fscal,dy31);
1414             tz               = _mm_mul_ps(fscal,dz31);
1415
1416             /* Update vectorial force */
1417             fix3             = _mm_add_ps(fix3,tx);
1418             fiy3             = _mm_add_ps(fiy3,ty);
1419             fiz3             = _mm_add_ps(fiz3,tz);
1420
1421             fjx1             = _mm_add_ps(fjx1,tx);
1422             fjy1             = _mm_add_ps(fjy1,ty);
1423             fjz1             = _mm_add_ps(fjz1,tz);
1424             
1425             }
1426
1427             /**************************
1428              * CALCULATE INTERACTIONS *
1429              **************************/
1430
1431             if (gmx_mm_any_lt(rsq32,rcutoff2))
1432             {
1433
1434             r32              = _mm_mul_ps(rsq32,rinv32);
1435             r32              = _mm_andnot_ps(dummy_mask,r32);
1436
1437             /* EWALD ELECTROSTATICS */
1438
1439             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1440             ewrt             = _mm_mul_ps(r32,ewtabscale);
1441             ewitab           = _mm_cvttps_epi32(ewrt);
1442             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1443             ewitab           = _mm_slli_epi32(ewitab,2);
1444             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1445             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1446             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1447             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1448             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1449             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1450             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1451             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
1452             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1453
1454             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1455
1456             /* Update potential sum for this i atom from the interaction with this j atom. */
1457             velec            = _mm_and_ps(velec,cutoff_mask);
1458             velec            = _mm_andnot_ps(dummy_mask,velec);
1459             velecsum         = _mm_add_ps(velecsum,velec);
1460
1461             fscal            = felec;
1462
1463             fscal            = _mm_and_ps(fscal,cutoff_mask);
1464
1465             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1466
1467             /* Calculate temporary vectorial force */
1468             tx               = _mm_mul_ps(fscal,dx32);
1469             ty               = _mm_mul_ps(fscal,dy32);
1470             tz               = _mm_mul_ps(fscal,dz32);
1471
1472             /* Update vectorial force */
1473             fix3             = _mm_add_ps(fix3,tx);
1474             fiy3             = _mm_add_ps(fiy3,ty);
1475             fiz3             = _mm_add_ps(fiz3,tz);
1476
1477             fjx2             = _mm_add_ps(fjx2,tx);
1478             fjy2             = _mm_add_ps(fjy2,ty);
1479             fjz2             = _mm_add_ps(fjz2,tz);
1480             
1481             }
1482
1483             /**************************
1484              * CALCULATE INTERACTIONS *
1485              **************************/
1486
1487             if (gmx_mm_any_lt(rsq33,rcutoff2))
1488             {
1489
1490             r33              = _mm_mul_ps(rsq33,rinv33);
1491             r33              = _mm_andnot_ps(dummy_mask,r33);
1492
1493             /* EWALD ELECTROSTATICS */
1494
1495             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1496             ewrt             = _mm_mul_ps(r33,ewtabscale);
1497             ewitab           = _mm_cvttps_epi32(ewrt);
1498             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1499             ewitab           = _mm_slli_epi32(ewitab,2);
1500             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1501             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1502             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1503             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1504             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1505             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1506             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1507             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
1508             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1509
1510             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1511
1512             /* Update potential sum for this i atom from the interaction with this j atom. */
1513             velec            = _mm_and_ps(velec,cutoff_mask);
1514             velec            = _mm_andnot_ps(dummy_mask,velec);
1515             velecsum         = _mm_add_ps(velecsum,velec);
1516
1517             fscal            = felec;
1518
1519             fscal            = _mm_and_ps(fscal,cutoff_mask);
1520
1521             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1522
1523             /* Calculate temporary vectorial force */
1524             tx               = _mm_mul_ps(fscal,dx33);
1525             ty               = _mm_mul_ps(fscal,dy33);
1526             tz               = _mm_mul_ps(fscal,dz33);
1527
1528             /* Update vectorial force */
1529             fix3             = _mm_add_ps(fix3,tx);
1530             fiy3             = _mm_add_ps(fiy3,ty);
1531             fiz3             = _mm_add_ps(fiz3,tz);
1532
1533             fjx3             = _mm_add_ps(fjx3,tx);
1534             fjy3             = _mm_add_ps(fjy3,ty);
1535             fjz3             = _mm_add_ps(fjz3,tz);
1536             
1537             }
1538
1539             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1540             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1541             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1542             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1543
1544             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1545                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1546                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1547
1548             /* Inner loop uses 489 flops */
1549         }
1550
1551         /* End of innermost loop */
1552
1553         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1554                                               f+i_coord_offset,fshift+i_shift_offset);
1555
1556         ggid                        = gid[iidx];
1557         /* Update potential energies */
1558         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1559         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1560
1561         /* Increment number of inner iterations */
1562         inneriter                  += j_index_end - j_index_start;
1563
1564         /* Outer loop uses 26 flops */
1565     }
1566
1567     /* Increment number of outer iterations */
1568     outeriter        += nri;
1569
1570     /* Update outer/inner flops */
1571
1572     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*489);
1573 }
1574 /*
1575  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse2_single
1576  * Electrostatics interaction: Ewald
1577  * VdW interaction:            LJEwald
1578  * Geometry:                   Water4-Water4
1579  * Calculate force/pot:        Force
1580  */
1581 void
1582 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse2_single
1583                     (t_nblist                    * gmx_restrict       nlist,
1584                      rvec                        * gmx_restrict          xx,
1585                      rvec                        * gmx_restrict          ff,
1586                      struct t_forcerec           * gmx_restrict          fr,
1587                      t_mdatoms                   * gmx_restrict     mdatoms,
1588                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1589                      t_nrnb                      * gmx_restrict        nrnb)
1590 {
1591     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1592      * just 0 for non-waters.
1593      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1594      * jnr indices corresponding to data put in the four positions in the SIMD register.
1595      */
1596     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1597     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1598     int              jnrA,jnrB,jnrC,jnrD;
1599     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1600     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1601     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1602     real             rcutoff_scalar;
1603     real             *shiftvec,*fshift,*x,*f;
1604     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1605     real             scratch[4*DIM];
1606     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1607     int              vdwioffset0;
1608     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1609     int              vdwioffset1;
1610     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1611     int              vdwioffset2;
1612     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1613     int              vdwioffset3;
1614     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1615     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1616     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1617     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1618     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1619     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1620     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1621     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1622     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1623     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1624     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1625     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1626     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1627     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1628     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1629     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1630     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1631     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1632     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1633     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1634     real             *charge;
1635     int              nvdwtype;
1636     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1637     int              *vdwtype;
1638     real             *vdwparam;
1639     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1640     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1641     __m128           c6grid_00;
1642     __m128           c6grid_11;
1643     __m128           c6grid_12;
1644     __m128           c6grid_13;
1645     __m128           c6grid_21;
1646     __m128           c6grid_22;
1647     __m128           c6grid_23;
1648     __m128           c6grid_31;
1649     __m128           c6grid_32;
1650     __m128           c6grid_33;
1651     __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1652     real             *vdwgridparam;
1653     __m128           one_half = _mm_set1_ps(0.5);
1654     __m128           minus_one = _mm_set1_ps(-1.0);
1655     __m128i          ewitab;
1656     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1657     real             *ewtab;
1658     __m128           dummy_mask,cutoff_mask;
1659     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1660     __m128           one     = _mm_set1_ps(1.0);
1661     __m128           two     = _mm_set1_ps(2.0);
1662     x                = xx[0];
1663     f                = ff[0];
1664
1665     nri              = nlist->nri;
1666     iinr             = nlist->iinr;
1667     jindex           = nlist->jindex;
1668     jjnr             = nlist->jjnr;
1669     shiftidx         = nlist->shift;
1670     gid              = nlist->gid;
1671     shiftvec         = fr->shift_vec[0];
1672     fshift           = fr->fshift[0];
1673     facel            = _mm_set1_ps(fr->ic->epsfac);
1674     charge           = mdatoms->chargeA;
1675     nvdwtype         = fr->ntype;
1676     vdwparam         = fr->nbfp;
1677     vdwtype          = mdatoms->typeA;
1678     vdwgridparam     = fr->ljpme_c6grid;
1679     sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
1680     ewclj            = _mm_set1_ps(fr->ic->ewaldcoeff_lj);
1681     ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
1682
1683     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1684     ewtab            = fr->ic->tabq_coul_F;
1685     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1686     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1687
1688     /* Setup water-specific parameters */
1689     inr              = nlist->iinr[0];
1690     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1691     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1692     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1693     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1694
1695     jq1              = _mm_set1_ps(charge[inr+1]);
1696     jq2              = _mm_set1_ps(charge[inr+2]);
1697     jq3              = _mm_set1_ps(charge[inr+3]);
1698     vdwjidx0A        = 2*vdwtype[inr+0];
1699     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1700     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1701     c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
1702     qq11             = _mm_mul_ps(iq1,jq1);
1703     qq12             = _mm_mul_ps(iq1,jq2);
1704     qq13             = _mm_mul_ps(iq1,jq3);
1705     qq21             = _mm_mul_ps(iq2,jq1);
1706     qq22             = _mm_mul_ps(iq2,jq2);
1707     qq23             = _mm_mul_ps(iq2,jq3);
1708     qq31             = _mm_mul_ps(iq3,jq1);
1709     qq32             = _mm_mul_ps(iq3,jq2);
1710     qq33             = _mm_mul_ps(iq3,jq3);
1711
1712     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1713     rcutoff_scalar   = fr->ic->rcoulomb;
1714     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1715     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1716
1717     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
1718     rvdw             = _mm_set1_ps(fr->ic->rvdw);
1719
1720     /* Avoid stupid compiler warnings */
1721     jnrA = jnrB = jnrC = jnrD = 0;
1722     j_coord_offsetA = 0;
1723     j_coord_offsetB = 0;
1724     j_coord_offsetC = 0;
1725     j_coord_offsetD = 0;
1726
1727     outeriter        = 0;
1728     inneriter        = 0;
1729
1730     for(iidx=0;iidx<4*DIM;iidx++)
1731     {
1732         scratch[iidx] = 0.0;
1733     }  
1734
1735     /* Start outer loop over neighborlists */
1736     for(iidx=0; iidx<nri; iidx++)
1737     {
1738         /* Load shift vector for this list */
1739         i_shift_offset   = DIM*shiftidx[iidx];
1740
1741         /* Load limits for loop over neighbors */
1742         j_index_start    = jindex[iidx];
1743         j_index_end      = jindex[iidx+1];
1744
1745         /* Get outer coordinate index */
1746         inr              = iinr[iidx];
1747         i_coord_offset   = DIM*inr;
1748
1749         /* Load i particle coords and add shift vector */
1750         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1751                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1752         
1753         fix0             = _mm_setzero_ps();
1754         fiy0             = _mm_setzero_ps();
1755         fiz0             = _mm_setzero_ps();
1756         fix1             = _mm_setzero_ps();
1757         fiy1             = _mm_setzero_ps();
1758         fiz1             = _mm_setzero_ps();
1759         fix2             = _mm_setzero_ps();
1760         fiy2             = _mm_setzero_ps();
1761         fiz2             = _mm_setzero_ps();
1762         fix3             = _mm_setzero_ps();
1763         fiy3             = _mm_setzero_ps();
1764         fiz3             = _mm_setzero_ps();
1765
1766         /* Start inner kernel loop */
1767         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1768         {
1769
1770             /* Get j neighbor index, and coordinate index */
1771             jnrA             = jjnr[jidx];
1772             jnrB             = jjnr[jidx+1];
1773             jnrC             = jjnr[jidx+2];
1774             jnrD             = jjnr[jidx+3];
1775             j_coord_offsetA  = DIM*jnrA;
1776             j_coord_offsetB  = DIM*jnrB;
1777             j_coord_offsetC  = DIM*jnrC;
1778             j_coord_offsetD  = DIM*jnrD;
1779
1780             /* load j atom coordinates */
1781             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1782                                               x+j_coord_offsetC,x+j_coord_offsetD,
1783                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1784                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1785
1786             /* Calculate displacement vector */
1787             dx00             = _mm_sub_ps(ix0,jx0);
1788             dy00             = _mm_sub_ps(iy0,jy0);
1789             dz00             = _mm_sub_ps(iz0,jz0);
1790             dx11             = _mm_sub_ps(ix1,jx1);
1791             dy11             = _mm_sub_ps(iy1,jy1);
1792             dz11             = _mm_sub_ps(iz1,jz1);
1793             dx12             = _mm_sub_ps(ix1,jx2);
1794             dy12             = _mm_sub_ps(iy1,jy2);
1795             dz12             = _mm_sub_ps(iz1,jz2);
1796             dx13             = _mm_sub_ps(ix1,jx3);
1797             dy13             = _mm_sub_ps(iy1,jy3);
1798             dz13             = _mm_sub_ps(iz1,jz3);
1799             dx21             = _mm_sub_ps(ix2,jx1);
1800             dy21             = _mm_sub_ps(iy2,jy1);
1801             dz21             = _mm_sub_ps(iz2,jz1);
1802             dx22             = _mm_sub_ps(ix2,jx2);
1803             dy22             = _mm_sub_ps(iy2,jy2);
1804             dz22             = _mm_sub_ps(iz2,jz2);
1805             dx23             = _mm_sub_ps(ix2,jx3);
1806             dy23             = _mm_sub_ps(iy2,jy3);
1807             dz23             = _mm_sub_ps(iz2,jz3);
1808             dx31             = _mm_sub_ps(ix3,jx1);
1809             dy31             = _mm_sub_ps(iy3,jy1);
1810             dz31             = _mm_sub_ps(iz3,jz1);
1811             dx32             = _mm_sub_ps(ix3,jx2);
1812             dy32             = _mm_sub_ps(iy3,jy2);
1813             dz32             = _mm_sub_ps(iz3,jz2);
1814             dx33             = _mm_sub_ps(ix3,jx3);
1815             dy33             = _mm_sub_ps(iy3,jy3);
1816             dz33             = _mm_sub_ps(iz3,jz3);
1817
1818             /* Calculate squared distance and things based on it */
1819             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1820             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1821             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1822             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1823             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1824             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1825             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1826             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1827             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1828             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1829
1830             rinv00           = sse2_invsqrt_f(rsq00);
1831             rinv11           = sse2_invsqrt_f(rsq11);
1832             rinv12           = sse2_invsqrt_f(rsq12);
1833             rinv13           = sse2_invsqrt_f(rsq13);
1834             rinv21           = sse2_invsqrt_f(rsq21);
1835             rinv22           = sse2_invsqrt_f(rsq22);
1836             rinv23           = sse2_invsqrt_f(rsq23);
1837             rinv31           = sse2_invsqrt_f(rsq31);
1838             rinv32           = sse2_invsqrt_f(rsq32);
1839             rinv33           = sse2_invsqrt_f(rsq33);
1840
1841             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1842             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1843             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1844             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1845             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1846             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1847             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1848             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1849             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1850             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1851
1852             fjx0             = _mm_setzero_ps();
1853             fjy0             = _mm_setzero_ps();
1854             fjz0             = _mm_setzero_ps();
1855             fjx1             = _mm_setzero_ps();
1856             fjy1             = _mm_setzero_ps();
1857             fjz1             = _mm_setzero_ps();
1858             fjx2             = _mm_setzero_ps();
1859             fjy2             = _mm_setzero_ps();
1860             fjz2             = _mm_setzero_ps();
1861             fjx3             = _mm_setzero_ps();
1862             fjy3             = _mm_setzero_ps();
1863             fjz3             = _mm_setzero_ps();
1864
1865             /**************************
1866              * CALCULATE INTERACTIONS *
1867              **************************/
1868
1869             if (gmx_mm_any_lt(rsq00,rcutoff2))
1870             {
1871
1872             r00              = _mm_mul_ps(rsq00,rinv00);
1873
1874             /* Analytical LJ-PME */
1875             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1876             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
1877             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1878             exponent         = sse2_exp_f(ewcljrsq);
1879             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1880             poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1881             /* f6A = 6 * C6grid * (1 - poly) */
1882             f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1883             /* f6B = C6grid * exponent * beta^6 */
1884             f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1885             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1886             fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1887
1888             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1889
1890             fscal            = fvdw;
1891
1892             fscal            = _mm_and_ps(fscal,cutoff_mask);
1893
1894             /* Calculate temporary vectorial force */
1895             tx               = _mm_mul_ps(fscal,dx00);
1896             ty               = _mm_mul_ps(fscal,dy00);
1897             tz               = _mm_mul_ps(fscal,dz00);
1898
1899             /* Update vectorial force */
1900             fix0             = _mm_add_ps(fix0,tx);
1901             fiy0             = _mm_add_ps(fiy0,ty);
1902             fiz0             = _mm_add_ps(fiz0,tz);
1903
1904             fjx0             = _mm_add_ps(fjx0,tx);
1905             fjy0             = _mm_add_ps(fjy0,ty);
1906             fjz0             = _mm_add_ps(fjz0,tz);
1907             
1908             }
1909
1910             /**************************
1911              * CALCULATE INTERACTIONS *
1912              **************************/
1913
1914             if (gmx_mm_any_lt(rsq11,rcutoff2))
1915             {
1916
1917             r11              = _mm_mul_ps(rsq11,rinv11);
1918
1919             /* EWALD ELECTROSTATICS */
1920
1921             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1922             ewrt             = _mm_mul_ps(r11,ewtabscale);
1923             ewitab           = _mm_cvttps_epi32(ewrt);
1924             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1925             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1926                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1927                                          &ewtabF,&ewtabFn);
1928             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1929             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1930
1931             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1932
1933             fscal            = felec;
1934
1935             fscal            = _mm_and_ps(fscal,cutoff_mask);
1936
1937             /* Calculate temporary vectorial force */
1938             tx               = _mm_mul_ps(fscal,dx11);
1939             ty               = _mm_mul_ps(fscal,dy11);
1940             tz               = _mm_mul_ps(fscal,dz11);
1941
1942             /* Update vectorial force */
1943             fix1             = _mm_add_ps(fix1,tx);
1944             fiy1             = _mm_add_ps(fiy1,ty);
1945             fiz1             = _mm_add_ps(fiz1,tz);
1946
1947             fjx1             = _mm_add_ps(fjx1,tx);
1948             fjy1             = _mm_add_ps(fjy1,ty);
1949             fjz1             = _mm_add_ps(fjz1,tz);
1950             
1951             }
1952
1953             /**************************
1954              * CALCULATE INTERACTIONS *
1955              **************************/
1956
1957             if (gmx_mm_any_lt(rsq12,rcutoff2))
1958             {
1959
1960             r12              = _mm_mul_ps(rsq12,rinv12);
1961
1962             /* EWALD ELECTROSTATICS */
1963
1964             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1965             ewrt             = _mm_mul_ps(r12,ewtabscale);
1966             ewitab           = _mm_cvttps_epi32(ewrt);
1967             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1968             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1969                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1970                                          &ewtabF,&ewtabFn);
1971             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1972             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1973
1974             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1975
1976             fscal            = felec;
1977
1978             fscal            = _mm_and_ps(fscal,cutoff_mask);
1979
1980             /* Calculate temporary vectorial force */
1981             tx               = _mm_mul_ps(fscal,dx12);
1982             ty               = _mm_mul_ps(fscal,dy12);
1983             tz               = _mm_mul_ps(fscal,dz12);
1984
1985             /* Update vectorial force */
1986             fix1             = _mm_add_ps(fix1,tx);
1987             fiy1             = _mm_add_ps(fiy1,ty);
1988             fiz1             = _mm_add_ps(fiz1,tz);
1989
1990             fjx2             = _mm_add_ps(fjx2,tx);
1991             fjy2             = _mm_add_ps(fjy2,ty);
1992             fjz2             = _mm_add_ps(fjz2,tz);
1993             
1994             }
1995
1996             /**************************
1997              * CALCULATE INTERACTIONS *
1998              **************************/
1999
2000             if (gmx_mm_any_lt(rsq13,rcutoff2))
2001             {
2002
2003             r13              = _mm_mul_ps(rsq13,rinv13);
2004
2005             /* EWALD ELECTROSTATICS */
2006
2007             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2008             ewrt             = _mm_mul_ps(r13,ewtabscale);
2009             ewitab           = _mm_cvttps_epi32(ewrt);
2010             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2011             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2012                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2013                                          &ewtabF,&ewtabFn);
2014             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2015             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2016
2017             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2018
2019             fscal            = felec;
2020
2021             fscal            = _mm_and_ps(fscal,cutoff_mask);
2022
2023             /* Calculate temporary vectorial force */
2024             tx               = _mm_mul_ps(fscal,dx13);
2025             ty               = _mm_mul_ps(fscal,dy13);
2026             tz               = _mm_mul_ps(fscal,dz13);
2027
2028             /* Update vectorial force */
2029             fix1             = _mm_add_ps(fix1,tx);
2030             fiy1             = _mm_add_ps(fiy1,ty);
2031             fiz1             = _mm_add_ps(fiz1,tz);
2032
2033             fjx3             = _mm_add_ps(fjx3,tx);
2034             fjy3             = _mm_add_ps(fjy3,ty);
2035             fjz3             = _mm_add_ps(fjz3,tz);
2036             
2037             }
2038
2039             /**************************
2040              * CALCULATE INTERACTIONS *
2041              **************************/
2042
2043             if (gmx_mm_any_lt(rsq21,rcutoff2))
2044             {
2045
2046             r21              = _mm_mul_ps(rsq21,rinv21);
2047
2048             /* EWALD ELECTROSTATICS */
2049
2050             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2051             ewrt             = _mm_mul_ps(r21,ewtabscale);
2052             ewitab           = _mm_cvttps_epi32(ewrt);
2053             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2054             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2055                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2056                                          &ewtabF,&ewtabFn);
2057             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2058             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2059
2060             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2061
2062             fscal            = felec;
2063
2064             fscal            = _mm_and_ps(fscal,cutoff_mask);
2065
2066             /* Calculate temporary vectorial force */
2067             tx               = _mm_mul_ps(fscal,dx21);
2068             ty               = _mm_mul_ps(fscal,dy21);
2069             tz               = _mm_mul_ps(fscal,dz21);
2070
2071             /* Update vectorial force */
2072             fix2             = _mm_add_ps(fix2,tx);
2073             fiy2             = _mm_add_ps(fiy2,ty);
2074             fiz2             = _mm_add_ps(fiz2,tz);
2075
2076             fjx1             = _mm_add_ps(fjx1,tx);
2077             fjy1             = _mm_add_ps(fjy1,ty);
2078             fjz1             = _mm_add_ps(fjz1,tz);
2079             
2080             }
2081
2082             /**************************
2083              * CALCULATE INTERACTIONS *
2084              **************************/
2085
2086             if (gmx_mm_any_lt(rsq22,rcutoff2))
2087             {
2088
2089             r22              = _mm_mul_ps(rsq22,rinv22);
2090
2091             /* EWALD ELECTROSTATICS */
2092
2093             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2094             ewrt             = _mm_mul_ps(r22,ewtabscale);
2095             ewitab           = _mm_cvttps_epi32(ewrt);
2096             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2097             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2098                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2099                                          &ewtabF,&ewtabFn);
2100             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2101             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2102
2103             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2104
2105             fscal            = felec;
2106
2107             fscal            = _mm_and_ps(fscal,cutoff_mask);
2108
2109             /* Calculate temporary vectorial force */
2110             tx               = _mm_mul_ps(fscal,dx22);
2111             ty               = _mm_mul_ps(fscal,dy22);
2112             tz               = _mm_mul_ps(fscal,dz22);
2113
2114             /* Update vectorial force */
2115             fix2             = _mm_add_ps(fix2,tx);
2116             fiy2             = _mm_add_ps(fiy2,ty);
2117             fiz2             = _mm_add_ps(fiz2,tz);
2118
2119             fjx2             = _mm_add_ps(fjx2,tx);
2120             fjy2             = _mm_add_ps(fjy2,ty);
2121             fjz2             = _mm_add_ps(fjz2,tz);
2122             
2123             }
2124
2125             /**************************
2126              * CALCULATE INTERACTIONS *
2127              **************************/
2128
2129             if (gmx_mm_any_lt(rsq23,rcutoff2))
2130             {
2131
2132             r23              = _mm_mul_ps(rsq23,rinv23);
2133
2134             /* EWALD ELECTROSTATICS */
2135
2136             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2137             ewrt             = _mm_mul_ps(r23,ewtabscale);
2138             ewitab           = _mm_cvttps_epi32(ewrt);
2139             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2140             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2141                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2142                                          &ewtabF,&ewtabFn);
2143             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2144             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2145
2146             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2147
2148             fscal            = felec;
2149
2150             fscal            = _mm_and_ps(fscal,cutoff_mask);
2151
2152             /* Calculate temporary vectorial force */
2153             tx               = _mm_mul_ps(fscal,dx23);
2154             ty               = _mm_mul_ps(fscal,dy23);
2155             tz               = _mm_mul_ps(fscal,dz23);
2156
2157             /* Update vectorial force */
2158             fix2             = _mm_add_ps(fix2,tx);
2159             fiy2             = _mm_add_ps(fiy2,ty);
2160             fiz2             = _mm_add_ps(fiz2,tz);
2161
2162             fjx3             = _mm_add_ps(fjx3,tx);
2163             fjy3             = _mm_add_ps(fjy3,ty);
2164             fjz3             = _mm_add_ps(fjz3,tz);
2165             
2166             }
2167
2168             /**************************
2169              * CALCULATE INTERACTIONS *
2170              **************************/
2171
2172             if (gmx_mm_any_lt(rsq31,rcutoff2))
2173             {
2174
2175             r31              = _mm_mul_ps(rsq31,rinv31);
2176
2177             /* EWALD ELECTROSTATICS */
2178
2179             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2180             ewrt             = _mm_mul_ps(r31,ewtabscale);
2181             ewitab           = _mm_cvttps_epi32(ewrt);
2182             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2183             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2184                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2185                                          &ewtabF,&ewtabFn);
2186             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2187             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2188
2189             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2190
2191             fscal            = felec;
2192
2193             fscal            = _mm_and_ps(fscal,cutoff_mask);
2194
2195             /* Calculate temporary vectorial force */
2196             tx               = _mm_mul_ps(fscal,dx31);
2197             ty               = _mm_mul_ps(fscal,dy31);
2198             tz               = _mm_mul_ps(fscal,dz31);
2199
2200             /* Update vectorial force */
2201             fix3             = _mm_add_ps(fix3,tx);
2202             fiy3             = _mm_add_ps(fiy3,ty);
2203             fiz3             = _mm_add_ps(fiz3,tz);
2204
2205             fjx1             = _mm_add_ps(fjx1,tx);
2206             fjy1             = _mm_add_ps(fjy1,ty);
2207             fjz1             = _mm_add_ps(fjz1,tz);
2208             
2209             }
2210
2211             /**************************
2212              * CALCULATE INTERACTIONS *
2213              **************************/
2214
2215             if (gmx_mm_any_lt(rsq32,rcutoff2))
2216             {
2217
2218             r32              = _mm_mul_ps(rsq32,rinv32);
2219
2220             /* EWALD ELECTROSTATICS */
2221
2222             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2223             ewrt             = _mm_mul_ps(r32,ewtabscale);
2224             ewitab           = _mm_cvttps_epi32(ewrt);
2225             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2226             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2227                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2228                                          &ewtabF,&ewtabFn);
2229             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2230             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2231
2232             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2233
2234             fscal            = felec;
2235
2236             fscal            = _mm_and_ps(fscal,cutoff_mask);
2237
2238             /* Calculate temporary vectorial force */
2239             tx               = _mm_mul_ps(fscal,dx32);
2240             ty               = _mm_mul_ps(fscal,dy32);
2241             tz               = _mm_mul_ps(fscal,dz32);
2242
2243             /* Update vectorial force */
2244             fix3             = _mm_add_ps(fix3,tx);
2245             fiy3             = _mm_add_ps(fiy3,ty);
2246             fiz3             = _mm_add_ps(fiz3,tz);
2247
2248             fjx2             = _mm_add_ps(fjx2,tx);
2249             fjy2             = _mm_add_ps(fjy2,ty);
2250             fjz2             = _mm_add_ps(fjz2,tz);
2251             
2252             }
2253
2254             /**************************
2255              * CALCULATE INTERACTIONS *
2256              **************************/
2257
2258             if (gmx_mm_any_lt(rsq33,rcutoff2))
2259             {
2260
2261             r33              = _mm_mul_ps(rsq33,rinv33);
2262
2263             /* EWALD ELECTROSTATICS */
2264
2265             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2266             ewrt             = _mm_mul_ps(r33,ewtabscale);
2267             ewitab           = _mm_cvttps_epi32(ewrt);
2268             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2269             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2270                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2271                                          &ewtabF,&ewtabFn);
2272             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2273             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2274
2275             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2276
2277             fscal            = felec;
2278
2279             fscal            = _mm_and_ps(fscal,cutoff_mask);
2280
2281             /* Calculate temporary vectorial force */
2282             tx               = _mm_mul_ps(fscal,dx33);
2283             ty               = _mm_mul_ps(fscal,dy33);
2284             tz               = _mm_mul_ps(fscal,dz33);
2285
2286             /* Update vectorial force */
2287             fix3             = _mm_add_ps(fix3,tx);
2288             fiy3             = _mm_add_ps(fiy3,ty);
2289             fiz3             = _mm_add_ps(fiz3,tz);
2290
2291             fjx3             = _mm_add_ps(fjx3,tx);
2292             fjy3             = _mm_add_ps(fjy3,ty);
2293             fjz3             = _mm_add_ps(fjz3,tz);
2294             
2295             }
2296
2297             fjptrA             = f+j_coord_offsetA;
2298             fjptrB             = f+j_coord_offsetB;
2299             fjptrC             = f+j_coord_offsetC;
2300             fjptrD             = f+j_coord_offsetD;
2301
2302             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2303                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2304                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2305
2306             /* Inner loop uses 403 flops */
2307         }
2308
2309         if(jidx<j_index_end)
2310         {
2311
2312             /* Get j neighbor index, and coordinate index */
2313             jnrlistA         = jjnr[jidx];
2314             jnrlistB         = jjnr[jidx+1];
2315             jnrlistC         = jjnr[jidx+2];
2316             jnrlistD         = jjnr[jidx+3];
2317             /* Sign of each element will be negative for non-real atoms.
2318              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2319              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2320              */
2321             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2322             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2323             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2324             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2325             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2326             j_coord_offsetA  = DIM*jnrA;
2327             j_coord_offsetB  = DIM*jnrB;
2328             j_coord_offsetC  = DIM*jnrC;
2329             j_coord_offsetD  = DIM*jnrD;
2330
2331             /* load j atom coordinates */
2332             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2333                                               x+j_coord_offsetC,x+j_coord_offsetD,
2334                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2335                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2336
2337             /* Calculate displacement vector */
2338             dx00             = _mm_sub_ps(ix0,jx0);
2339             dy00             = _mm_sub_ps(iy0,jy0);
2340             dz00             = _mm_sub_ps(iz0,jz0);
2341             dx11             = _mm_sub_ps(ix1,jx1);
2342             dy11             = _mm_sub_ps(iy1,jy1);
2343             dz11             = _mm_sub_ps(iz1,jz1);
2344             dx12             = _mm_sub_ps(ix1,jx2);
2345             dy12             = _mm_sub_ps(iy1,jy2);
2346             dz12             = _mm_sub_ps(iz1,jz2);
2347             dx13             = _mm_sub_ps(ix1,jx3);
2348             dy13             = _mm_sub_ps(iy1,jy3);
2349             dz13             = _mm_sub_ps(iz1,jz3);
2350             dx21             = _mm_sub_ps(ix2,jx1);
2351             dy21             = _mm_sub_ps(iy2,jy1);
2352             dz21             = _mm_sub_ps(iz2,jz1);
2353             dx22             = _mm_sub_ps(ix2,jx2);
2354             dy22             = _mm_sub_ps(iy2,jy2);
2355             dz22             = _mm_sub_ps(iz2,jz2);
2356             dx23             = _mm_sub_ps(ix2,jx3);
2357             dy23             = _mm_sub_ps(iy2,jy3);
2358             dz23             = _mm_sub_ps(iz2,jz3);
2359             dx31             = _mm_sub_ps(ix3,jx1);
2360             dy31             = _mm_sub_ps(iy3,jy1);
2361             dz31             = _mm_sub_ps(iz3,jz1);
2362             dx32             = _mm_sub_ps(ix3,jx2);
2363             dy32             = _mm_sub_ps(iy3,jy2);
2364             dz32             = _mm_sub_ps(iz3,jz2);
2365             dx33             = _mm_sub_ps(ix3,jx3);
2366             dy33             = _mm_sub_ps(iy3,jy3);
2367             dz33             = _mm_sub_ps(iz3,jz3);
2368
2369             /* Calculate squared distance and things based on it */
2370             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2371             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2372             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2373             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2374             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2375             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2376             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2377             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2378             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2379             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2380
2381             rinv00           = sse2_invsqrt_f(rsq00);
2382             rinv11           = sse2_invsqrt_f(rsq11);
2383             rinv12           = sse2_invsqrt_f(rsq12);
2384             rinv13           = sse2_invsqrt_f(rsq13);
2385             rinv21           = sse2_invsqrt_f(rsq21);
2386             rinv22           = sse2_invsqrt_f(rsq22);
2387             rinv23           = sse2_invsqrt_f(rsq23);
2388             rinv31           = sse2_invsqrt_f(rsq31);
2389             rinv32           = sse2_invsqrt_f(rsq32);
2390             rinv33           = sse2_invsqrt_f(rsq33);
2391
2392             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2393             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2394             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2395             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2396             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2397             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2398             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2399             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2400             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2401             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2402
2403             fjx0             = _mm_setzero_ps();
2404             fjy0             = _mm_setzero_ps();
2405             fjz0             = _mm_setzero_ps();
2406             fjx1             = _mm_setzero_ps();
2407             fjy1             = _mm_setzero_ps();
2408             fjz1             = _mm_setzero_ps();
2409             fjx2             = _mm_setzero_ps();
2410             fjy2             = _mm_setzero_ps();
2411             fjz2             = _mm_setzero_ps();
2412             fjx3             = _mm_setzero_ps();
2413             fjy3             = _mm_setzero_ps();
2414             fjz3             = _mm_setzero_ps();
2415
2416             /**************************
2417              * CALCULATE INTERACTIONS *
2418              **************************/
2419
2420             if (gmx_mm_any_lt(rsq00,rcutoff2))
2421             {
2422
2423             r00              = _mm_mul_ps(rsq00,rinv00);
2424             r00              = _mm_andnot_ps(dummy_mask,r00);
2425
2426             /* Analytical LJ-PME */
2427             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2428             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
2429             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
2430             exponent         = sse2_exp_f(ewcljrsq);
2431             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2432             poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
2433             /* f6A = 6 * C6grid * (1 - poly) */
2434             f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
2435             /* f6B = C6grid * exponent * beta^6 */
2436             f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
2437             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2438             fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2439
2440             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2441
2442             fscal            = fvdw;
2443
2444             fscal            = _mm_and_ps(fscal,cutoff_mask);
2445
2446             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2447
2448             /* Calculate temporary vectorial force */
2449             tx               = _mm_mul_ps(fscal,dx00);
2450             ty               = _mm_mul_ps(fscal,dy00);
2451             tz               = _mm_mul_ps(fscal,dz00);
2452
2453             /* Update vectorial force */
2454             fix0             = _mm_add_ps(fix0,tx);
2455             fiy0             = _mm_add_ps(fiy0,ty);
2456             fiz0             = _mm_add_ps(fiz0,tz);
2457
2458             fjx0             = _mm_add_ps(fjx0,tx);
2459             fjy0             = _mm_add_ps(fjy0,ty);
2460             fjz0             = _mm_add_ps(fjz0,tz);
2461             
2462             }
2463
2464             /**************************
2465              * CALCULATE INTERACTIONS *
2466              **************************/
2467
2468             if (gmx_mm_any_lt(rsq11,rcutoff2))
2469             {
2470
2471             r11              = _mm_mul_ps(rsq11,rinv11);
2472             r11              = _mm_andnot_ps(dummy_mask,r11);
2473
2474             /* EWALD ELECTROSTATICS */
2475
2476             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2477             ewrt             = _mm_mul_ps(r11,ewtabscale);
2478             ewitab           = _mm_cvttps_epi32(ewrt);
2479             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2480             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2481                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2482                                          &ewtabF,&ewtabFn);
2483             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2484             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2485
2486             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2487
2488             fscal            = felec;
2489
2490             fscal            = _mm_and_ps(fscal,cutoff_mask);
2491
2492             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2493
2494             /* Calculate temporary vectorial force */
2495             tx               = _mm_mul_ps(fscal,dx11);
2496             ty               = _mm_mul_ps(fscal,dy11);
2497             tz               = _mm_mul_ps(fscal,dz11);
2498
2499             /* Update vectorial force */
2500             fix1             = _mm_add_ps(fix1,tx);
2501             fiy1             = _mm_add_ps(fiy1,ty);
2502             fiz1             = _mm_add_ps(fiz1,tz);
2503
2504             fjx1             = _mm_add_ps(fjx1,tx);
2505             fjy1             = _mm_add_ps(fjy1,ty);
2506             fjz1             = _mm_add_ps(fjz1,tz);
2507             
2508             }
2509
2510             /**************************
2511              * CALCULATE INTERACTIONS *
2512              **************************/
2513
2514             if (gmx_mm_any_lt(rsq12,rcutoff2))
2515             {
2516
2517             r12              = _mm_mul_ps(rsq12,rinv12);
2518             r12              = _mm_andnot_ps(dummy_mask,r12);
2519
2520             /* EWALD ELECTROSTATICS */
2521
2522             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2523             ewrt             = _mm_mul_ps(r12,ewtabscale);
2524             ewitab           = _mm_cvttps_epi32(ewrt);
2525             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2526             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2527                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2528                                          &ewtabF,&ewtabFn);
2529             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2530             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2531
2532             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2533
2534             fscal            = felec;
2535
2536             fscal            = _mm_and_ps(fscal,cutoff_mask);
2537
2538             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2539
2540             /* Calculate temporary vectorial force */
2541             tx               = _mm_mul_ps(fscal,dx12);
2542             ty               = _mm_mul_ps(fscal,dy12);
2543             tz               = _mm_mul_ps(fscal,dz12);
2544
2545             /* Update vectorial force */
2546             fix1             = _mm_add_ps(fix1,tx);
2547             fiy1             = _mm_add_ps(fiy1,ty);
2548             fiz1             = _mm_add_ps(fiz1,tz);
2549
2550             fjx2             = _mm_add_ps(fjx2,tx);
2551             fjy2             = _mm_add_ps(fjy2,ty);
2552             fjz2             = _mm_add_ps(fjz2,tz);
2553             
2554             }
2555
2556             /**************************
2557              * CALCULATE INTERACTIONS *
2558              **************************/
2559
2560             if (gmx_mm_any_lt(rsq13,rcutoff2))
2561             {
2562
2563             r13              = _mm_mul_ps(rsq13,rinv13);
2564             r13              = _mm_andnot_ps(dummy_mask,r13);
2565
2566             /* EWALD ELECTROSTATICS */
2567
2568             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2569             ewrt             = _mm_mul_ps(r13,ewtabscale);
2570             ewitab           = _mm_cvttps_epi32(ewrt);
2571             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2572             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2573                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2574                                          &ewtabF,&ewtabFn);
2575             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2576             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2577
2578             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2579
2580             fscal            = felec;
2581
2582             fscal            = _mm_and_ps(fscal,cutoff_mask);
2583
2584             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2585
2586             /* Calculate temporary vectorial force */
2587             tx               = _mm_mul_ps(fscal,dx13);
2588             ty               = _mm_mul_ps(fscal,dy13);
2589             tz               = _mm_mul_ps(fscal,dz13);
2590
2591             /* Update vectorial force */
2592             fix1             = _mm_add_ps(fix1,tx);
2593             fiy1             = _mm_add_ps(fiy1,ty);
2594             fiz1             = _mm_add_ps(fiz1,tz);
2595
2596             fjx3             = _mm_add_ps(fjx3,tx);
2597             fjy3             = _mm_add_ps(fjy3,ty);
2598             fjz3             = _mm_add_ps(fjz3,tz);
2599             
2600             }
2601
2602             /**************************
2603              * CALCULATE INTERACTIONS *
2604              **************************/
2605
2606             if (gmx_mm_any_lt(rsq21,rcutoff2))
2607             {
2608
2609             r21              = _mm_mul_ps(rsq21,rinv21);
2610             r21              = _mm_andnot_ps(dummy_mask,r21);
2611
2612             /* EWALD ELECTROSTATICS */
2613
2614             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2615             ewrt             = _mm_mul_ps(r21,ewtabscale);
2616             ewitab           = _mm_cvttps_epi32(ewrt);
2617             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2618             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2619                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2620                                          &ewtabF,&ewtabFn);
2621             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2622             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2623
2624             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2625
2626             fscal            = felec;
2627
2628             fscal            = _mm_and_ps(fscal,cutoff_mask);
2629
2630             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2631
2632             /* Calculate temporary vectorial force */
2633             tx               = _mm_mul_ps(fscal,dx21);
2634             ty               = _mm_mul_ps(fscal,dy21);
2635             tz               = _mm_mul_ps(fscal,dz21);
2636
2637             /* Update vectorial force */
2638             fix2             = _mm_add_ps(fix2,tx);
2639             fiy2             = _mm_add_ps(fiy2,ty);
2640             fiz2             = _mm_add_ps(fiz2,tz);
2641
2642             fjx1             = _mm_add_ps(fjx1,tx);
2643             fjy1             = _mm_add_ps(fjy1,ty);
2644             fjz1             = _mm_add_ps(fjz1,tz);
2645             
2646             }
2647
2648             /**************************
2649              * CALCULATE INTERACTIONS *
2650              **************************/
2651
2652             if (gmx_mm_any_lt(rsq22,rcutoff2))
2653             {
2654
2655             r22              = _mm_mul_ps(rsq22,rinv22);
2656             r22              = _mm_andnot_ps(dummy_mask,r22);
2657
2658             /* EWALD ELECTROSTATICS */
2659
2660             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2661             ewrt             = _mm_mul_ps(r22,ewtabscale);
2662             ewitab           = _mm_cvttps_epi32(ewrt);
2663             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2664             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2665                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2666                                          &ewtabF,&ewtabFn);
2667             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2668             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2669
2670             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2671
2672             fscal            = felec;
2673
2674             fscal            = _mm_and_ps(fscal,cutoff_mask);
2675
2676             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2677
2678             /* Calculate temporary vectorial force */
2679             tx               = _mm_mul_ps(fscal,dx22);
2680             ty               = _mm_mul_ps(fscal,dy22);
2681             tz               = _mm_mul_ps(fscal,dz22);
2682
2683             /* Update vectorial force */
2684             fix2             = _mm_add_ps(fix2,tx);
2685             fiy2             = _mm_add_ps(fiy2,ty);
2686             fiz2             = _mm_add_ps(fiz2,tz);
2687
2688             fjx2             = _mm_add_ps(fjx2,tx);
2689             fjy2             = _mm_add_ps(fjy2,ty);
2690             fjz2             = _mm_add_ps(fjz2,tz);
2691             
2692             }
2693
2694             /**************************
2695              * CALCULATE INTERACTIONS *
2696              **************************/
2697
2698             if (gmx_mm_any_lt(rsq23,rcutoff2))
2699             {
2700
2701             r23              = _mm_mul_ps(rsq23,rinv23);
2702             r23              = _mm_andnot_ps(dummy_mask,r23);
2703
2704             /* EWALD ELECTROSTATICS */
2705
2706             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2707             ewrt             = _mm_mul_ps(r23,ewtabscale);
2708             ewitab           = _mm_cvttps_epi32(ewrt);
2709             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2710             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2711                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2712                                          &ewtabF,&ewtabFn);
2713             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2714             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2715
2716             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2717
2718             fscal            = felec;
2719
2720             fscal            = _mm_and_ps(fscal,cutoff_mask);
2721
2722             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2723
2724             /* Calculate temporary vectorial force */
2725             tx               = _mm_mul_ps(fscal,dx23);
2726             ty               = _mm_mul_ps(fscal,dy23);
2727             tz               = _mm_mul_ps(fscal,dz23);
2728
2729             /* Update vectorial force */
2730             fix2             = _mm_add_ps(fix2,tx);
2731             fiy2             = _mm_add_ps(fiy2,ty);
2732             fiz2             = _mm_add_ps(fiz2,tz);
2733
2734             fjx3             = _mm_add_ps(fjx3,tx);
2735             fjy3             = _mm_add_ps(fjy3,ty);
2736             fjz3             = _mm_add_ps(fjz3,tz);
2737             
2738             }
2739
2740             /**************************
2741              * CALCULATE INTERACTIONS *
2742              **************************/
2743
2744             if (gmx_mm_any_lt(rsq31,rcutoff2))
2745             {
2746
2747             r31              = _mm_mul_ps(rsq31,rinv31);
2748             r31              = _mm_andnot_ps(dummy_mask,r31);
2749
2750             /* EWALD ELECTROSTATICS */
2751
2752             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2753             ewrt             = _mm_mul_ps(r31,ewtabscale);
2754             ewitab           = _mm_cvttps_epi32(ewrt);
2755             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2756             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2757                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2758                                          &ewtabF,&ewtabFn);
2759             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2760             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2761
2762             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2763
2764             fscal            = felec;
2765
2766             fscal            = _mm_and_ps(fscal,cutoff_mask);
2767
2768             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2769
2770             /* Calculate temporary vectorial force */
2771             tx               = _mm_mul_ps(fscal,dx31);
2772             ty               = _mm_mul_ps(fscal,dy31);
2773             tz               = _mm_mul_ps(fscal,dz31);
2774
2775             /* Update vectorial force */
2776             fix3             = _mm_add_ps(fix3,tx);
2777             fiy3             = _mm_add_ps(fiy3,ty);
2778             fiz3             = _mm_add_ps(fiz3,tz);
2779
2780             fjx1             = _mm_add_ps(fjx1,tx);
2781             fjy1             = _mm_add_ps(fjy1,ty);
2782             fjz1             = _mm_add_ps(fjz1,tz);
2783             
2784             }
2785
2786             /**************************
2787              * CALCULATE INTERACTIONS *
2788              **************************/
2789
2790             if (gmx_mm_any_lt(rsq32,rcutoff2))
2791             {
2792
2793             r32              = _mm_mul_ps(rsq32,rinv32);
2794             r32              = _mm_andnot_ps(dummy_mask,r32);
2795
2796             /* EWALD ELECTROSTATICS */
2797
2798             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2799             ewrt             = _mm_mul_ps(r32,ewtabscale);
2800             ewitab           = _mm_cvttps_epi32(ewrt);
2801             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2802             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2803                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2804                                          &ewtabF,&ewtabFn);
2805             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2806             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2807
2808             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2809
2810             fscal            = felec;
2811
2812             fscal            = _mm_and_ps(fscal,cutoff_mask);
2813
2814             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2815
2816             /* Calculate temporary vectorial force */
2817             tx               = _mm_mul_ps(fscal,dx32);
2818             ty               = _mm_mul_ps(fscal,dy32);
2819             tz               = _mm_mul_ps(fscal,dz32);
2820
2821             /* Update vectorial force */
2822             fix3             = _mm_add_ps(fix3,tx);
2823             fiy3             = _mm_add_ps(fiy3,ty);
2824             fiz3             = _mm_add_ps(fiz3,tz);
2825
2826             fjx2             = _mm_add_ps(fjx2,tx);
2827             fjy2             = _mm_add_ps(fjy2,ty);
2828             fjz2             = _mm_add_ps(fjz2,tz);
2829             
2830             }
2831
2832             /**************************
2833              * CALCULATE INTERACTIONS *
2834              **************************/
2835
2836             if (gmx_mm_any_lt(rsq33,rcutoff2))
2837             {
2838
2839             r33              = _mm_mul_ps(rsq33,rinv33);
2840             r33              = _mm_andnot_ps(dummy_mask,r33);
2841
2842             /* EWALD ELECTROSTATICS */
2843
2844             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2845             ewrt             = _mm_mul_ps(r33,ewtabscale);
2846             ewitab           = _mm_cvttps_epi32(ewrt);
2847             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2848             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2849                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2850                                          &ewtabF,&ewtabFn);
2851             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2852             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2853
2854             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2855
2856             fscal            = felec;
2857
2858             fscal            = _mm_and_ps(fscal,cutoff_mask);
2859
2860             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2861
2862             /* Calculate temporary vectorial force */
2863             tx               = _mm_mul_ps(fscal,dx33);
2864             ty               = _mm_mul_ps(fscal,dy33);
2865             tz               = _mm_mul_ps(fscal,dz33);
2866
2867             /* Update vectorial force */
2868             fix3             = _mm_add_ps(fix3,tx);
2869             fiy3             = _mm_add_ps(fiy3,ty);
2870             fiz3             = _mm_add_ps(fiz3,tz);
2871
2872             fjx3             = _mm_add_ps(fjx3,tx);
2873             fjy3             = _mm_add_ps(fjy3,ty);
2874             fjz3             = _mm_add_ps(fjz3,tz);
2875             
2876             }
2877
2878             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2879             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2880             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2881             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2882
2883             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2884                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2885                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2886
2887             /* Inner loop uses 413 flops */
2888         }
2889
2890         /* End of innermost loop */
2891
2892         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2893                                               f+i_coord_offset,fshift+i_shift_offset);
2894
2895         /* Increment number of inner iterations */
2896         inneriter                  += j_index_end - j_index_start;
2897
2898         /* Outer loop uses 24 flops */
2899     }
2900
2901     /* Increment number of outer iterations */
2902     outeriter        += nri;
2903
2904     /* Update outer/inner flops */
2905
2906     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*413);
2907 }