Merge release-5-0 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse2_single.h"
50 #include "kernelutil_x86_sse2_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_single
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            LJEwald
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     int              nvdwtype;
109     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110     int              *vdwtype;
111     real             *vdwparam;
112     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
113     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
114     __m128           c6grid_00;
115     __m128           c6grid_01;
116     __m128           c6grid_02;
117     __m128           c6grid_10;
118     __m128           c6grid_11;
119     __m128           c6grid_12;
120     __m128           c6grid_20;
121     __m128           c6grid_21;
122     __m128           c6grid_22;
123     __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
124     real             *vdwgridparam;
125     __m128           one_half = _mm_set1_ps(0.5);
126     __m128           minus_one = _mm_set1_ps(-1.0);
127     __m128i          ewitab;
128     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
129     real             *ewtab;
130     __m128           dummy_mask,cutoff_mask;
131     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
132     __m128           one     = _mm_set1_ps(1.0);
133     __m128           two     = _mm_set1_ps(2.0);
134     x                = xx[0];
135     f                = ff[0];
136
137     nri              = nlist->nri;
138     iinr             = nlist->iinr;
139     jindex           = nlist->jindex;
140     jjnr             = nlist->jjnr;
141     shiftidx         = nlist->shift;
142     gid              = nlist->gid;
143     shiftvec         = fr->shift_vec[0];
144     fshift           = fr->fshift[0];
145     facel            = _mm_set1_ps(fr->epsfac);
146     charge           = mdatoms->chargeA;
147     nvdwtype         = fr->ntype;
148     vdwparam         = fr->nbfp;
149     vdwtype          = mdatoms->typeA;
150     vdwgridparam     = fr->ljpme_c6grid;
151     sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
152     ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
153     ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
154
155     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
156     ewtab            = fr->ic->tabq_coul_FDV0;
157     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
158     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
159
160     /* Setup water-specific parameters */
161     inr              = nlist->iinr[0];
162     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
163     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
164     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
165     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
166
167     jq0              = _mm_set1_ps(charge[inr+0]);
168     jq1              = _mm_set1_ps(charge[inr+1]);
169     jq2              = _mm_set1_ps(charge[inr+2]);
170     vdwjidx0A        = 2*vdwtype[inr+0];
171     qq00             = _mm_mul_ps(iq0,jq0);
172     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
173     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
174     c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
175     qq01             = _mm_mul_ps(iq0,jq1);
176     qq02             = _mm_mul_ps(iq0,jq2);
177     qq10             = _mm_mul_ps(iq1,jq0);
178     qq11             = _mm_mul_ps(iq1,jq1);
179     qq12             = _mm_mul_ps(iq1,jq2);
180     qq20             = _mm_mul_ps(iq2,jq0);
181     qq21             = _mm_mul_ps(iq2,jq1);
182     qq22             = _mm_mul_ps(iq2,jq2);
183
184     /* Avoid stupid compiler warnings */
185     jnrA = jnrB = jnrC = jnrD = 0;
186     j_coord_offsetA = 0;
187     j_coord_offsetB = 0;
188     j_coord_offsetC = 0;
189     j_coord_offsetD = 0;
190
191     outeriter        = 0;
192     inneriter        = 0;
193
194     for(iidx=0;iidx<4*DIM;iidx++)
195     {
196         scratch[iidx] = 0.0;
197     }  
198
199     /* Start outer loop over neighborlists */
200     for(iidx=0; iidx<nri; iidx++)
201     {
202         /* Load shift vector for this list */
203         i_shift_offset   = DIM*shiftidx[iidx];
204
205         /* Load limits for loop over neighbors */
206         j_index_start    = jindex[iidx];
207         j_index_end      = jindex[iidx+1];
208
209         /* Get outer coordinate index */
210         inr              = iinr[iidx];
211         i_coord_offset   = DIM*inr;
212
213         /* Load i particle coords and add shift vector */
214         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
215                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
216         
217         fix0             = _mm_setzero_ps();
218         fiy0             = _mm_setzero_ps();
219         fiz0             = _mm_setzero_ps();
220         fix1             = _mm_setzero_ps();
221         fiy1             = _mm_setzero_ps();
222         fiz1             = _mm_setzero_ps();
223         fix2             = _mm_setzero_ps();
224         fiy2             = _mm_setzero_ps();
225         fiz2             = _mm_setzero_ps();
226
227         /* Reset potential sums */
228         velecsum         = _mm_setzero_ps();
229         vvdwsum          = _mm_setzero_ps();
230
231         /* Start inner kernel loop */
232         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
233         {
234
235             /* Get j neighbor index, and coordinate index */
236             jnrA             = jjnr[jidx];
237             jnrB             = jjnr[jidx+1];
238             jnrC             = jjnr[jidx+2];
239             jnrD             = jjnr[jidx+3];
240             j_coord_offsetA  = DIM*jnrA;
241             j_coord_offsetB  = DIM*jnrB;
242             j_coord_offsetC  = DIM*jnrC;
243             j_coord_offsetD  = DIM*jnrD;
244
245             /* load j atom coordinates */
246             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
247                                               x+j_coord_offsetC,x+j_coord_offsetD,
248                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
249
250             /* Calculate displacement vector */
251             dx00             = _mm_sub_ps(ix0,jx0);
252             dy00             = _mm_sub_ps(iy0,jy0);
253             dz00             = _mm_sub_ps(iz0,jz0);
254             dx01             = _mm_sub_ps(ix0,jx1);
255             dy01             = _mm_sub_ps(iy0,jy1);
256             dz01             = _mm_sub_ps(iz0,jz1);
257             dx02             = _mm_sub_ps(ix0,jx2);
258             dy02             = _mm_sub_ps(iy0,jy2);
259             dz02             = _mm_sub_ps(iz0,jz2);
260             dx10             = _mm_sub_ps(ix1,jx0);
261             dy10             = _mm_sub_ps(iy1,jy0);
262             dz10             = _mm_sub_ps(iz1,jz0);
263             dx11             = _mm_sub_ps(ix1,jx1);
264             dy11             = _mm_sub_ps(iy1,jy1);
265             dz11             = _mm_sub_ps(iz1,jz1);
266             dx12             = _mm_sub_ps(ix1,jx2);
267             dy12             = _mm_sub_ps(iy1,jy2);
268             dz12             = _mm_sub_ps(iz1,jz2);
269             dx20             = _mm_sub_ps(ix2,jx0);
270             dy20             = _mm_sub_ps(iy2,jy0);
271             dz20             = _mm_sub_ps(iz2,jz0);
272             dx21             = _mm_sub_ps(ix2,jx1);
273             dy21             = _mm_sub_ps(iy2,jy1);
274             dz21             = _mm_sub_ps(iz2,jz1);
275             dx22             = _mm_sub_ps(ix2,jx2);
276             dy22             = _mm_sub_ps(iy2,jy2);
277             dz22             = _mm_sub_ps(iz2,jz2);
278
279             /* Calculate squared distance and things based on it */
280             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
281             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
282             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
283             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
284             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
285             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
286             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
287             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
288             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
289
290             rinv00           = gmx_mm_invsqrt_ps(rsq00);
291             rinv01           = gmx_mm_invsqrt_ps(rsq01);
292             rinv02           = gmx_mm_invsqrt_ps(rsq02);
293             rinv10           = gmx_mm_invsqrt_ps(rsq10);
294             rinv11           = gmx_mm_invsqrt_ps(rsq11);
295             rinv12           = gmx_mm_invsqrt_ps(rsq12);
296             rinv20           = gmx_mm_invsqrt_ps(rsq20);
297             rinv21           = gmx_mm_invsqrt_ps(rsq21);
298             rinv22           = gmx_mm_invsqrt_ps(rsq22);
299
300             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
301             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
302             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
303             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
304             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
305             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
306             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
307             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
308             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
309
310             fjx0             = _mm_setzero_ps();
311             fjy0             = _mm_setzero_ps();
312             fjz0             = _mm_setzero_ps();
313             fjx1             = _mm_setzero_ps();
314             fjy1             = _mm_setzero_ps();
315             fjz1             = _mm_setzero_ps();
316             fjx2             = _mm_setzero_ps();
317             fjy2             = _mm_setzero_ps();
318             fjz2             = _mm_setzero_ps();
319
320             /**************************
321              * CALCULATE INTERACTIONS *
322              **************************/
323
324             r00              = _mm_mul_ps(rsq00,rinv00);
325
326             /* EWALD ELECTROSTATICS */
327
328             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
329             ewrt             = _mm_mul_ps(r00,ewtabscale);
330             ewitab           = _mm_cvttps_epi32(ewrt);
331             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
332             ewitab           = _mm_slli_epi32(ewitab,2);
333             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
334             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
335             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
336             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
337             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
338             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
339             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
340             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
341             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
342
343             /* Analytical LJ-PME */
344             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
345             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
346             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
347             exponent         = gmx_simd_exp_r(ewcljrsq);
348             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
349             poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
350             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
351             vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
352             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
353             vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
354             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
355             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
356
357             /* Update potential sum for this i atom from the interaction with this j atom. */
358             velecsum         = _mm_add_ps(velecsum,velec);
359             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
360
361             fscal            = _mm_add_ps(felec,fvdw);
362
363             /* Calculate temporary vectorial force */
364             tx               = _mm_mul_ps(fscal,dx00);
365             ty               = _mm_mul_ps(fscal,dy00);
366             tz               = _mm_mul_ps(fscal,dz00);
367
368             /* Update vectorial force */
369             fix0             = _mm_add_ps(fix0,tx);
370             fiy0             = _mm_add_ps(fiy0,ty);
371             fiz0             = _mm_add_ps(fiz0,tz);
372
373             fjx0             = _mm_add_ps(fjx0,tx);
374             fjy0             = _mm_add_ps(fjy0,ty);
375             fjz0             = _mm_add_ps(fjz0,tz);
376             
377             /**************************
378              * CALCULATE INTERACTIONS *
379              **************************/
380
381             r01              = _mm_mul_ps(rsq01,rinv01);
382
383             /* EWALD ELECTROSTATICS */
384
385             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
386             ewrt             = _mm_mul_ps(r01,ewtabscale);
387             ewitab           = _mm_cvttps_epi32(ewrt);
388             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
389             ewitab           = _mm_slli_epi32(ewitab,2);
390             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
391             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
392             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
393             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
394             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
395             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
396             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
397             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
398             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
399
400             /* Update potential sum for this i atom from the interaction with this j atom. */
401             velecsum         = _mm_add_ps(velecsum,velec);
402
403             fscal            = felec;
404
405             /* Calculate temporary vectorial force */
406             tx               = _mm_mul_ps(fscal,dx01);
407             ty               = _mm_mul_ps(fscal,dy01);
408             tz               = _mm_mul_ps(fscal,dz01);
409
410             /* Update vectorial force */
411             fix0             = _mm_add_ps(fix0,tx);
412             fiy0             = _mm_add_ps(fiy0,ty);
413             fiz0             = _mm_add_ps(fiz0,tz);
414
415             fjx1             = _mm_add_ps(fjx1,tx);
416             fjy1             = _mm_add_ps(fjy1,ty);
417             fjz1             = _mm_add_ps(fjz1,tz);
418             
419             /**************************
420              * CALCULATE INTERACTIONS *
421              **************************/
422
423             r02              = _mm_mul_ps(rsq02,rinv02);
424
425             /* EWALD ELECTROSTATICS */
426
427             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
428             ewrt             = _mm_mul_ps(r02,ewtabscale);
429             ewitab           = _mm_cvttps_epi32(ewrt);
430             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
431             ewitab           = _mm_slli_epi32(ewitab,2);
432             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
433             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
434             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
435             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
436             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
437             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
438             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
439             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
440             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
441
442             /* Update potential sum for this i atom from the interaction with this j atom. */
443             velecsum         = _mm_add_ps(velecsum,velec);
444
445             fscal            = felec;
446
447             /* Calculate temporary vectorial force */
448             tx               = _mm_mul_ps(fscal,dx02);
449             ty               = _mm_mul_ps(fscal,dy02);
450             tz               = _mm_mul_ps(fscal,dz02);
451
452             /* Update vectorial force */
453             fix0             = _mm_add_ps(fix0,tx);
454             fiy0             = _mm_add_ps(fiy0,ty);
455             fiz0             = _mm_add_ps(fiz0,tz);
456
457             fjx2             = _mm_add_ps(fjx2,tx);
458             fjy2             = _mm_add_ps(fjy2,ty);
459             fjz2             = _mm_add_ps(fjz2,tz);
460             
461             /**************************
462              * CALCULATE INTERACTIONS *
463              **************************/
464
465             r10              = _mm_mul_ps(rsq10,rinv10);
466
467             /* EWALD ELECTROSTATICS */
468
469             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
470             ewrt             = _mm_mul_ps(r10,ewtabscale);
471             ewitab           = _mm_cvttps_epi32(ewrt);
472             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
473             ewitab           = _mm_slli_epi32(ewitab,2);
474             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
475             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
476             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
477             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
478             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
479             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
480             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
481             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
482             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
483
484             /* Update potential sum for this i atom from the interaction with this j atom. */
485             velecsum         = _mm_add_ps(velecsum,velec);
486
487             fscal            = felec;
488
489             /* Calculate temporary vectorial force */
490             tx               = _mm_mul_ps(fscal,dx10);
491             ty               = _mm_mul_ps(fscal,dy10);
492             tz               = _mm_mul_ps(fscal,dz10);
493
494             /* Update vectorial force */
495             fix1             = _mm_add_ps(fix1,tx);
496             fiy1             = _mm_add_ps(fiy1,ty);
497             fiz1             = _mm_add_ps(fiz1,tz);
498
499             fjx0             = _mm_add_ps(fjx0,tx);
500             fjy0             = _mm_add_ps(fjy0,ty);
501             fjz0             = _mm_add_ps(fjz0,tz);
502             
503             /**************************
504              * CALCULATE INTERACTIONS *
505              **************************/
506
507             r11              = _mm_mul_ps(rsq11,rinv11);
508
509             /* EWALD ELECTROSTATICS */
510
511             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
512             ewrt             = _mm_mul_ps(r11,ewtabscale);
513             ewitab           = _mm_cvttps_epi32(ewrt);
514             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
515             ewitab           = _mm_slli_epi32(ewitab,2);
516             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
517             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
518             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
519             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
520             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
521             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
522             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
523             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
524             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
525
526             /* Update potential sum for this i atom from the interaction with this j atom. */
527             velecsum         = _mm_add_ps(velecsum,velec);
528
529             fscal            = felec;
530
531             /* Calculate temporary vectorial force */
532             tx               = _mm_mul_ps(fscal,dx11);
533             ty               = _mm_mul_ps(fscal,dy11);
534             tz               = _mm_mul_ps(fscal,dz11);
535
536             /* Update vectorial force */
537             fix1             = _mm_add_ps(fix1,tx);
538             fiy1             = _mm_add_ps(fiy1,ty);
539             fiz1             = _mm_add_ps(fiz1,tz);
540
541             fjx1             = _mm_add_ps(fjx1,tx);
542             fjy1             = _mm_add_ps(fjy1,ty);
543             fjz1             = _mm_add_ps(fjz1,tz);
544             
545             /**************************
546              * CALCULATE INTERACTIONS *
547              **************************/
548
549             r12              = _mm_mul_ps(rsq12,rinv12);
550
551             /* EWALD ELECTROSTATICS */
552
553             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
554             ewrt             = _mm_mul_ps(r12,ewtabscale);
555             ewitab           = _mm_cvttps_epi32(ewrt);
556             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
557             ewitab           = _mm_slli_epi32(ewitab,2);
558             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
559             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
560             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
561             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
562             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
563             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
564             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
565             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
566             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
567
568             /* Update potential sum for this i atom from the interaction with this j atom. */
569             velecsum         = _mm_add_ps(velecsum,velec);
570
571             fscal            = felec;
572
573             /* Calculate temporary vectorial force */
574             tx               = _mm_mul_ps(fscal,dx12);
575             ty               = _mm_mul_ps(fscal,dy12);
576             tz               = _mm_mul_ps(fscal,dz12);
577
578             /* Update vectorial force */
579             fix1             = _mm_add_ps(fix1,tx);
580             fiy1             = _mm_add_ps(fiy1,ty);
581             fiz1             = _mm_add_ps(fiz1,tz);
582
583             fjx2             = _mm_add_ps(fjx2,tx);
584             fjy2             = _mm_add_ps(fjy2,ty);
585             fjz2             = _mm_add_ps(fjz2,tz);
586             
587             /**************************
588              * CALCULATE INTERACTIONS *
589              **************************/
590
591             r20              = _mm_mul_ps(rsq20,rinv20);
592
593             /* EWALD ELECTROSTATICS */
594
595             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
596             ewrt             = _mm_mul_ps(r20,ewtabscale);
597             ewitab           = _mm_cvttps_epi32(ewrt);
598             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
599             ewitab           = _mm_slli_epi32(ewitab,2);
600             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
601             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
602             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
603             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
604             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
605             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
606             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
607             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
608             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
609
610             /* Update potential sum for this i atom from the interaction with this j atom. */
611             velecsum         = _mm_add_ps(velecsum,velec);
612
613             fscal            = felec;
614
615             /* Calculate temporary vectorial force */
616             tx               = _mm_mul_ps(fscal,dx20);
617             ty               = _mm_mul_ps(fscal,dy20);
618             tz               = _mm_mul_ps(fscal,dz20);
619
620             /* Update vectorial force */
621             fix2             = _mm_add_ps(fix2,tx);
622             fiy2             = _mm_add_ps(fiy2,ty);
623             fiz2             = _mm_add_ps(fiz2,tz);
624
625             fjx0             = _mm_add_ps(fjx0,tx);
626             fjy0             = _mm_add_ps(fjy0,ty);
627             fjz0             = _mm_add_ps(fjz0,tz);
628             
629             /**************************
630              * CALCULATE INTERACTIONS *
631              **************************/
632
633             r21              = _mm_mul_ps(rsq21,rinv21);
634
635             /* EWALD ELECTROSTATICS */
636
637             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
638             ewrt             = _mm_mul_ps(r21,ewtabscale);
639             ewitab           = _mm_cvttps_epi32(ewrt);
640             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
641             ewitab           = _mm_slli_epi32(ewitab,2);
642             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
643             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
644             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
645             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
646             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
647             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
648             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
649             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
650             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
651
652             /* Update potential sum for this i atom from the interaction with this j atom. */
653             velecsum         = _mm_add_ps(velecsum,velec);
654
655             fscal            = felec;
656
657             /* Calculate temporary vectorial force */
658             tx               = _mm_mul_ps(fscal,dx21);
659             ty               = _mm_mul_ps(fscal,dy21);
660             tz               = _mm_mul_ps(fscal,dz21);
661
662             /* Update vectorial force */
663             fix2             = _mm_add_ps(fix2,tx);
664             fiy2             = _mm_add_ps(fiy2,ty);
665             fiz2             = _mm_add_ps(fiz2,tz);
666
667             fjx1             = _mm_add_ps(fjx1,tx);
668             fjy1             = _mm_add_ps(fjy1,ty);
669             fjz1             = _mm_add_ps(fjz1,tz);
670             
671             /**************************
672              * CALCULATE INTERACTIONS *
673              **************************/
674
675             r22              = _mm_mul_ps(rsq22,rinv22);
676
677             /* EWALD ELECTROSTATICS */
678
679             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
680             ewrt             = _mm_mul_ps(r22,ewtabscale);
681             ewitab           = _mm_cvttps_epi32(ewrt);
682             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
683             ewitab           = _mm_slli_epi32(ewitab,2);
684             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
685             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
686             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
687             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
688             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
689             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
690             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
691             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
692             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
693
694             /* Update potential sum for this i atom from the interaction with this j atom. */
695             velecsum         = _mm_add_ps(velecsum,velec);
696
697             fscal            = felec;
698
699             /* Calculate temporary vectorial force */
700             tx               = _mm_mul_ps(fscal,dx22);
701             ty               = _mm_mul_ps(fscal,dy22);
702             tz               = _mm_mul_ps(fscal,dz22);
703
704             /* Update vectorial force */
705             fix2             = _mm_add_ps(fix2,tx);
706             fiy2             = _mm_add_ps(fiy2,ty);
707             fiz2             = _mm_add_ps(fiz2,tz);
708
709             fjx2             = _mm_add_ps(fjx2,tx);
710             fjy2             = _mm_add_ps(fjy2,ty);
711             fjz2             = _mm_add_ps(fjz2,tz);
712             
713             fjptrA             = f+j_coord_offsetA;
714             fjptrB             = f+j_coord_offsetB;
715             fjptrC             = f+j_coord_offsetC;
716             fjptrD             = f+j_coord_offsetD;
717
718             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
719                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
720
721             /* Inner loop uses 397 flops */
722         }
723
724         if(jidx<j_index_end)
725         {
726
727             /* Get j neighbor index, and coordinate index */
728             jnrlistA         = jjnr[jidx];
729             jnrlistB         = jjnr[jidx+1];
730             jnrlistC         = jjnr[jidx+2];
731             jnrlistD         = jjnr[jidx+3];
732             /* Sign of each element will be negative for non-real atoms.
733              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
734              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
735              */
736             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
737             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
738             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
739             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
740             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
741             j_coord_offsetA  = DIM*jnrA;
742             j_coord_offsetB  = DIM*jnrB;
743             j_coord_offsetC  = DIM*jnrC;
744             j_coord_offsetD  = DIM*jnrD;
745
746             /* load j atom coordinates */
747             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
748                                               x+j_coord_offsetC,x+j_coord_offsetD,
749                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
750
751             /* Calculate displacement vector */
752             dx00             = _mm_sub_ps(ix0,jx0);
753             dy00             = _mm_sub_ps(iy0,jy0);
754             dz00             = _mm_sub_ps(iz0,jz0);
755             dx01             = _mm_sub_ps(ix0,jx1);
756             dy01             = _mm_sub_ps(iy0,jy1);
757             dz01             = _mm_sub_ps(iz0,jz1);
758             dx02             = _mm_sub_ps(ix0,jx2);
759             dy02             = _mm_sub_ps(iy0,jy2);
760             dz02             = _mm_sub_ps(iz0,jz2);
761             dx10             = _mm_sub_ps(ix1,jx0);
762             dy10             = _mm_sub_ps(iy1,jy0);
763             dz10             = _mm_sub_ps(iz1,jz0);
764             dx11             = _mm_sub_ps(ix1,jx1);
765             dy11             = _mm_sub_ps(iy1,jy1);
766             dz11             = _mm_sub_ps(iz1,jz1);
767             dx12             = _mm_sub_ps(ix1,jx2);
768             dy12             = _mm_sub_ps(iy1,jy2);
769             dz12             = _mm_sub_ps(iz1,jz2);
770             dx20             = _mm_sub_ps(ix2,jx0);
771             dy20             = _mm_sub_ps(iy2,jy0);
772             dz20             = _mm_sub_ps(iz2,jz0);
773             dx21             = _mm_sub_ps(ix2,jx1);
774             dy21             = _mm_sub_ps(iy2,jy1);
775             dz21             = _mm_sub_ps(iz2,jz1);
776             dx22             = _mm_sub_ps(ix2,jx2);
777             dy22             = _mm_sub_ps(iy2,jy2);
778             dz22             = _mm_sub_ps(iz2,jz2);
779
780             /* Calculate squared distance and things based on it */
781             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
782             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
783             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
784             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
785             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
786             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
787             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
788             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
789             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
790
791             rinv00           = gmx_mm_invsqrt_ps(rsq00);
792             rinv01           = gmx_mm_invsqrt_ps(rsq01);
793             rinv02           = gmx_mm_invsqrt_ps(rsq02);
794             rinv10           = gmx_mm_invsqrt_ps(rsq10);
795             rinv11           = gmx_mm_invsqrt_ps(rsq11);
796             rinv12           = gmx_mm_invsqrt_ps(rsq12);
797             rinv20           = gmx_mm_invsqrt_ps(rsq20);
798             rinv21           = gmx_mm_invsqrt_ps(rsq21);
799             rinv22           = gmx_mm_invsqrt_ps(rsq22);
800
801             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
802             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
803             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
804             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
805             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
806             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
807             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
808             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
809             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
810
811             fjx0             = _mm_setzero_ps();
812             fjy0             = _mm_setzero_ps();
813             fjz0             = _mm_setzero_ps();
814             fjx1             = _mm_setzero_ps();
815             fjy1             = _mm_setzero_ps();
816             fjz1             = _mm_setzero_ps();
817             fjx2             = _mm_setzero_ps();
818             fjy2             = _mm_setzero_ps();
819             fjz2             = _mm_setzero_ps();
820
821             /**************************
822              * CALCULATE INTERACTIONS *
823              **************************/
824
825             r00              = _mm_mul_ps(rsq00,rinv00);
826             r00              = _mm_andnot_ps(dummy_mask,r00);
827
828             /* EWALD ELECTROSTATICS */
829
830             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
831             ewrt             = _mm_mul_ps(r00,ewtabscale);
832             ewitab           = _mm_cvttps_epi32(ewrt);
833             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
834             ewitab           = _mm_slli_epi32(ewitab,2);
835             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
836             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
837             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
838             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
839             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
840             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
841             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
842             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
843             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
844
845             /* Analytical LJ-PME */
846             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
847             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
848             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
849             exponent         = gmx_simd_exp_r(ewcljrsq);
850             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
851             poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
852             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
853             vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
854             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
855             vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
856             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
857             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
858
859             /* Update potential sum for this i atom from the interaction with this j atom. */
860             velec            = _mm_andnot_ps(dummy_mask,velec);
861             velecsum         = _mm_add_ps(velecsum,velec);
862             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
863             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
864
865             fscal            = _mm_add_ps(felec,fvdw);
866
867             fscal            = _mm_andnot_ps(dummy_mask,fscal);
868
869             /* Calculate temporary vectorial force */
870             tx               = _mm_mul_ps(fscal,dx00);
871             ty               = _mm_mul_ps(fscal,dy00);
872             tz               = _mm_mul_ps(fscal,dz00);
873
874             /* Update vectorial force */
875             fix0             = _mm_add_ps(fix0,tx);
876             fiy0             = _mm_add_ps(fiy0,ty);
877             fiz0             = _mm_add_ps(fiz0,tz);
878
879             fjx0             = _mm_add_ps(fjx0,tx);
880             fjy0             = _mm_add_ps(fjy0,ty);
881             fjz0             = _mm_add_ps(fjz0,tz);
882             
883             /**************************
884              * CALCULATE INTERACTIONS *
885              **************************/
886
887             r01              = _mm_mul_ps(rsq01,rinv01);
888             r01              = _mm_andnot_ps(dummy_mask,r01);
889
890             /* EWALD ELECTROSTATICS */
891
892             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
893             ewrt             = _mm_mul_ps(r01,ewtabscale);
894             ewitab           = _mm_cvttps_epi32(ewrt);
895             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
896             ewitab           = _mm_slli_epi32(ewitab,2);
897             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
898             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
899             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
900             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
901             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
902             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
903             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
904             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
905             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
906
907             /* Update potential sum for this i atom from the interaction with this j atom. */
908             velec            = _mm_andnot_ps(dummy_mask,velec);
909             velecsum         = _mm_add_ps(velecsum,velec);
910
911             fscal            = felec;
912
913             fscal            = _mm_andnot_ps(dummy_mask,fscal);
914
915             /* Calculate temporary vectorial force */
916             tx               = _mm_mul_ps(fscal,dx01);
917             ty               = _mm_mul_ps(fscal,dy01);
918             tz               = _mm_mul_ps(fscal,dz01);
919
920             /* Update vectorial force */
921             fix0             = _mm_add_ps(fix0,tx);
922             fiy0             = _mm_add_ps(fiy0,ty);
923             fiz0             = _mm_add_ps(fiz0,tz);
924
925             fjx1             = _mm_add_ps(fjx1,tx);
926             fjy1             = _mm_add_ps(fjy1,ty);
927             fjz1             = _mm_add_ps(fjz1,tz);
928             
929             /**************************
930              * CALCULATE INTERACTIONS *
931              **************************/
932
933             r02              = _mm_mul_ps(rsq02,rinv02);
934             r02              = _mm_andnot_ps(dummy_mask,r02);
935
936             /* EWALD ELECTROSTATICS */
937
938             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
939             ewrt             = _mm_mul_ps(r02,ewtabscale);
940             ewitab           = _mm_cvttps_epi32(ewrt);
941             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
942             ewitab           = _mm_slli_epi32(ewitab,2);
943             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
944             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
945             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
946             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
947             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
948             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
949             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
950             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
951             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
952
953             /* Update potential sum for this i atom from the interaction with this j atom. */
954             velec            = _mm_andnot_ps(dummy_mask,velec);
955             velecsum         = _mm_add_ps(velecsum,velec);
956
957             fscal            = felec;
958
959             fscal            = _mm_andnot_ps(dummy_mask,fscal);
960
961             /* Calculate temporary vectorial force */
962             tx               = _mm_mul_ps(fscal,dx02);
963             ty               = _mm_mul_ps(fscal,dy02);
964             tz               = _mm_mul_ps(fscal,dz02);
965
966             /* Update vectorial force */
967             fix0             = _mm_add_ps(fix0,tx);
968             fiy0             = _mm_add_ps(fiy0,ty);
969             fiz0             = _mm_add_ps(fiz0,tz);
970
971             fjx2             = _mm_add_ps(fjx2,tx);
972             fjy2             = _mm_add_ps(fjy2,ty);
973             fjz2             = _mm_add_ps(fjz2,tz);
974             
975             /**************************
976              * CALCULATE INTERACTIONS *
977              **************************/
978
979             r10              = _mm_mul_ps(rsq10,rinv10);
980             r10              = _mm_andnot_ps(dummy_mask,r10);
981
982             /* EWALD ELECTROSTATICS */
983
984             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
985             ewrt             = _mm_mul_ps(r10,ewtabscale);
986             ewitab           = _mm_cvttps_epi32(ewrt);
987             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
988             ewitab           = _mm_slli_epi32(ewitab,2);
989             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
990             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
991             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
992             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
993             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
994             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
995             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
996             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
997             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
998
999             /* Update potential sum for this i atom from the interaction with this j atom. */
1000             velec            = _mm_andnot_ps(dummy_mask,velec);
1001             velecsum         = _mm_add_ps(velecsum,velec);
1002
1003             fscal            = felec;
1004
1005             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1006
1007             /* Calculate temporary vectorial force */
1008             tx               = _mm_mul_ps(fscal,dx10);
1009             ty               = _mm_mul_ps(fscal,dy10);
1010             tz               = _mm_mul_ps(fscal,dz10);
1011
1012             /* Update vectorial force */
1013             fix1             = _mm_add_ps(fix1,tx);
1014             fiy1             = _mm_add_ps(fiy1,ty);
1015             fiz1             = _mm_add_ps(fiz1,tz);
1016
1017             fjx0             = _mm_add_ps(fjx0,tx);
1018             fjy0             = _mm_add_ps(fjy0,ty);
1019             fjz0             = _mm_add_ps(fjz0,tz);
1020             
1021             /**************************
1022              * CALCULATE INTERACTIONS *
1023              **************************/
1024
1025             r11              = _mm_mul_ps(rsq11,rinv11);
1026             r11              = _mm_andnot_ps(dummy_mask,r11);
1027
1028             /* EWALD ELECTROSTATICS */
1029
1030             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1031             ewrt             = _mm_mul_ps(r11,ewtabscale);
1032             ewitab           = _mm_cvttps_epi32(ewrt);
1033             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1034             ewitab           = _mm_slli_epi32(ewitab,2);
1035             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1036             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1037             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1038             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1039             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1040             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1041             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1042             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1043             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1044
1045             /* Update potential sum for this i atom from the interaction with this j atom. */
1046             velec            = _mm_andnot_ps(dummy_mask,velec);
1047             velecsum         = _mm_add_ps(velecsum,velec);
1048
1049             fscal            = felec;
1050
1051             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1052
1053             /* Calculate temporary vectorial force */
1054             tx               = _mm_mul_ps(fscal,dx11);
1055             ty               = _mm_mul_ps(fscal,dy11);
1056             tz               = _mm_mul_ps(fscal,dz11);
1057
1058             /* Update vectorial force */
1059             fix1             = _mm_add_ps(fix1,tx);
1060             fiy1             = _mm_add_ps(fiy1,ty);
1061             fiz1             = _mm_add_ps(fiz1,tz);
1062
1063             fjx1             = _mm_add_ps(fjx1,tx);
1064             fjy1             = _mm_add_ps(fjy1,ty);
1065             fjz1             = _mm_add_ps(fjz1,tz);
1066             
1067             /**************************
1068              * CALCULATE INTERACTIONS *
1069              **************************/
1070
1071             r12              = _mm_mul_ps(rsq12,rinv12);
1072             r12              = _mm_andnot_ps(dummy_mask,r12);
1073
1074             /* EWALD ELECTROSTATICS */
1075
1076             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1077             ewrt             = _mm_mul_ps(r12,ewtabscale);
1078             ewitab           = _mm_cvttps_epi32(ewrt);
1079             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1080             ewitab           = _mm_slli_epi32(ewitab,2);
1081             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1082             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1083             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1084             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1085             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1086             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1087             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1088             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1089             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1090
1091             /* Update potential sum for this i atom from the interaction with this j atom. */
1092             velec            = _mm_andnot_ps(dummy_mask,velec);
1093             velecsum         = _mm_add_ps(velecsum,velec);
1094
1095             fscal            = felec;
1096
1097             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1098
1099             /* Calculate temporary vectorial force */
1100             tx               = _mm_mul_ps(fscal,dx12);
1101             ty               = _mm_mul_ps(fscal,dy12);
1102             tz               = _mm_mul_ps(fscal,dz12);
1103
1104             /* Update vectorial force */
1105             fix1             = _mm_add_ps(fix1,tx);
1106             fiy1             = _mm_add_ps(fiy1,ty);
1107             fiz1             = _mm_add_ps(fiz1,tz);
1108
1109             fjx2             = _mm_add_ps(fjx2,tx);
1110             fjy2             = _mm_add_ps(fjy2,ty);
1111             fjz2             = _mm_add_ps(fjz2,tz);
1112             
1113             /**************************
1114              * CALCULATE INTERACTIONS *
1115              **************************/
1116
1117             r20              = _mm_mul_ps(rsq20,rinv20);
1118             r20              = _mm_andnot_ps(dummy_mask,r20);
1119
1120             /* EWALD ELECTROSTATICS */
1121
1122             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1123             ewrt             = _mm_mul_ps(r20,ewtabscale);
1124             ewitab           = _mm_cvttps_epi32(ewrt);
1125             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1126             ewitab           = _mm_slli_epi32(ewitab,2);
1127             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1128             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1129             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1130             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1131             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1132             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1133             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1134             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1135             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1136
1137             /* Update potential sum for this i atom from the interaction with this j atom. */
1138             velec            = _mm_andnot_ps(dummy_mask,velec);
1139             velecsum         = _mm_add_ps(velecsum,velec);
1140
1141             fscal            = felec;
1142
1143             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1144
1145             /* Calculate temporary vectorial force */
1146             tx               = _mm_mul_ps(fscal,dx20);
1147             ty               = _mm_mul_ps(fscal,dy20);
1148             tz               = _mm_mul_ps(fscal,dz20);
1149
1150             /* Update vectorial force */
1151             fix2             = _mm_add_ps(fix2,tx);
1152             fiy2             = _mm_add_ps(fiy2,ty);
1153             fiz2             = _mm_add_ps(fiz2,tz);
1154
1155             fjx0             = _mm_add_ps(fjx0,tx);
1156             fjy0             = _mm_add_ps(fjy0,ty);
1157             fjz0             = _mm_add_ps(fjz0,tz);
1158             
1159             /**************************
1160              * CALCULATE INTERACTIONS *
1161              **************************/
1162
1163             r21              = _mm_mul_ps(rsq21,rinv21);
1164             r21              = _mm_andnot_ps(dummy_mask,r21);
1165
1166             /* EWALD ELECTROSTATICS */
1167
1168             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1169             ewrt             = _mm_mul_ps(r21,ewtabscale);
1170             ewitab           = _mm_cvttps_epi32(ewrt);
1171             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1172             ewitab           = _mm_slli_epi32(ewitab,2);
1173             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1174             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1175             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1176             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1177             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1178             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1179             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1180             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1181             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1182
1183             /* Update potential sum for this i atom from the interaction with this j atom. */
1184             velec            = _mm_andnot_ps(dummy_mask,velec);
1185             velecsum         = _mm_add_ps(velecsum,velec);
1186
1187             fscal            = felec;
1188
1189             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1190
1191             /* Calculate temporary vectorial force */
1192             tx               = _mm_mul_ps(fscal,dx21);
1193             ty               = _mm_mul_ps(fscal,dy21);
1194             tz               = _mm_mul_ps(fscal,dz21);
1195
1196             /* Update vectorial force */
1197             fix2             = _mm_add_ps(fix2,tx);
1198             fiy2             = _mm_add_ps(fiy2,ty);
1199             fiz2             = _mm_add_ps(fiz2,tz);
1200
1201             fjx1             = _mm_add_ps(fjx1,tx);
1202             fjy1             = _mm_add_ps(fjy1,ty);
1203             fjz1             = _mm_add_ps(fjz1,tz);
1204             
1205             /**************************
1206              * CALCULATE INTERACTIONS *
1207              **************************/
1208
1209             r22              = _mm_mul_ps(rsq22,rinv22);
1210             r22              = _mm_andnot_ps(dummy_mask,r22);
1211
1212             /* EWALD ELECTROSTATICS */
1213
1214             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1215             ewrt             = _mm_mul_ps(r22,ewtabscale);
1216             ewitab           = _mm_cvttps_epi32(ewrt);
1217             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1218             ewitab           = _mm_slli_epi32(ewitab,2);
1219             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1220             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1221             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1222             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1223             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1224             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1225             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1226             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1227             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1228
1229             /* Update potential sum for this i atom from the interaction with this j atom. */
1230             velec            = _mm_andnot_ps(dummy_mask,velec);
1231             velecsum         = _mm_add_ps(velecsum,velec);
1232
1233             fscal            = felec;
1234
1235             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1236
1237             /* Calculate temporary vectorial force */
1238             tx               = _mm_mul_ps(fscal,dx22);
1239             ty               = _mm_mul_ps(fscal,dy22);
1240             tz               = _mm_mul_ps(fscal,dz22);
1241
1242             /* Update vectorial force */
1243             fix2             = _mm_add_ps(fix2,tx);
1244             fiy2             = _mm_add_ps(fiy2,ty);
1245             fiz2             = _mm_add_ps(fiz2,tz);
1246
1247             fjx2             = _mm_add_ps(fjx2,tx);
1248             fjy2             = _mm_add_ps(fjy2,ty);
1249             fjz2             = _mm_add_ps(fjz2,tz);
1250             
1251             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1252             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1253             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1254             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1255
1256             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1257                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1258
1259             /* Inner loop uses 406 flops */
1260         }
1261
1262         /* End of innermost loop */
1263
1264         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1265                                               f+i_coord_offset,fshift+i_shift_offset);
1266
1267         ggid                        = gid[iidx];
1268         /* Update potential energies */
1269         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1270         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1271
1272         /* Increment number of inner iterations */
1273         inneriter                  += j_index_end - j_index_start;
1274
1275         /* Outer loop uses 20 flops */
1276     }
1277
1278     /* Increment number of outer iterations */
1279     outeriter        += nri;
1280
1281     /* Update outer/inner flops */
1282
1283     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*406);
1284 }
1285 /*
1286  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_single
1287  * Electrostatics interaction: Ewald
1288  * VdW interaction:            LJEwald
1289  * Geometry:                   Water3-Water3
1290  * Calculate force/pot:        Force
1291  */
1292 void
1293 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_single
1294                     (t_nblist                    * gmx_restrict       nlist,
1295                      rvec                        * gmx_restrict          xx,
1296                      rvec                        * gmx_restrict          ff,
1297                      t_forcerec                  * gmx_restrict          fr,
1298                      t_mdatoms                   * gmx_restrict     mdatoms,
1299                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1300                      t_nrnb                      * gmx_restrict        nrnb)
1301 {
1302     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1303      * just 0 for non-waters.
1304      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1305      * jnr indices corresponding to data put in the four positions in the SIMD register.
1306      */
1307     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1308     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1309     int              jnrA,jnrB,jnrC,jnrD;
1310     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1311     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1312     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1313     real             rcutoff_scalar;
1314     real             *shiftvec,*fshift,*x,*f;
1315     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1316     real             scratch[4*DIM];
1317     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1318     int              vdwioffset0;
1319     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1320     int              vdwioffset1;
1321     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1322     int              vdwioffset2;
1323     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1324     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1325     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1326     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1327     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1328     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1329     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1330     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1331     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1332     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1333     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1334     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1335     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1336     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1337     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1338     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1339     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1340     real             *charge;
1341     int              nvdwtype;
1342     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1343     int              *vdwtype;
1344     real             *vdwparam;
1345     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1346     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1347     __m128           c6grid_00;
1348     __m128           c6grid_01;
1349     __m128           c6grid_02;
1350     __m128           c6grid_10;
1351     __m128           c6grid_11;
1352     __m128           c6grid_12;
1353     __m128           c6grid_20;
1354     __m128           c6grid_21;
1355     __m128           c6grid_22;
1356     __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1357     real             *vdwgridparam;
1358     __m128           one_half = _mm_set1_ps(0.5);
1359     __m128           minus_one = _mm_set1_ps(-1.0);
1360     __m128i          ewitab;
1361     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1362     real             *ewtab;
1363     __m128           dummy_mask,cutoff_mask;
1364     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1365     __m128           one     = _mm_set1_ps(1.0);
1366     __m128           two     = _mm_set1_ps(2.0);
1367     x                = xx[0];
1368     f                = ff[0];
1369
1370     nri              = nlist->nri;
1371     iinr             = nlist->iinr;
1372     jindex           = nlist->jindex;
1373     jjnr             = nlist->jjnr;
1374     shiftidx         = nlist->shift;
1375     gid              = nlist->gid;
1376     shiftvec         = fr->shift_vec[0];
1377     fshift           = fr->fshift[0];
1378     facel            = _mm_set1_ps(fr->epsfac);
1379     charge           = mdatoms->chargeA;
1380     nvdwtype         = fr->ntype;
1381     vdwparam         = fr->nbfp;
1382     vdwtype          = mdatoms->typeA;
1383     vdwgridparam     = fr->ljpme_c6grid;
1384     sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
1385     ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
1386     ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
1387
1388     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1389     ewtab            = fr->ic->tabq_coul_F;
1390     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1391     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1392
1393     /* Setup water-specific parameters */
1394     inr              = nlist->iinr[0];
1395     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1396     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1397     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1398     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1399
1400     jq0              = _mm_set1_ps(charge[inr+0]);
1401     jq1              = _mm_set1_ps(charge[inr+1]);
1402     jq2              = _mm_set1_ps(charge[inr+2]);
1403     vdwjidx0A        = 2*vdwtype[inr+0];
1404     qq00             = _mm_mul_ps(iq0,jq0);
1405     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1406     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1407     c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
1408     qq01             = _mm_mul_ps(iq0,jq1);
1409     qq02             = _mm_mul_ps(iq0,jq2);
1410     qq10             = _mm_mul_ps(iq1,jq0);
1411     qq11             = _mm_mul_ps(iq1,jq1);
1412     qq12             = _mm_mul_ps(iq1,jq2);
1413     qq20             = _mm_mul_ps(iq2,jq0);
1414     qq21             = _mm_mul_ps(iq2,jq1);
1415     qq22             = _mm_mul_ps(iq2,jq2);
1416
1417     /* Avoid stupid compiler warnings */
1418     jnrA = jnrB = jnrC = jnrD = 0;
1419     j_coord_offsetA = 0;
1420     j_coord_offsetB = 0;
1421     j_coord_offsetC = 0;
1422     j_coord_offsetD = 0;
1423
1424     outeriter        = 0;
1425     inneriter        = 0;
1426
1427     for(iidx=0;iidx<4*DIM;iidx++)
1428     {
1429         scratch[iidx] = 0.0;
1430     }  
1431
1432     /* Start outer loop over neighborlists */
1433     for(iidx=0; iidx<nri; iidx++)
1434     {
1435         /* Load shift vector for this list */
1436         i_shift_offset   = DIM*shiftidx[iidx];
1437
1438         /* Load limits for loop over neighbors */
1439         j_index_start    = jindex[iidx];
1440         j_index_end      = jindex[iidx+1];
1441
1442         /* Get outer coordinate index */
1443         inr              = iinr[iidx];
1444         i_coord_offset   = DIM*inr;
1445
1446         /* Load i particle coords and add shift vector */
1447         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1448                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1449         
1450         fix0             = _mm_setzero_ps();
1451         fiy0             = _mm_setzero_ps();
1452         fiz0             = _mm_setzero_ps();
1453         fix1             = _mm_setzero_ps();
1454         fiy1             = _mm_setzero_ps();
1455         fiz1             = _mm_setzero_ps();
1456         fix2             = _mm_setzero_ps();
1457         fiy2             = _mm_setzero_ps();
1458         fiz2             = _mm_setzero_ps();
1459
1460         /* Start inner kernel loop */
1461         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1462         {
1463
1464             /* Get j neighbor index, and coordinate index */
1465             jnrA             = jjnr[jidx];
1466             jnrB             = jjnr[jidx+1];
1467             jnrC             = jjnr[jidx+2];
1468             jnrD             = jjnr[jidx+3];
1469             j_coord_offsetA  = DIM*jnrA;
1470             j_coord_offsetB  = DIM*jnrB;
1471             j_coord_offsetC  = DIM*jnrC;
1472             j_coord_offsetD  = DIM*jnrD;
1473
1474             /* load j atom coordinates */
1475             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1476                                               x+j_coord_offsetC,x+j_coord_offsetD,
1477                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1478
1479             /* Calculate displacement vector */
1480             dx00             = _mm_sub_ps(ix0,jx0);
1481             dy00             = _mm_sub_ps(iy0,jy0);
1482             dz00             = _mm_sub_ps(iz0,jz0);
1483             dx01             = _mm_sub_ps(ix0,jx1);
1484             dy01             = _mm_sub_ps(iy0,jy1);
1485             dz01             = _mm_sub_ps(iz0,jz1);
1486             dx02             = _mm_sub_ps(ix0,jx2);
1487             dy02             = _mm_sub_ps(iy0,jy2);
1488             dz02             = _mm_sub_ps(iz0,jz2);
1489             dx10             = _mm_sub_ps(ix1,jx0);
1490             dy10             = _mm_sub_ps(iy1,jy0);
1491             dz10             = _mm_sub_ps(iz1,jz0);
1492             dx11             = _mm_sub_ps(ix1,jx1);
1493             dy11             = _mm_sub_ps(iy1,jy1);
1494             dz11             = _mm_sub_ps(iz1,jz1);
1495             dx12             = _mm_sub_ps(ix1,jx2);
1496             dy12             = _mm_sub_ps(iy1,jy2);
1497             dz12             = _mm_sub_ps(iz1,jz2);
1498             dx20             = _mm_sub_ps(ix2,jx0);
1499             dy20             = _mm_sub_ps(iy2,jy0);
1500             dz20             = _mm_sub_ps(iz2,jz0);
1501             dx21             = _mm_sub_ps(ix2,jx1);
1502             dy21             = _mm_sub_ps(iy2,jy1);
1503             dz21             = _mm_sub_ps(iz2,jz1);
1504             dx22             = _mm_sub_ps(ix2,jx2);
1505             dy22             = _mm_sub_ps(iy2,jy2);
1506             dz22             = _mm_sub_ps(iz2,jz2);
1507
1508             /* Calculate squared distance and things based on it */
1509             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1510             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1511             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1512             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1513             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1514             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1515             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1516             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1517             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1518
1519             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1520             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1521             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1522             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1523             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1524             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1525             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1526             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1527             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1528
1529             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1530             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1531             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1532             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1533             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1534             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1535             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1536             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1537             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1538
1539             fjx0             = _mm_setzero_ps();
1540             fjy0             = _mm_setzero_ps();
1541             fjz0             = _mm_setzero_ps();
1542             fjx1             = _mm_setzero_ps();
1543             fjy1             = _mm_setzero_ps();
1544             fjz1             = _mm_setzero_ps();
1545             fjx2             = _mm_setzero_ps();
1546             fjy2             = _mm_setzero_ps();
1547             fjz2             = _mm_setzero_ps();
1548
1549             /**************************
1550              * CALCULATE INTERACTIONS *
1551              **************************/
1552
1553             r00              = _mm_mul_ps(rsq00,rinv00);
1554
1555             /* EWALD ELECTROSTATICS */
1556
1557             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1558             ewrt             = _mm_mul_ps(r00,ewtabscale);
1559             ewitab           = _mm_cvttps_epi32(ewrt);
1560             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1561             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1562                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1563                                          &ewtabF,&ewtabFn);
1564             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1565             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1566
1567             /* Analytical LJ-PME */
1568             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1569             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
1570             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1571             exponent         = gmx_simd_exp_r(ewcljrsq);
1572             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1573             poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1574             /* f6A = 6 * C6grid * (1 - poly) */
1575             f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1576             /* f6B = C6grid * exponent * beta^6 */
1577             f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1578             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1579             fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1580
1581             fscal            = _mm_add_ps(felec,fvdw);
1582
1583             /* Calculate temporary vectorial force */
1584             tx               = _mm_mul_ps(fscal,dx00);
1585             ty               = _mm_mul_ps(fscal,dy00);
1586             tz               = _mm_mul_ps(fscal,dz00);
1587
1588             /* Update vectorial force */
1589             fix0             = _mm_add_ps(fix0,tx);
1590             fiy0             = _mm_add_ps(fiy0,ty);
1591             fiz0             = _mm_add_ps(fiz0,tz);
1592
1593             fjx0             = _mm_add_ps(fjx0,tx);
1594             fjy0             = _mm_add_ps(fjy0,ty);
1595             fjz0             = _mm_add_ps(fjz0,tz);
1596             
1597             /**************************
1598              * CALCULATE INTERACTIONS *
1599              **************************/
1600
1601             r01              = _mm_mul_ps(rsq01,rinv01);
1602
1603             /* EWALD ELECTROSTATICS */
1604
1605             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1606             ewrt             = _mm_mul_ps(r01,ewtabscale);
1607             ewitab           = _mm_cvttps_epi32(ewrt);
1608             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1609             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1610                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1611                                          &ewtabF,&ewtabFn);
1612             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1613             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1614
1615             fscal            = felec;
1616
1617             /* Calculate temporary vectorial force */
1618             tx               = _mm_mul_ps(fscal,dx01);
1619             ty               = _mm_mul_ps(fscal,dy01);
1620             tz               = _mm_mul_ps(fscal,dz01);
1621
1622             /* Update vectorial force */
1623             fix0             = _mm_add_ps(fix0,tx);
1624             fiy0             = _mm_add_ps(fiy0,ty);
1625             fiz0             = _mm_add_ps(fiz0,tz);
1626
1627             fjx1             = _mm_add_ps(fjx1,tx);
1628             fjy1             = _mm_add_ps(fjy1,ty);
1629             fjz1             = _mm_add_ps(fjz1,tz);
1630             
1631             /**************************
1632              * CALCULATE INTERACTIONS *
1633              **************************/
1634
1635             r02              = _mm_mul_ps(rsq02,rinv02);
1636
1637             /* EWALD ELECTROSTATICS */
1638
1639             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1640             ewrt             = _mm_mul_ps(r02,ewtabscale);
1641             ewitab           = _mm_cvttps_epi32(ewrt);
1642             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1643             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1644                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1645                                          &ewtabF,&ewtabFn);
1646             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1647             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1648
1649             fscal            = felec;
1650
1651             /* Calculate temporary vectorial force */
1652             tx               = _mm_mul_ps(fscal,dx02);
1653             ty               = _mm_mul_ps(fscal,dy02);
1654             tz               = _mm_mul_ps(fscal,dz02);
1655
1656             /* Update vectorial force */
1657             fix0             = _mm_add_ps(fix0,tx);
1658             fiy0             = _mm_add_ps(fiy0,ty);
1659             fiz0             = _mm_add_ps(fiz0,tz);
1660
1661             fjx2             = _mm_add_ps(fjx2,tx);
1662             fjy2             = _mm_add_ps(fjy2,ty);
1663             fjz2             = _mm_add_ps(fjz2,tz);
1664             
1665             /**************************
1666              * CALCULATE INTERACTIONS *
1667              **************************/
1668
1669             r10              = _mm_mul_ps(rsq10,rinv10);
1670
1671             /* EWALD ELECTROSTATICS */
1672
1673             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1674             ewrt             = _mm_mul_ps(r10,ewtabscale);
1675             ewitab           = _mm_cvttps_epi32(ewrt);
1676             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1677             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1678                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1679                                          &ewtabF,&ewtabFn);
1680             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1681             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1682
1683             fscal            = felec;
1684
1685             /* Calculate temporary vectorial force */
1686             tx               = _mm_mul_ps(fscal,dx10);
1687             ty               = _mm_mul_ps(fscal,dy10);
1688             tz               = _mm_mul_ps(fscal,dz10);
1689
1690             /* Update vectorial force */
1691             fix1             = _mm_add_ps(fix1,tx);
1692             fiy1             = _mm_add_ps(fiy1,ty);
1693             fiz1             = _mm_add_ps(fiz1,tz);
1694
1695             fjx0             = _mm_add_ps(fjx0,tx);
1696             fjy0             = _mm_add_ps(fjy0,ty);
1697             fjz0             = _mm_add_ps(fjz0,tz);
1698             
1699             /**************************
1700              * CALCULATE INTERACTIONS *
1701              **************************/
1702
1703             r11              = _mm_mul_ps(rsq11,rinv11);
1704
1705             /* EWALD ELECTROSTATICS */
1706
1707             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1708             ewrt             = _mm_mul_ps(r11,ewtabscale);
1709             ewitab           = _mm_cvttps_epi32(ewrt);
1710             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1711             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1712                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1713                                          &ewtabF,&ewtabFn);
1714             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1715             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1716
1717             fscal            = felec;
1718
1719             /* Calculate temporary vectorial force */
1720             tx               = _mm_mul_ps(fscal,dx11);
1721             ty               = _mm_mul_ps(fscal,dy11);
1722             tz               = _mm_mul_ps(fscal,dz11);
1723
1724             /* Update vectorial force */
1725             fix1             = _mm_add_ps(fix1,tx);
1726             fiy1             = _mm_add_ps(fiy1,ty);
1727             fiz1             = _mm_add_ps(fiz1,tz);
1728
1729             fjx1             = _mm_add_ps(fjx1,tx);
1730             fjy1             = _mm_add_ps(fjy1,ty);
1731             fjz1             = _mm_add_ps(fjz1,tz);
1732             
1733             /**************************
1734              * CALCULATE INTERACTIONS *
1735              **************************/
1736
1737             r12              = _mm_mul_ps(rsq12,rinv12);
1738
1739             /* EWALD ELECTROSTATICS */
1740
1741             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1742             ewrt             = _mm_mul_ps(r12,ewtabscale);
1743             ewitab           = _mm_cvttps_epi32(ewrt);
1744             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1745             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1746                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1747                                          &ewtabF,&ewtabFn);
1748             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1749             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1750
1751             fscal            = felec;
1752
1753             /* Calculate temporary vectorial force */
1754             tx               = _mm_mul_ps(fscal,dx12);
1755             ty               = _mm_mul_ps(fscal,dy12);
1756             tz               = _mm_mul_ps(fscal,dz12);
1757
1758             /* Update vectorial force */
1759             fix1             = _mm_add_ps(fix1,tx);
1760             fiy1             = _mm_add_ps(fiy1,ty);
1761             fiz1             = _mm_add_ps(fiz1,tz);
1762
1763             fjx2             = _mm_add_ps(fjx2,tx);
1764             fjy2             = _mm_add_ps(fjy2,ty);
1765             fjz2             = _mm_add_ps(fjz2,tz);
1766             
1767             /**************************
1768              * CALCULATE INTERACTIONS *
1769              **************************/
1770
1771             r20              = _mm_mul_ps(rsq20,rinv20);
1772
1773             /* EWALD ELECTROSTATICS */
1774
1775             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1776             ewrt             = _mm_mul_ps(r20,ewtabscale);
1777             ewitab           = _mm_cvttps_epi32(ewrt);
1778             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1779             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1780                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1781                                          &ewtabF,&ewtabFn);
1782             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1783             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1784
1785             fscal            = felec;
1786
1787             /* Calculate temporary vectorial force */
1788             tx               = _mm_mul_ps(fscal,dx20);
1789             ty               = _mm_mul_ps(fscal,dy20);
1790             tz               = _mm_mul_ps(fscal,dz20);
1791
1792             /* Update vectorial force */
1793             fix2             = _mm_add_ps(fix2,tx);
1794             fiy2             = _mm_add_ps(fiy2,ty);
1795             fiz2             = _mm_add_ps(fiz2,tz);
1796
1797             fjx0             = _mm_add_ps(fjx0,tx);
1798             fjy0             = _mm_add_ps(fjy0,ty);
1799             fjz0             = _mm_add_ps(fjz0,tz);
1800             
1801             /**************************
1802              * CALCULATE INTERACTIONS *
1803              **************************/
1804
1805             r21              = _mm_mul_ps(rsq21,rinv21);
1806
1807             /* EWALD ELECTROSTATICS */
1808
1809             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1810             ewrt             = _mm_mul_ps(r21,ewtabscale);
1811             ewitab           = _mm_cvttps_epi32(ewrt);
1812             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1813             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1814                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1815                                          &ewtabF,&ewtabFn);
1816             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1817             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1818
1819             fscal            = felec;
1820
1821             /* Calculate temporary vectorial force */
1822             tx               = _mm_mul_ps(fscal,dx21);
1823             ty               = _mm_mul_ps(fscal,dy21);
1824             tz               = _mm_mul_ps(fscal,dz21);
1825
1826             /* Update vectorial force */
1827             fix2             = _mm_add_ps(fix2,tx);
1828             fiy2             = _mm_add_ps(fiy2,ty);
1829             fiz2             = _mm_add_ps(fiz2,tz);
1830
1831             fjx1             = _mm_add_ps(fjx1,tx);
1832             fjy1             = _mm_add_ps(fjy1,ty);
1833             fjz1             = _mm_add_ps(fjz1,tz);
1834             
1835             /**************************
1836              * CALCULATE INTERACTIONS *
1837              **************************/
1838
1839             r22              = _mm_mul_ps(rsq22,rinv22);
1840
1841             /* EWALD ELECTROSTATICS */
1842
1843             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1844             ewrt             = _mm_mul_ps(r22,ewtabscale);
1845             ewitab           = _mm_cvttps_epi32(ewrt);
1846             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1847             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1848                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1849                                          &ewtabF,&ewtabFn);
1850             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1851             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1852
1853             fscal            = felec;
1854
1855             /* Calculate temporary vectorial force */
1856             tx               = _mm_mul_ps(fscal,dx22);
1857             ty               = _mm_mul_ps(fscal,dy22);
1858             tz               = _mm_mul_ps(fscal,dz22);
1859
1860             /* Update vectorial force */
1861             fix2             = _mm_add_ps(fix2,tx);
1862             fiy2             = _mm_add_ps(fiy2,ty);
1863             fiz2             = _mm_add_ps(fiz2,tz);
1864
1865             fjx2             = _mm_add_ps(fjx2,tx);
1866             fjy2             = _mm_add_ps(fjy2,ty);
1867             fjz2             = _mm_add_ps(fjz2,tz);
1868             
1869             fjptrA             = f+j_coord_offsetA;
1870             fjptrB             = f+j_coord_offsetB;
1871             fjptrC             = f+j_coord_offsetC;
1872             fjptrD             = f+j_coord_offsetD;
1873
1874             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1875                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1876
1877             /* Inner loop uses 347 flops */
1878         }
1879
1880         if(jidx<j_index_end)
1881         {
1882
1883             /* Get j neighbor index, and coordinate index */
1884             jnrlistA         = jjnr[jidx];
1885             jnrlistB         = jjnr[jidx+1];
1886             jnrlistC         = jjnr[jidx+2];
1887             jnrlistD         = jjnr[jidx+3];
1888             /* Sign of each element will be negative for non-real atoms.
1889              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1890              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1891              */
1892             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1893             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1894             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1895             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1896             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1897             j_coord_offsetA  = DIM*jnrA;
1898             j_coord_offsetB  = DIM*jnrB;
1899             j_coord_offsetC  = DIM*jnrC;
1900             j_coord_offsetD  = DIM*jnrD;
1901
1902             /* load j atom coordinates */
1903             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1904                                               x+j_coord_offsetC,x+j_coord_offsetD,
1905                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1906
1907             /* Calculate displacement vector */
1908             dx00             = _mm_sub_ps(ix0,jx0);
1909             dy00             = _mm_sub_ps(iy0,jy0);
1910             dz00             = _mm_sub_ps(iz0,jz0);
1911             dx01             = _mm_sub_ps(ix0,jx1);
1912             dy01             = _mm_sub_ps(iy0,jy1);
1913             dz01             = _mm_sub_ps(iz0,jz1);
1914             dx02             = _mm_sub_ps(ix0,jx2);
1915             dy02             = _mm_sub_ps(iy0,jy2);
1916             dz02             = _mm_sub_ps(iz0,jz2);
1917             dx10             = _mm_sub_ps(ix1,jx0);
1918             dy10             = _mm_sub_ps(iy1,jy0);
1919             dz10             = _mm_sub_ps(iz1,jz0);
1920             dx11             = _mm_sub_ps(ix1,jx1);
1921             dy11             = _mm_sub_ps(iy1,jy1);
1922             dz11             = _mm_sub_ps(iz1,jz1);
1923             dx12             = _mm_sub_ps(ix1,jx2);
1924             dy12             = _mm_sub_ps(iy1,jy2);
1925             dz12             = _mm_sub_ps(iz1,jz2);
1926             dx20             = _mm_sub_ps(ix2,jx0);
1927             dy20             = _mm_sub_ps(iy2,jy0);
1928             dz20             = _mm_sub_ps(iz2,jz0);
1929             dx21             = _mm_sub_ps(ix2,jx1);
1930             dy21             = _mm_sub_ps(iy2,jy1);
1931             dz21             = _mm_sub_ps(iz2,jz1);
1932             dx22             = _mm_sub_ps(ix2,jx2);
1933             dy22             = _mm_sub_ps(iy2,jy2);
1934             dz22             = _mm_sub_ps(iz2,jz2);
1935
1936             /* Calculate squared distance and things based on it */
1937             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1938             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1939             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1940             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1941             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1942             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1943             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1944             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1945             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1946
1947             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1948             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1949             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1950             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1951             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1952             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1953             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1954             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1955             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1956
1957             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1958             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1959             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1960             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1961             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1962             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1963             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1964             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1965             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1966
1967             fjx0             = _mm_setzero_ps();
1968             fjy0             = _mm_setzero_ps();
1969             fjz0             = _mm_setzero_ps();
1970             fjx1             = _mm_setzero_ps();
1971             fjy1             = _mm_setzero_ps();
1972             fjz1             = _mm_setzero_ps();
1973             fjx2             = _mm_setzero_ps();
1974             fjy2             = _mm_setzero_ps();
1975             fjz2             = _mm_setzero_ps();
1976
1977             /**************************
1978              * CALCULATE INTERACTIONS *
1979              **************************/
1980
1981             r00              = _mm_mul_ps(rsq00,rinv00);
1982             r00              = _mm_andnot_ps(dummy_mask,r00);
1983
1984             /* EWALD ELECTROSTATICS */
1985
1986             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1987             ewrt             = _mm_mul_ps(r00,ewtabscale);
1988             ewitab           = _mm_cvttps_epi32(ewrt);
1989             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1990             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1991                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1992                                          &ewtabF,&ewtabFn);
1993             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1994             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1995
1996             /* Analytical LJ-PME */
1997             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1998             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
1999             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
2000             exponent         = gmx_simd_exp_r(ewcljrsq);
2001             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2002             poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
2003             /* f6A = 6 * C6grid * (1 - poly) */
2004             f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
2005             /* f6B = C6grid * exponent * beta^6 */
2006             f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
2007             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2008             fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2009
2010             fscal            = _mm_add_ps(felec,fvdw);
2011
2012             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2013
2014             /* Calculate temporary vectorial force */
2015             tx               = _mm_mul_ps(fscal,dx00);
2016             ty               = _mm_mul_ps(fscal,dy00);
2017             tz               = _mm_mul_ps(fscal,dz00);
2018
2019             /* Update vectorial force */
2020             fix0             = _mm_add_ps(fix0,tx);
2021             fiy0             = _mm_add_ps(fiy0,ty);
2022             fiz0             = _mm_add_ps(fiz0,tz);
2023
2024             fjx0             = _mm_add_ps(fjx0,tx);
2025             fjy0             = _mm_add_ps(fjy0,ty);
2026             fjz0             = _mm_add_ps(fjz0,tz);
2027             
2028             /**************************
2029              * CALCULATE INTERACTIONS *
2030              **************************/
2031
2032             r01              = _mm_mul_ps(rsq01,rinv01);
2033             r01              = _mm_andnot_ps(dummy_mask,r01);
2034
2035             /* EWALD ELECTROSTATICS */
2036
2037             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2038             ewrt             = _mm_mul_ps(r01,ewtabscale);
2039             ewitab           = _mm_cvttps_epi32(ewrt);
2040             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2041             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2042                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2043                                          &ewtabF,&ewtabFn);
2044             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2045             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2046
2047             fscal            = felec;
2048
2049             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2050
2051             /* Calculate temporary vectorial force */
2052             tx               = _mm_mul_ps(fscal,dx01);
2053             ty               = _mm_mul_ps(fscal,dy01);
2054             tz               = _mm_mul_ps(fscal,dz01);
2055
2056             /* Update vectorial force */
2057             fix0             = _mm_add_ps(fix0,tx);
2058             fiy0             = _mm_add_ps(fiy0,ty);
2059             fiz0             = _mm_add_ps(fiz0,tz);
2060
2061             fjx1             = _mm_add_ps(fjx1,tx);
2062             fjy1             = _mm_add_ps(fjy1,ty);
2063             fjz1             = _mm_add_ps(fjz1,tz);
2064             
2065             /**************************
2066              * CALCULATE INTERACTIONS *
2067              **************************/
2068
2069             r02              = _mm_mul_ps(rsq02,rinv02);
2070             r02              = _mm_andnot_ps(dummy_mask,r02);
2071
2072             /* EWALD ELECTROSTATICS */
2073
2074             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2075             ewrt             = _mm_mul_ps(r02,ewtabscale);
2076             ewitab           = _mm_cvttps_epi32(ewrt);
2077             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2078             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2079                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2080                                          &ewtabF,&ewtabFn);
2081             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2082             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2083
2084             fscal            = felec;
2085
2086             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2087
2088             /* Calculate temporary vectorial force */
2089             tx               = _mm_mul_ps(fscal,dx02);
2090             ty               = _mm_mul_ps(fscal,dy02);
2091             tz               = _mm_mul_ps(fscal,dz02);
2092
2093             /* Update vectorial force */
2094             fix0             = _mm_add_ps(fix0,tx);
2095             fiy0             = _mm_add_ps(fiy0,ty);
2096             fiz0             = _mm_add_ps(fiz0,tz);
2097
2098             fjx2             = _mm_add_ps(fjx2,tx);
2099             fjy2             = _mm_add_ps(fjy2,ty);
2100             fjz2             = _mm_add_ps(fjz2,tz);
2101             
2102             /**************************
2103              * CALCULATE INTERACTIONS *
2104              **************************/
2105
2106             r10              = _mm_mul_ps(rsq10,rinv10);
2107             r10              = _mm_andnot_ps(dummy_mask,r10);
2108
2109             /* EWALD ELECTROSTATICS */
2110
2111             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2112             ewrt             = _mm_mul_ps(r10,ewtabscale);
2113             ewitab           = _mm_cvttps_epi32(ewrt);
2114             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2115             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2116                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2117                                          &ewtabF,&ewtabFn);
2118             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2119             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2120
2121             fscal            = felec;
2122
2123             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2124
2125             /* Calculate temporary vectorial force */
2126             tx               = _mm_mul_ps(fscal,dx10);
2127             ty               = _mm_mul_ps(fscal,dy10);
2128             tz               = _mm_mul_ps(fscal,dz10);
2129
2130             /* Update vectorial force */
2131             fix1             = _mm_add_ps(fix1,tx);
2132             fiy1             = _mm_add_ps(fiy1,ty);
2133             fiz1             = _mm_add_ps(fiz1,tz);
2134
2135             fjx0             = _mm_add_ps(fjx0,tx);
2136             fjy0             = _mm_add_ps(fjy0,ty);
2137             fjz0             = _mm_add_ps(fjz0,tz);
2138             
2139             /**************************
2140              * CALCULATE INTERACTIONS *
2141              **************************/
2142
2143             r11              = _mm_mul_ps(rsq11,rinv11);
2144             r11              = _mm_andnot_ps(dummy_mask,r11);
2145
2146             /* EWALD ELECTROSTATICS */
2147
2148             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2149             ewrt             = _mm_mul_ps(r11,ewtabscale);
2150             ewitab           = _mm_cvttps_epi32(ewrt);
2151             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2152             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2153                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2154                                          &ewtabF,&ewtabFn);
2155             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2156             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2157
2158             fscal            = felec;
2159
2160             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2161
2162             /* Calculate temporary vectorial force */
2163             tx               = _mm_mul_ps(fscal,dx11);
2164             ty               = _mm_mul_ps(fscal,dy11);
2165             tz               = _mm_mul_ps(fscal,dz11);
2166
2167             /* Update vectorial force */
2168             fix1             = _mm_add_ps(fix1,tx);
2169             fiy1             = _mm_add_ps(fiy1,ty);
2170             fiz1             = _mm_add_ps(fiz1,tz);
2171
2172             fjx1             = _mm_add_ps(fjx1,tx);
2173             fjy1             = _mm_add_ps(fjy1,ty);
2174             fjz1             = _mm_add_ps(fjz1,tz);
2175             
2176             /**************************
2177              * CALCULATE INTERACTIONS *
2178              **************************/
2179
2180             r12              = _mm_mul_ps(rsq12,rinv12);
2181             r12              = _mm_andnot_ps(dummy_mask,r12);
2182
2183             /* EWALD ELECTROSTATICS */
2184
2185             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2186             ewrt             = _mm_mul_ps(r12,ewtabscale);
2187             ewitab           = _mm_cvttps_epi32(ewrt);
2188             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2189             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2190                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2191                                          &ewtabF,&ewtabFn);
2192             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2193             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2194
2195             fscal            = felec;
2196
2197             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2198
2199             /* Calculate temporary vectorial force */
2200             tx               = _mm_mul_ps(fscal,dx12);
2201             ty               = _mm_mul_ps(fscal,dy12);
2202             tz               = _mm_mul_ps(fscal,dz12);
2203
2204             /* Update vectorial force */
2205             fix1             = _mm_add_ps(fix1,tx);
2206             fiy1             = _mm_add_ps(fiy1,ty);
2207             fiz1             = _mm_add_ps(fiz1,tz);
2208
2209             fjx2             = _mm_add_ps(fjx2,tx);
2210             fjy2             = _mm_add_ps(fjy2,ty);
2211             fjz2             = _mm_add_ps(fjz2,tz);
2212             
2213             /**************************
2214              * CALCULATE INTERACTIONS *
2215              **************************/
2216
2217             r20              = _mm_mul_ps(rsq20,rinv20);
2218             r20              = _mm_andnot_ps(dummy_mask,r20);
2219
2220             /* EWALD ELECTROSTATICS */
2221
2222             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2223             ewrt             = _mm_mul_ps(r20,ewtabscale);
2224             ewitab           = _mm_cvttps_epi32(ewrt);
2225             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2226             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2227                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2228                                          &ewtabF,&ewtabFn);
2229             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2230             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2231
2232             fscal            = felec;
2233
2234             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2235
2236             /* Calculate temporary vectorial force */
2237             tx               = _mm_mul_ps(fscal,dx20);
2238             ty               = _mm_mul_ps(fscal,dy20);
2239             tz               = _mm_mul_ps(fscal,dz20);
2240
2241             /* Update vectorial force */
2242             fix2             = _mm_add_ps(fix2,tx);
2243             fiy2             = _mm_add_ps(fiy2,ty);
2244             fiz2             = _mm_add_ps(fiz2,tz);
2245
2246             fjx0             = _mm_add_ps(fjx0,tx);
2247             fjy0             = _mm_add_ps(fjy0,ty);
2248             fjz0             = _mm_add_ps(fjz0,tz);
2249             
2250             /**************************
2251              * CALCULATE INTERACTIONS *
2252              **************************/
2253
2254             r21              = _mm_mul_ps(rsq21,rinv21);
2255             r21              = _mm_andnot_ps(dummy_mask,r21);
2256
2257             /* EWALD ELECTROSTATICS */
2258
2259             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2260             ewrt             = _mm_mul_ps(r21,ewtabscale);
2261             ewitab           = _mm_cvttps_epi32(ewrt);
2262             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2263             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2264                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2265                                          &ewtabF,&ewtabFn);
2266             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2267             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2268
2269             fscal            = felec;
2270
2271             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2272
2273             /* Calculate temporary vectorial force */
2274             tx               = _mm_mul_ps(fscal,dx21);
2275             ty               = _mm_mul_ps(fscal,dy21);
2276             tz               = _mm_mul_ps(fscal,dz21);
2277
2278             /* Update vectorial force */
2279             fix2             = _mm_add_ps(fix2,tx);
2280             fiy2             = _mm_add_ps(fiy2,ty);
2281             fiz2             = _mm_add_ps(fiz2,tz);
2282
2283             fjx1             = _mm_add_ps(fjx1,tx);
2284             fjy1             = _mm_add_ps(fjy1,ty);
2285             fjz1             = _mm_add_ps(fjz1,tz);
2286             
2287             /**************************
2288              * CALCULATE INTERACTIONS *
2289              **************************/
2290
2291             r22              = _mm_mul_ps(rsq22,rinv22);
2292             r22              = _mm_andnot_ps(dummy_mask,r22);
2293
2294             /* EWALD ELECTROSTATICS */
2295
2296             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2297             ewrt             = _mm_mul_ps(r22,ewtabscale);
2298             ewitab           = _mm_cvttps_epi32(ewrt);
2299             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2300             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2301                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2302                                          &ewtabF,&ewtabFn);
2303             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2304             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2305
2306             fscal            = felec;
2307
2308             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2309
2310             /* Calculate temporary vectorial force */
2311             tx               = _mm_mul_ps(fscal,dx22);
2312             ty               = _mm_mul_ps(fscal,dy22);
2313             tz               = _mm_mul_ps(fscal,dz22);
2314
2315             /* Update vectorial force */
2316             fix2             = _mm_add_ps(fix2,tx);
2317             fiy2             = _mm_add_ps(fiy2,ty);
2318             fiz2             = _mm_add_ps(fiz2,tz);
2319
2320             fjx2             = _mm_add_ps(fjx2,tx);
2321             fjy2             = _mm_add_ps(fjy2,ty);
2322             fjz2             = _mm_add_ps(fjz2,tz);
2323             
2324             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2325             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2326             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2327             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2328
2329             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2330                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2331
2332             /* Inner loop uses 356 flops */
2333         }
2334
2335         /* End of innermost loop */
2336
2337         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2338                                               f+i_coord_offset,fshift+i_shift_offset);
2339
2340         /* Increment number of inner iterations */
2341         inneriter                  += j_index_end - j_index_start;
2342
2343         /* Outer loop uses 18 flops */
2344     }
2345
2346     /* Increment number of outer iterations */
2347     outeriter        += nri;
2348
2349     /* Update outer/inner flops */
2350
2351     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*356);
2352 }