Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sse2_single.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_sse2_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
104     real             *charge;
105     int              nvdwtype;
106     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107     int              *vdwtype;
108     real             *vdwparam;
109     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
110     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
111     __m128i          ewitab;
112     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
113     real             *ewtab;
114     __m128           dummy_mask,cutoff_mask;
115     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
116     __m128           one     = _mm_set1_ps(1.0);
117     __m128           two     = _mm_set1_ps(2.0);
118     x                = xx[0];
119     f                = ff[0];
120
121     nri              = nlist->nri;
122     iinr             = nlist->iinr;
123     jindex           = nlist->jindex;
124     jjnr             = nlist->jjnr;
125     shiftidx         = nlist->shift;
126     gid              = nlist->gid;
127     shiftvec         = fr->shift_vec[0];
128     fshift           = fr->fshift[0];
129     facel            = _mm_set1_ps(fr->ic->epsfac);
130     charge           = mdatoms->chargeA;
131     nvdwtype         = fr->ntype;
132     vdwparam         = fr->nbfp;
133     vdwtype          = mdatoms->typeA;
134
135     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
136     ewtab            = fr->ic->tabq_coul_FDV0;
137     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
138     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
139
140     /* Setup water-specific parameters */
141     inr              = nlist->iinr[0];
142     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
143     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
144     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
145     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
146
147     jq0              = _mm_set1_ps(charge[inr+0]);
148     jq1              = _mm_set1_ps(charge[inr+1]);
149     jq2              = _mm_set1_ps(charge[inr+2]);
150     vdwjidx0A        = 2*vdwtype[inr+0];
151     qq00             = _mm_mul_ps(iq0,jq0);
152     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
153     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
154     qq01             = _mm_mul_ps(iq0,jq1);
155     qq02             = _mm_mul_ps(iq0,jq2);
156     qq10             = _mm_mul_ps(iq1,jq0);
157     qq11             = _mm_mul_ps(iq1,jq1);
158     qq12             = _mm_mul_ps(iq1,jq2);
159     qq20             = _mm_mul_ps(iq2,jq0);
160     qq21             = _mm_mul_ps(iq2,jq1);
161     qq22             = _mm_mul_ps(iq2,jq2);
162
163     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
164     rcutoff_scalar   = fr->ic->rcoulomb;
165     rcutoff          = _mm_set1_ps(rcutoff_scalar);
166     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
167
168     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
169     rvdw             = _mm_set1_ps(fr->ic->rvdw);
170
171     /* Avoid stupid compiler warnings */
172     jnrA = jnrB = jnrC = jnrD = 0;
173     j_coord_offsetA = 0;
174     j_coord_offsetB = 0;
175     j_coord_offsetC = 0;
176     j_coord_offsetD = 0;
177
178     outeriter        = 0;
179     inneriter        = 0;
180
181     for(iidx=0;iidx<4*DIM;iidx++)
182     {
183         scratch[iidx] = 0.0;
184     }  
185
186     /* Start outer loop over neighborlists */
187     for(iidx=0; iidx<nri; iidx++)
188     {
189         /* Load shift vector for this list */
190         i_shift_offset   = DIM*shiftidx[iidx];
191
192         /* Load limits for loop over neighbors */
193         j_index_start    = jindex[iidx];
194         j_index_end      = jindex[iidx+1];
195
196         /* Get outer coordinate index */
197         inr              = iinr[iidx];
198         i_coord_offset   = DIM*inr;
199
200         /* Load i particle coords and add shift vector */
201         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
202                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
203         
204         fix0             = _mm_setzero_ps();
205         fiy0             = _mm_setzero_ps();
206         fiz0             = _mm_setzero_ps();
207         fix1             = _mm_setzero_ps();
208         fiy1             = _mm_setzero_ps();
209         fiz1             = _mm_setzero_ps();
210         fix2             = _mm_setzero_ps();
211         fiy2             = _mm_setzero_ps();
212         fiz2             = _mm_setzero_ps();
213
214         /* Reset potential sums */
215         velecsum         = _mm_setzero_ps();
216         vvdwsum          = _mm_setzero_ps();
217
218         /* Start inner kernel loop */
219         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
220         {
221
222             /* Get j neighbor index, and coordinate index */
223             jnrA             = jjnr[jidx];
224             jnrB             = jjnr[jidx+1];
225             jnrC             = jjnr[jidx+2];
226             jnrD             = jjnr[jidx+3];
227             j_coord_offsetA  = DIM*jnrA;
228             j_coord_offsetB  = DIM*jnrB;
229             j_coord_offsetC  = DIM*jnrC;
230             j_coord_offsetD  = DIM*jnrD;
231
232             /* load j atom coordinates */
233             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
234                                               x+j_coord_offsetC,x+j_coord_offsetD,
235                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
236
237             /* Calculate displacement vector */
238             dx00             = _mm_sub_ps(ix0,jx0);
239             dy00             = _mm_sub_ps(iy0,jy0);
240             dz00             = _mm_sub_ps(iz0,jz0);
241             dx01             = _mm_sub_ps(ix0,jx1);
242             dy01             = _mm_sub_ps(iy0,jy1);
243             dz01             = _mm_sub_ps(iz0,jz1);
244             dx02             = _mm_sub_ps(ix0,jx2);
245             dy02             = _mm_sub_ps(iy0,jy2);
246             dz02             = _mm_sub_ps(iz0,jz2);
247             dx10             = _mm_sub_ps(ix1,jx0);
248             dy10             = _mm_sub_ps(iy1,jy0);
249             dz10             = _mm_sub_ps(iz1,jz0);
250             dx11             = _mm_sub_ps(ix1,jx1);
251             dy11             = _mm_sub_ps(iy1,jy1);
252             dz11             = _mm_sub_ps(iz1,jz1);
253             dx12             = _mm_sub_ps(ix1,jx2);
254             dy12             = _mm_sub_ps(iy1,jy2);
255             dz12             = _mm_sub_ps(iz1,jz2);
256             dx20             = _mm_sub_ps(ix2,jx0);
257             dy20             = _mm_sub_ps(iy2,jy0);
258             dz20             = _mm_sub_ps(iz2,jz0);
259             dx21             = _mm_sub_ps(ix2,jx1);
260             dy21             = _mm_sub_ps(iy2,jy1);
261             dz21             = _mm_sub_ps(iz2,jz1);
262             dx22             = _mm_sub_ps(ix2,jx2);
263             dy22             = _mm_sub_ps(iy2,jy2);
264             dz22             = _mm_sub_ps(iz2,jz2);
265
266             /* Calculate squared distance and things based on it */
267             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
268             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
269             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
270             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
271             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
272             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
273             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
274             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
275             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
276
277             rinv00           = sse2_invsqrt_f(rsq00);
278             rinv01           = sse2_invsqrt_f(rsq01);
279             rinv02           = sse2_invsqrt_f(rsq02);
280             rinv10           = sse2_invsqrt_f(rsq10);
281             rinv11           = sse2_invsqrt_f(rsq11);
282             rinv12           = sse2_invsqrt_f(rsq12);
283             rinv20           = sse2_invsqrt_f(rsq20);
284             rinv21           = sse2_invsqrt_f(rsq21);
285             rinv22           = sse2_invsqrt_f(rsq22);
286
287             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
288             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
289             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
290             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
291             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
292             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
293             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
294             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
295             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
296
297             fjx0             = _mm_setzero_ps();
298             fjy0             = _mm_setzero_ps();
299             fjz0             = _mm_setzero_ps();
300             fjx1             = _mm_setzero_ps();
301             fjy1             = _mm_setzero_ps();
302             fjz1             = _mm_setzero_ps();
303             fjx2             = _mm_setzero_ps();
304             fjy2             = _mm_setzero_ps();
305             fjz2             = _mm_setzero_ps();
306
307             /**************************
308              * CALCULATE INTERACTIONS *
309              **************************/
310
311             if (gmx_mm_any_lt(rsq00,rcutoff2))
312             {
313
314             r00              = _mm_mul_ps(rsq00,rinv00);
315
316             /* EWALD ELECTROSTATICS */
317
318             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
319             ewrt             = _mm_mul_ps(r00,ewtabscale);
320             ewitab           = _mm_cvttps_epi32(ewrt);
321             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
322             ewitab           = _mm_slli_epi32(ewitab,2);
323             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
324             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
325             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
326             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
327             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
328             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
329             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
330             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
331             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
332
333             /* LENNARD-JONES DISPERSION/REPULSION */
334
335             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
336             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
337             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
338             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
339                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
340             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
341
342             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
343
344             /* Update potential sum for this i atom from the interaction with this j atom. */
345             velec            = _mm_and_ps(velec,cutoff_mask);
346             velecsum         = _mm_add_ps(velecsum,velec);
347             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
348             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
349
350             fscal            = _mm_add_ps(felec,fvdw);
351
352             fscal            = _mm_and_ps(fscal,cutoff_mask);
353
354             /* Calculate temporary vectorial force */
355             tx               = _mm_mul_ps(fscal,dx00);
356             ty               = _mm_mul_ps(fscal,dy00);
357             tz               = _mm_mul_ps(fscal,dz00);
358
359             /* Update vectorial force */
360             fix0             = _mm_add_ps(fix0,tx);
361             fiy0             = _mm_add_ps(fiy0,ty);
362             fiz0             = _mm_add_ps(fiz0,tz);
363
364             fjx0             = _mm_add_ps(fjx0,tx);
365             fjy0             = _mm_add_ps(fjy0,ty);
366             fjz0             = _mm_add_ps(fjz0,tz);
367             
368             }
369
370             /**************************
371              * CALCULATE INTERACTIONS *
372              **************************/
373
374             if (gmx_mm_any_lt(rsq01,rcutoff2))
375             {
376
377             r01              = _mm_mul_ps(rsq01,rinv01);
378
379             /* EWALD ELECTROSTATICS */
380
381             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
382             ewrt             = _mm_mul_ps(r01,ewtabscale);
383             ewitab           = _mm_cvttps_epi32(ewrt);
384             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
385             ewitab           = _mm_slli_epi32(ewitab,2);
386             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
387             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
388             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
389             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
390             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
391             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
392             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
393             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
394             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
395
396             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
397
398             /* Update potential sum for this i atom from the interaction with this j atom. */
399             velec            = _mm_and_ps(velec,cutoff_mask);
400             velecsum         = _mm_add_ps(velecsum,velec);
401
402             fscal            = felec;
403
404             fscal            = _mm_and_ps(fscal,cutoff_mask);
405
406             /* Calculate temporary vectorial force */
407             tx               = _mm_mul_ps(fscal,dx01);
408             ty               = _mm_mul_ps(fscal,dy01);
409             tz               = _mm_mul_ps(fscal,dz01);
410
411             /* Update vectorial force */
412             fix0             = _mm_add_ps(fix0,tx);
413             fiy0             = _mm_add_ps(fiy0,ty);
414             fiz0             = _mm_add_ps(fiz0,tz);
415
416             fjx1             = _mm_add_ps(fjx1,tx);
417             fjy1             = _mm_add_ps(fjy1,ty);
418             fjz1             = _mm_add_ps(fjz1,tz);
419             
420             }
421
422             /**************************
423              * CALCULATE INTERACTIONS *
424              **************************/
425
426             if (gmx_mm_any_lt(rsq02,rcutoff2))
427             {
428
429             r02              = _mm_mul_ps(rsq02,rinv02);
430
431             /* EWALD ELECTROSTATICS */
432
433             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
434             ewrt             = _mm_mul_ps(r02,ewtabscale);
435             ewitab           = _mm_cvttps_epi32(ewrt);
436             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
437             ewitab           = _mm_slli_epi32(ewitab,2);
438             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
439             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
440             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
441             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
442             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
443             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
444             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
445             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
446             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
447
448             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
449
450             /* Update potential sum for this i atom from the interaction with this j atom. */
451             velec            = _mm_and_ps(velec,cutoff_mask);
452             velecsum         = _mm_add_ps(velecsum,velec);
453
454             fscal            = felec;
455
456             fscal            = _mm_and_ps(fscal,cutoff_mask);
457
458             /* Calculate temporary vectorial force */
459             tx               = _mm_mul_ps(fscal,dx02);
460             ty               = _mm_mul_ps(fscal,dy02);
461             tz               = _mm_mul_ps(fscal,dz02);
462
463             /* Update vectorial force */
464             fix0             = _mm_add_ps(fix0,tx);
465             fiy0             = _mm_add_ps(fiy0,ty);
466             fiz0             = _mm_add_ps(fiz0,tz);
467
468             fjx2             = _mm_add_ps(fjx2,tx);
469             fjy2             = _mm_add_ps(fjy2,ty);
470             fjz2             = _mm_add_ps(fjz2,tz);
471             
472             }
473
474             /**************************
475              * CALCULATE INTERACTIONS *
476              **************************/
477
478             if (gmx_mm_any_lt(rsq10,rcutoff2))
479             {
480
481             r10              = _mm_mul_ps(rsq10,rinv10);
482
483             /* EWALD ELECTROSTATICS */
484
485             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
486             ewrt             = _mm_mul_ps(r10,ewtabscale);
487             ewitab           = _mm_cvttps_epi32(ewrt);
488             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
489             ewitab           = _mm_slli_epi32(ewitab,2);
490             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
491             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
492             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
493             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
494             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
495             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
496             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
497             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
498             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
499
500             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
501
502             /* Update potential sum for this i atom from the interaction with this j atom. */
503             velec            = _mm_and_ps(velec,cutoff_mask);
504             velecsum         = _mm_add_ps(velecsum,velec);
505
506             fscal            = felec;
507
508             fscal            = _mm_and_ps(fscal,cutoff_mask);
509
510             /* Calculate temporary vectorial force */
511             tx               = _mm_mul_ps(fscal,dx10);
512             ty               = _mm_mul_ps(fscal,dy10);
513             tz               = _mm_mul_ps(fscal,dz10);
514
515             /* Update vectorial force */
516             fix1             = _mm_add_ps(fix1,tx);
517             fiy1             = _mm_add_ps(fiy1,ty);
518             fiz1             = _mm_add_ps(fiz1,tz);
519
520             fjx0             = _mm_add_ps(fjx0,tx);
521             fjy0             = _mm_add_ps(fjy0,ty);
522             fjz0             = _mm_add_ps(fjz0,tz);
523             
524             }
525
526             /**************************
527              * CALCULATE INTERACTIONS *
528              **************************/
529
530             if (gmx_mm_any_lt(rsq11,rcutoff2))
531             {
532
533             r11              = _mm_mul_ps(rsq11,rinv11);
534
535             /* EWALD ELECTROSTATICS */
536
537             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
538             ewrt             = _mm_mul_ps(r11,ewtabscale);
539             ewitab           = _mm_cvttps_epi32(ewrt);
540             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
541             ewitab           = _mm_slli_epi32(ewitab,2);
542             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
543             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
544             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
545             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
546             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
547             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
548             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
549             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
550             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
551
552             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
553
554             /* Update potential sum for this i atom from the interaction with this j atom. */
555             velec            = _mm_and_ps(velec,cutoff_mask);
556             velecsum         = _mm_add_ps(velecsum,velec);
557
558             fscal            = felec;
559
560             fscal            = _mm_and_ps(fscal,cutoff_mask);
561
562             /* Calculate temporary vectorial force */
563             tx               = _mm_mul_ps(fscal,dx11);
564             ty               = _mm_mul_ps(fscal,dy11);
565             tz               = _mm_mul_ps(fscal,dz11);
566
567             /* Update vectorial force */
568             fix1             = _mm_add_ps(fix1,tx);
569             fiy1             = _mm_add_ps(fiy1,ty);
570             fiz1             = _mm_add_ps(fiz1,tz);
571
572             fjx1             = _mm_add_ps(fjx1,tx);
573             fjy1             = _mm_add_ps(fjy1,ty);
574             fjz1             = _mm_add_ps(fjz1,tz);
575             
576             }
577
578             /**************************
579              * CALCULATE INTERACTIONS *
580              **************************/
581
582             if (gmx_mm_any_lt(rsq12,rcutoff2))
583             {
584
585             r12              = _mm_mul_ps(rsq12,rinv12);
586
587             /* EWALD ELECTROSTATICS */
588
589             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
590             ewrt             = _mm_mul_ps(r12,ewtabscale);
591             ewitab           = _mm_cvttps_epi32(ewrt);
592             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
593             ewitab           = _mm_slli_epi32(ewitab,2);
594             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
595             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
596             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
597             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
598             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
599             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
600             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
601             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
602             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
603
604             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
605
606             /* Update potential sum for this i atom from the interaction with this j atom. */
607             velec            = _mm_and_ps(velec,cutoff_mask);
608             velecsum         = _mm_add_ps(velecsum,velec);
609
610             fscal            = felec;
611
612             fscal            = _mm_and_ps(fscal,cutoff_mask);
613
614             /* Calculate temporary vectorial force */
615             tx               = _mm_mul_ps(fscal,dx12);
616             ty               = _mm_mul_ps(fscal,dy12);
617             tz               = _mm_mul_ps(fscal,dz12);
618
619             /* Update vectorial force */
620             fix1             = _mm_add_ps(fix1,tx);
621             fiy1             = _mm_add_ps(fiy1,ty);
622             fiz1             = _mm_add_ps(fiz1,tz);
623
624             fjx2             = _mm_add_ps(fjx2,tx);
625             fjy2             = _mm_add_ps(fjy2,ty);
626             fjz2             = _mm_add_ps(fjz2,tz);
627             
628             }
629
630             /**************************
631              * CALCULATE INTERACTIONS *
632              **************************/
633
634             if (gmx_mm_any_lt(rsq20,rcutoff2))
635             {
636
637             r20              = _mm_mul_ps(rsq20,rinv20);
638
639             /* EWALD ELECTROSTATICS */
640
641             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
642             ewrt             = _mm_mul_ps(r20,ewtabscale);
643             ewitab           = _mm_cvttps_epi32(ewrt);
644             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
645             ewitab           = _mm_slli_epi32(ewitab,2);
646             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
647             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
648             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
649             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
650             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
651             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
652             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
653             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
654             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
655
656             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
657
658             /* Update potential sum for this i atom from the interaction with this j atom. */
659             velec            = _mm_and_ps(velec,cutoff_mask);
660             velecsum         = _mm_add_ps(velecsum,velec);
661
662             fscal            = felec;
663
664             fscal            = _mm_and_ps(fscal,cutoff_mask);
665
666             /* Calculate temporary vectorial force */
667             tx               = _mm_mul_ps(fscal,dx20);
668             ty               = _mm_mul_ps(fscal,dy20);
669             tz               = _mm_mul_ps(fscal,dz20);
670
671             /* Update vectorial force */
672             fix2             = _mm_add_ps(fix2,tx);
673             fiy2             = _mm_add_ps(fiy2,ty);
674             fiz2             = _mm_add_ps(fiz2,tz);
675
676             fjx0             = _mm_add_ps(fjx0,tx);
677             fjy0             = _mm_add_ps(fjy0,ty);
678             fjz0             = _mm_add_ps(fjz0,tz);
679             
680             }
681
682             /**************************
683              * CALCULATE INTERACTIONS *
684              **************************/
685
686             if (gmx_mm_any_lt(rsq21,rcutoff2))
687             {
688
689             r21              = _mm_mul_ps(rsq21,rinv21);
690
691             /* EWALD ELECTROSTATICS */
692
693             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
694             ewrt             = _mm_mul_ps(r21,ewtabscale);
695             ewitab           = _mm_cvttps_epi32(ewrt);
696             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
697             ewitab           = _mm_slli_epi32(ewitab,2);
698             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
699             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
700             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
701             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
702             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
703             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
704             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
705             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
706             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
707
708             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
709
710             /* Update potential sum for this i atom from the interaction with this j atom. */
711             velec            = _mm_and_ps(velec,cutoff_mask);
712             velecsum         = _mm_add_ps(velecsum,velec);
713
714             fscal            = felec;
715
716             fscal            = _mm_and_ps(fscal,cutoff_mask);
717
718             /* Calculate temporary vectorial force */
719             tx               = _mm_mul_ps(fscal,dx21);
720             ty               = _mm_mul_ps(fscal,dy21);
721             tz               = _mm_mul_ps(fscal,dz21);
722
723             /* Update vectorial force */
724             fix2             = _mm_add_ps(fix2,tx);
725             fiy2             = _mm_add_ps(fiy2,ty);
726             fiz2             = _mm_add_ps(fiz2,tz);
727
728             fjx1             = _mm_add_ps(fjx1,tx);
729             fjy1             = _mm_add_ps(fjy1,ty);
730             fjz1             = _mm_add_ps(fjz1,tz);
731             
732             }
733
734             /**************************
735              * CALCULATE INTERACTIONS *
736              **************************/
737
738             if (gmx_mm_any_lt(rsq22,rcutoff2))
739             {
740
741             r22              = _mm_mul_ps(rsq22,rinv22);
742
743             /* EWALD ELECTROSTATICS */
744
745             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
746             ewrt             = _mm_mul_ps(r22,ewtabscale);
747             ewitab           = _mm_cvttps_epi32(ewrt);
748             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
749             ewitab           = _mm_slli_epi32(ewitab,2);
750             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
751             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
752             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
753             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
754             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
755             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
756             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
757             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
758             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
759
760             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
761
762             /* Update potential sum for this i atom from the interaction with this j atom. */
763             velec            = _mm_and_ps(velec,cutoff_mask);
764             velecsum         = _mm_add_ps(velecsum,velec);
765
766             fscal            = felec;
767
768             fscal            = _mm_and_ps(fscal,cutoff_mask);
769
770             /* Calculate temporary vectorial force */
771             tx               = _mm_mul_ps(fscal,dx22);
772             ty               = _mm_mul_ps(fscal,dy22);
773             tz               = _mm_mul_ps(fscal,dz22);
774
775             /* Update vectorial force */
776             fix2             = _mm_add_ps(fix2,tx);
777             fiy2             = _mm_add_ps(fiy2,ty);
778             fiz2             = _mm_add_ps(fiz2,tz);
779
780             fjx2             = _mm_add_ps(fjx2,tx);
781             fjy2             = _mm_add_ps(fjy2,ty);
782             fjz2             = _mm_add_ps(fjz2,tz);
783             
784             }
785
786             fjptrA             = f+j_coord_offsetA;
787             fjptrB             = f+j_coord_offsetB;
788             fjptrC             = f+j_coord_offsetC;
789             fjptrD             = f+j_coord_offsetD;
790
791             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
792                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
793
794             /* Inner loop uses 432 flops */
795         }
796
797         if(jidx<j_index_end)
798         {
799
800             /* Get j neighbor index, and coordinate index */
801             jnrlistA         = jjnr[jidx];
802             jnrlistB         = jjnr[jidx+1];
803             jnrlistC         = jjnr[jidx+2];
804             jnrlistD         = jjnr[jidx+3];
805             /* Sign of each element will be negative for non-real atoms.
806              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
807              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
808              */
809             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
810             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
811             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
812             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
813             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
814             j_coord_offsetA  = DIM*jnrA;
815             j_coord_offsetB  = DIM*jnrB;
816             j_coord_offsetC  = DIM*jnrC;
817             j_coord_offsetD  = DIM*jnrD;
818
819             /* load j atom coordinates */
820             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
821                                               x+j_coord_offsetC,x+j_coord_offsetD,
822                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
823
824             /* Calculate displacement vector */
825             dx00             = _mm_sub_ps(ix0,jx0);
826             dy00             = _mm_sub_ps(iy0,jy0);
827             dz00             = _mm_sub_ps(iz0,jz0);
828             dx01             = _mm_sub_ps(ix0,jx1);
829             dy01             = _mm_sub_ps(iy0,jy1);
830             dz01             = _mm_sub_ps(iz0,jz1);
831             dx02             = _mm_sub_ps(ix0,jx2);
832             dy02             = _mm_sub_ps(iy0,jy2);
833             dz02             = _mm_sub_ps(iz0,jz2);
834             dx10             = _mm_sub_ps(ix1,jx0);
835             dy10             = _mm_sub_ps(iy1,jy0);
836             dz10             = _mm_sub_ps(iz1,jz0);
837             dx11             = _mm_sub_ps(ix1,jx1);
838             dy11             = _mm_sub_ps(iy1,jy1);
839             dz11             = _mm_sub_ps(iz1,jz1);
840             dx12             = _mm_sub_ps(ix1,jx2);
841             dy12             = _mm_sub_ps(iy1,jy2);
842             dz12             = _mm_sub_ps(iz1,jz2);
843             dx20             = _mm_sub_ps(ix2,jx0);
844             dy20             = _mm_sub_ps(iy2,jy0);
845             dz20             = _mm_sub_ps(iz2,jz0);
846             dx21             = _mm_sub_ps(ix2,jx1);
847             dy21             = _mm_sub_ps(iy2,jy1);
848             dz21             = _mm_sub_ps(iz2,jz1);
849             dx22             = _mm_sub_ps(ix2,jx2);
850             dy22             = _mm_sub_ps(iy2,jy2);
851             dz22             = _mm_sub_ps(iz2,jz2);
852
853             /* Calculate squared distance and things based on it */
854             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
855             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
856             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
857             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
858             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
859             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
860             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
861             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
862             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
863
864             rinv00           = sse2_invsqrt_f(rsq00);
865             rinv01           = sse2_invsqrt_f(rsq01);
866             rinv02           = sse2_invsqrt_f(rsq02);
867             rinv10           = sse2_invsqrt_f(rsq10);
868             rinv11           = sse2_invsqrt_f(rsq11);
869             rinv12           = sse2_invsqrt_f(rsq12);
870             rinv20           = sse2_invsqrt_f(rsq20);
871             rinv21           = sse2_invsqrt_f(rsq21);
872             rinv22           = sse2_invsqrt_f(rsq22);
873
874             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
875             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
876             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
877             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
878             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
879             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
880             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
881             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
882             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
883
884             fjx0             = _mm_setzero_ps();
885             fjy0             = _mm_setzero_ps();
886             fjz0             = _mm_setzero_ps();
887             fjx1             = _mm_setzero_ps();
888             fjy1             = _mm_setzero_ps();
889             fjz1             = _mm_setzero_ps();
890             fjx2             = _mm_setzero_ps();
891             fjy2             = _mm_setzero_ps();
892             fjz2             = _mm_setzero_ps();
893
894             /**************************
895              * CALCULATE INTERACTIONS *
896              **************************/
897
898             if (gmx_mm_any_lt(rsq00,rcutoff2))
899             {
900
901             r00              = _mm_mul_ps(rsq00,rinv00);
902             r00              = _mm_andnot_ps(dummy_mask,r00);
903
904             /* EWALD ELECTROSTATICS */
905
906             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
907             ewrt             = _mm_mul_ps(r00,ewtabscale);
908             ewitab           = _mm_cvttps_epi32(ewrt);
909             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
910             ewitab           = _mm_slli_epi32(ewitab,2);
911             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
912             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
913             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
914             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
915             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
916             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
917             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
918             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
919             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
920
921             /* LENNARD-JONES DISPERSION/REPULSION */
922
923             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
924             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
925             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
926             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
927                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
928             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
929
930             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
931
932             /* Update potential sum for this i atom from the interaction with this j atom. */
933             velec            = _mm_and_ps(velec,cutoff_mask);
934             velec            = _mm_andnot_ps(dummy_mask,velec);
935             velecsum         = _mm_add_ps(velecsum,velec);
936             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
937             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
938             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
939
940             fscal            = _mm_add_ps(felec,fvdw);
941
942             fscal            = _mm_and_ps(fscal,cutoff_mask);
943
944             fscal            = _mm_andnot_ps(dummy_mask,fscal);
945
946             /* Calculate temporary vectorial force */
947             tx               = _mm_mul_ps(fscal,dx00);
948             ty               = _mm_mul_ps(fscal,dy00);
949             tz               = _mm_mul_ps(fscal,dz00);
950
951             /* Update vectorial force */
952             fix0             = _mm_add_ps(fix0,tx);
953             fiy0             = _mm_add_ps(fiy0,ty);
954             fiz0             = _mm_add_ps(fiz0,tz);
955
956             fjx0             = _mm_add_ps(fjx0,tx);
957             fjy0             = _mm_add_ps(fjy0,ty);
958             fjz0             = _mm_add_ps(fjz0,tz);
959             
960             }
961
962             /**************************
963              * CALCULATE INTERACTIONS *
964              **************************/
965
966             if (gmx_mm_any_lt(rsq01,rcutoff2))
967             {
968
969             r01              = _mm_mul_ps(rsq01,rinv01);
970             r01              = _mm_andnot_ps(dummy_mask,r01);
971
972             /* EWALD ELECTROSTATICS */
973
974             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
975             ewrt             = _mm_mul_ps(r01,ewtabscale);
976             ewitab           = _mm_cvttps_epi32(ewrt);
977             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
978             ewitab           = _mm_slli_epi32(ewitab,2);
979             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
980             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
981             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
982             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
983             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
984             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
985             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
986             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
987             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
988
989             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
990
991             /* Update potential sum for this i atom from the interaction with this j atom. */
992             velec            = _mm_and_ps(velec,cutoff_mask);
993             velec            = _mm_andnot_ps(dummy_mask,velec);
994             velecsum         = _mm_add_ps(velecsum,velec);
995
996             fscal            = felec;
997
998             fscal            = _mm_and_ps(fscal,cutoff_mask);
999
1000             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1001
1002             /* Calculate temporary vectorial force */
1003             tx               = _mm_mul_ps(fscal,dx01);
1004             ty               = _mm_mul_ps(fscal,dy01);
1005             tz               = _mm_mul_ps(fscal,dz01);
1006
1007             /* Update vectorial force */
1008             fix0             = _mm_add_ps(fix0,tx);
1009             fiy0             = _mm_add_ps(fiy0,ty);
1010             fiz0             = _mm_add_ps(fiz0,tz);
1011
1012             fjx1             = _mm_add_ps(fjx1,tx);
1013             fjy1             = _mm_add_ps(fjy1,ty);
1014             fjz1             = _mm_add_ps(fjz1,tz);
1015             
1016             }
1017
1018             /**************************
1019              * CALCULATE INTERACTIONS *
1020              **************************/
1021
1022             if (gmx_mm_any_lt(rsq02,rcutoff2))
1023             {
1024
1025             r02              = _mm_mul_ps(rsq02,rinv02);
1026             r02              = _mm_andnot_ps(dummy_mask,r02);
1027
1028             /* EWALD ELECTROSTATICS */
1029
1030             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1031             ewrt             = _mm_mul_ps(r02,ewtabscale);
1032             ewitab           = _mm_cvttps_epi32(ewrt);
1033             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1034             ewitab           = _mm_slli_epi32(ewitab,2);
1035             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1036             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1037             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1038             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1039             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1040             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1041             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1042             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
1043             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1044
1045             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1046
1047             /* Update potential sum for this i atom from the interaction with this j atom. */
1048             velec            = _mm_and_ps(velec,cutoff_mask);
1049             velec            = _mm_andnot_ps(dummy_mask,velec);
1050             velecsum         = _mm_add_ps(velecsum,velec);
1051
1052             fscal            = felec;
1053
1054             fscal            = _mm_and_ps(fscal,cutoff_mask);
1055
1056             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1057
1058             /* Calculate temporary vectorial force */
1059             tx               = _mm_mul_ps(fscal,dx02);
1060             ty               = _mm_mul_ps(fscal,dy02);
1061             tz               = _mm_mul_ps(fscal,dz02);
1062
1063             /* Update vectorial force */
1064             fix0             = _mm_add_ps(fix0,tx);
1065             fiy0             = _mm_add_ps(fiy0,ty);
1066             fiz0             = _mm_add_ps(fiz0,tz);
1067
1068             fjx2             = _mm_add_ps(fjx2,tx);
1069             fjy2             = _mm_add_ps(fjy2,ty);
1070             fjz2             = _mm_add_ps(fjz2,tz);
1071             
1072             }
1073
1074             /**************************
1075              * CALCULATE INTERACTIONS *
1076              **************************/
1077
1078             if (gmx_mm_any_lt(rsq10,rcutoff2))
1079             {
1080
1081             r10              = _mm_mul_ps(rsq10,rinv10);
1082             r10              = _mm_andnot_ps(dummy_mask,r10);
1083
1084             /* EWALD ELECTROSTATICS */
1085
1086             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1087             ewrt             = _mm_mul_ps(r10,ewtabscale);
1088             ewitab           = _mm_cvttps_epi32(ewrt);
1089             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1090             ewitab           = _mm_slli_epi32(ewitab,2);
1091             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1092             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1093             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1094             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1095             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1096             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1097             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1098             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
1099             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1100
1101             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1102
1103             /* Update potential sum for this i atom from the interaction with this j atom. */
1104             velec            = _mm_and_ps(velec,cutoff_mask);
1105             velec            = _mm_andnot_ps(dummy_mask,velec);
1106             velecsum         = _mm_add_ps(velecsum,velec);
1107
1108             fscal            = felec;
1109
1110             fscal            = _mm_and_ps(fscal,cutoff_mask);
1111
1112             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1113
1114             /* Calculate temporary vectorial force */
1115             tx               = _mm_mul_ps(fscal,dx10);
1116             ty               = _mm_mul_ps(fscal,dy10);
1117             tz               = _mm_mul_ps(fscal,dz10);
1118
1119             /* Update vectorial force */
1120             fix1             = _mm_add_ps(fix1,tx);
1121             fiy1             = _mm_add_ps(fiy1,ty);
1122             fiz1             = _mm_add_ps(fiz1,tz);
1123
1124             fjx0             = _mm_add_ps(fjx0,tx);
1125             fjy0             = _mm_add_ps(fjy0,ty);
1126             fjz0             = _mm_add_ps(fjz0,tz);
1127             
1128             }
1129
1130             /**************************
1131              * CALCULATE INTERACTIONS *
1132              **************************/
1133
1134             if (gmx_mm_any_lt(rsq11,rcutoff2))
1135             {
1136
1137             r11              = _mm_mul_ps(rsq11,rinv11);
1138             r11              = _mm_andnot_ps(dummy_mask,r11);
1139
1140             /* EWALD ELECTROSTATICS */
1141
1142             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1143             ewrt             = _mm_mul_ps(r11,ewtabscale);
1144             ewitab           = _mm_cvttps_epi32(ewrt);
1145             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1146             ewitab           = _mm_slli_epi32(ewitab,2);
1147             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1148             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1149             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1150             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1151             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1152             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1153             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1154             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1155             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1156
1157             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1158
1159             /* Update potential sum for this i atom from the interaction with this j atom. */
1160             velec            = _mm_and_ps(velec,cutoff_mask);
1161             velec            = _mm_andnot_ps(dummy_mask,velec);
1162             velecsum         = _mm_add_ps(velecsum,velec);
1163
1164             fscal            = felec;
1165
1166             fscal            = _mm_and_ps(fscal,cutoff_mask);
1167
1168             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1169
1170             /* Calculate temporary vectorial force */
1171             tx               = _mm_mul_ps(fscal,dx11);
1172             ty               = _mm_mul_ps(fscal,dy11);
1173             tz               = _mm_mul_ps(fscal,dz11);
1174
1175             /* Update vectorial force */
1176             fix1             = _mm_add_ps(fix1,tx);
1177             fiy1             = _mm_add_ps(fiy1,ty);
1178             fiz1             = _mm_add_ps(fiz1,tz);
1179
1180             fjx1             = _mm_add_ps(fjx1,tx);
1181             fjy1             = _mm_add_ps(fjy1,ty);
1182             fjz1             = _mm_add_ps(fjz1,tz);
1183             
1184             }
1185
1186             /**************************
1187              * CALCULATE INTERACTIONS *
1188              **************************/
1189
1190             if (gmx_mm_any_lt(rsq12,rcutoff2))
1191             {
1192
1193             r12              = _mm_mul_ps(rsq12,rinv12);
1194             r12              = _mm_andnot_ps(dummy_mask,r12);
1195
1196             /* EWALD ELECTROSTATICS */
1197
1198             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1199             ewrt             = _mm_mul_ps(r12,ewtabscale);
1200             ewitab           = _mm_cvttps_epi32(ewrt);
1201             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1202             ewitab           = _mm_slli_epi32(ewitab,2);
1203             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1204             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1205             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1206             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1207             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1208             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1209             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1210             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1211             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1212
1213             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1214
1215             /* Update potential sum for this i atom from the interaction with this j atom. */
1216             velec            = _mm_and_ps(velec,cutoff_mask);
1217             velec            = _mm_andnot_ps(dummy_mask,velec);
1218             velecsum         = _mm_add_ps(velecsum,velec);
1219
1220             fscal            = felec;
1221
1222             fscal            = _mm_and_ps(fscal,cutoff_mask);
1223
1224             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1225
1226             /* Calculate temporary vectorial force */
1227             tx               = _mm_mul_ps(fscal,dx12);
1228             ty               = _mm_mul_ps(fscal,dy12);
1229             tz               = _mm_mul_ps(fscal,dz12);
1230
1231             /* Update vectorial force */
1232             fix1             = _mm_add_ps(fix1,tx);
1233             fiy1             = _mm_add_ps(fiy1,ty);
1234             fiz1             = _mm_add_ps(fiz1,tz);
1235
1236             fjx2             = _mm_add_ps(fjx2,tx);
1237             fjy2             = _mm_add_ps(fjy2,ty);
1238             fjz2             = _mm_add_ps(fjz2,tz);
1239             
1240             }
1241
1242             /**************************
1243              * CALCULATE INTERACTIONS *
1244              **************************/
1245
1246             if (gmx_mm_any_lt(rsq20,rcutoff2))
1247             {
1248
1249             r20              = _mm_mul_ps(rsq20,rinv20);
1250             r20              = _mm_andnot_ps(dummy_mask,r20);
1251
1252             /* EWALD ELECTROSTATICS */
1253
1254             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1255             ewrt             = _mm_mul_ps(r20,ewtabscale);
1256             ewitab           = _mm_cvttps_epi32(ewrt);
1257             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1258             ewitab           = _mm_slli_epi32(ewitab,2);
1259             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1260             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1261             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1262             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1263             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1264             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1265             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1266             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
1267             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1268
1269             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1270
1271             /* Update potential sum for this i atom from the interaction with this j atom. */
1272             velec            = _mm_and_ps(velec,cutoff_mask);
1273             velec            = _mm_andnot_ps(dummy_mask,velec);
1274             velecsum         = _mm_add_ps(velecsum,velec);
1275
1276             fscal            = felec;
1277
1278             fscal            = _mm_and_ps(fscal,cutoff_mask);
1279
1280             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1281
1282             /* Calculate temporary vectorial force */
1283             tx               = _mm_mul_ps(fscal,dx20);
1284             ty               = _mm_mul_ps(fscal,dy20);
1285             tz               = _mm_mul_ps(fscal,dz20);
1286
1287             /* Update vectorial force */
1288             fix2             = _mm_add_ps(fix2,tx);
1289             fiy2             = _mm_add_ps(fiy2,ty);
1290             fiz2             = _mm_add_ps(fiz2,tz);
1291
1292             fjx0             = _mm_add_ps(fjx0,tx);
1293             fjy0             = _mm_add_ps(fjy0,ty);
1294             fjz0             = _mm_add_ps(fjz0,tz);
1295             
1296             }
1297
1298             /**************************
1299              * CALCULATE INTERACTIONS *
1300              **************************/
1301
1302             if (gmx_mm_any_lt(rsq21,rcutoff2))
1303             {
1304
1305             r21              = _mm_mul_ps(rsq21,rinv21);
1306             r21              = _mm_andnot_ps(dummy_mask,r21);
1307
1308             /* EWALD ELECTROSTATICS */
1309
1310             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1311             ewrt             = _mm_mul_ps(r21,ewtabscale);
1312             ewitab           = _mm_cvttps_epi32(ewrt);
1313             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1314             ewitab           = _mm_slli_epi32(ewitab,2);
1315             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1316             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1317             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1318             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1319             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1320             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1321             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1322             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1323             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1324
1325             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1326
1327             /* Update potential sum for this i atom from the interaction with this j atom. */
1328             velec            = _mm_and_ps(velec,cutoff_mask);
1329             velec            = _mm_andnot_ps(dummy_mask,velec);
1330             velecsum         = _mm_add_ps(velecsum,velec);
1331
1332             fscal            = felec;
1333
1334             fscal            = _mm_and_ps(fscal,cutoff_mask);
1335
1336             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1337
1338             /* Calculate temporary vectorial force */
1339             tx               = _mm_mul_ps(fscal,dx21);
1340             ty               = _mm_mul_ps(fscal,dy21);
1341             tz               = _mm_mul_ps(fscal,dz21);
1342
1343             /* Update vectorial force */
1344             fix2             = _mm_add_ps(fix2,tx);
1345             fiy2             = _mm_add_ps(fiy2,ty);
1346             fiz2             = _mm_add_ps(fiz2,tz);
1347
1348             fjx1             = _mm_add_ps(fjx1,tx);
1349             fjy1             = _mm_add_ps(fjy1,ty);
1350             fjz1             = _mm_add_ps(fjz1,tz);
1351             
1352             }
1353
1354             /**************************
1355              * CALCULATE INTERACTIONS *
1356              **************************/
1357
1358             if (gmx_mm_any_lt(rsq22,rcutoff2))
1359             {
1360
1361             r22              = _mm_mul_ps(rsq22,rinv22);
1362             r22              = _mm_andnot_ps(dummy_mask,r22);
1363
1364             /* EWALD ELECTROSTATICS */
1365
1366             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1367             ewrt             = _mm_mul_ps(r22,ewtabscale);
1368             ewitab           = _mm_cvttps_epi32(ewrt);
1369             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1370             ewitab           = _mm_slli_epi32(ewitab,2);
1371             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1372             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1373             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1374             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1375             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1376             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1377             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1378             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1379             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1380
1381             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1382
1383             /* Update potential sum for this i atom from the interaction with this j atom. */
1384             velec            = _mm_and_ps(velec,cutoff_mask);
1385             velec            = _mm_andnot_ps(dummy_mask,velec);
1386             velecsum         = _mm_add_ps(velecsum,velec);
1387
1388             fscal            = felec;
1389
1390             fscal            = _mm_and_ps(fscal,cutoff_mask);
1391
1392             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1393
1394             /* Calculate temporary vectorial force */
1395             tx               = _mm_mul_ps(fscal,dx22);
1396             ty               = _mm_mul_ps(fscal,dy22);
1397             tz               = _mm_mul_ps(fscal,dz22);
1398
1399             /* Update vectorial force */
1400             fix2             = _mm_add_ps(fix2,tx);
1401             fiy2             = _mm_add_ps(fiy2,ty);
1402             fiz2             = _mm_add_ps(fiz2,tz);
1403
1404             fjx2             = _mm_add_ps(fjx2,tx);
1405             fjy2             = _mm_add_ps(fjy2,ty);
1406             fjz2             = _mm_add_ps(fjz2,tz);
1407             
1408             }
1409
1410             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1411             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1412             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1413             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1414
1415             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1416                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1417
1418             /* Inner loop uses 441 flops */
1419         }
1420
1421         /* End of innermost loop */
1422
1423         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1424                                               f+i_coord_offset,fshift+i_shift_offset);
1425
1426         ggid                        = gid[iidx];
1427         /* Update potential energies */
1428         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1429         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1430
1431         /* Increment number of inner iterations */
1432         inneriter                  += j_index_end - j_index_start;
1433
1434         /* Outer loop uses 20 flops */
1435     }
1436
1437     /* Increment number of outer iterations */
1438     outeriter        += nri;
1439
1440     /* Update outer/inner flops */
1441
1442     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*441);
1443 }
1444 /*
1445  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single
1446  * Electrostatics interaction: Ewald
1447  * VdW interaction:            LennardJones
1448  * Geometry:                   Water3-Water3
1449  * Calculate force/pot:        Force
1450  */
1451 void
1452 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single
1453                     (t_nblist                    * gmx_restrict       nlist,
1454                      rvec                        * gmx_restrict          xx,
1455                      rvec                        * gmx_restrict          ff,
1456                      struct t_forcerec           * gmx_restrict          fr,
1457                      t_mdatoms                   * gmx_restrict     mdatoms,
1458                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1459                      t_nrnb                      * gmx_restrict        nrnb)
1460 {
1461     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1462      * just 0 for non-waters.
1463      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1464      * jnr indices corresponding to data put in the four positions in the SIMD register.
1465      */
1466     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1467     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1468     int              jnrA,jnrB,jnrC,jnrD;
1469     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1470     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1471     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1472     real             rcutoff_scalar;
1473     real             *shiftvec,*fshift,*x,*f;
1474     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1475     real             scratch[4*DIM];
1476     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1477     int              vdwioffset0;
1478     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1479     int              vdwioffset1;
1480     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1481     int              vdwioffset2;
1482     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1483     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1484     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1485     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1486     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1487     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1488     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1489     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1490     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1491     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1492     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1493     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1494     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1495     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1496     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1497     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1498     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1499     real             *charge;
1500     int              nvdwtype;
1501     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1502     int              *vdwtype;
1503     real             *vdwparam;
1504     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1505     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1506     __m128i          ewitab;
1507     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1508     real             *ewtab;
1509     __m128           dummy_mask,cutoff_mask;
1510     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1511     __m128           one     = _mm_set1_ps(1.0);
1512     __m128           two     = _mm_set1_ps(2.0);
1513     x                = xx[0];
1514     f                = ff[0];
1515
1516     nri              = nlist->nri;
1517     iinr             = nlist->iinr;
1518     jindex           = nlist->jindex;
1519     jjnr             = nlist->jjnr;
1520     shiftidx         = nlist->shift;
1521     gid              = nlist->gid;
1522     shiftvec         = fr->shift_vec[0];
1523     fshift           = fr->fshift[0];
1524     facel            = _mm_set1_ps(fr->ic->epsfac);
1525     charge           = mdatoms->chargeA;
1526     nvdwtype         = fr->ntype;
1527     vdwparam         = fr->nbfp;
1528     vdwtype          = mdatoms->typeA;
1529
1530     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1531     ewtab            = fr->ic->tabq_coul_F;
1532     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1533     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1534
1535     /* Setup water-specific parameters */
1536     inr              = nlist->iinr[0];
1537     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1538     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1539     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1540     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1541
1542     jq0              = _mm_set1_ps(charge[inr+0]);
1543     jq1              = _mm_set1_ps(charge[inr+1]);
1544     jq2              = _mm_set1_ps(charge[inr+2]);
1545     vdwjidx0A        = 2*vdwtype[inr+0];
1546     qq00             = _mm_mul_ps(iq0,jq0);
1547     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1548     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1549     qq01             = _mm_mul_ps(iq0,jq1);
1550     qq02             = _mm_mul_ps(iq0,jq2);
1551     qq10             = _mm_mul_ps(iq1,jq0);
1552     qq11             = _mm_mul_ps(iq1,jq1);
1553     qq12             = _mm_mul_ps(iq1,jq2);
1554     qq20             = _mm_mul_ps(iq2,jq0);
1555     qq21             = _mm_mul_ps(iq2,jq1);
1556     qq22             = _mm_mul_ps(iq2,jq2);
1557
1558     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1559     rcutoff_scalar   = fr->ic->rcoulomb;
1560     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1561     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1562
1563     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
1564     rvdw             = _mm_set1_ps(fr->ic->rvdw);
1565
1566     /* Avoid stupid compiler warnings */
1567     jnrA = jnrB = jnrC = jnrD = 0;
1568     j_coord_offsetA = 0;
1569     j_coord_offsetB = 0;
1570     j_coord_offsetC = 0;
1571     j_coord_offsetD = 0;
1572
1573     outeriter        = 0;
1574     inneriter        = 0;
1575
1576     for(iidx=0;iidx<4*DIM;iidx++)
1577     {
1578         scratch[iidx] = 0.0;
1579     }  
1580
1581     /* Start outer loop over neighborlists */
1582     for(iidx=0; iidx<nri; iidx++)
1583     {
1584         /* Load shift vector for this list */
1585         i_shift_offset   = DIM*shiftidx[iidx];
1586
1587         /* Load limits for loop over neighbors */
1588         j_index_start    = jindex[iidx];
1589         j_index_end      = jindex[iidx+1];
1590
1591         /* Get outer coordinate index */
1592         inr              = iinr[iidx];
1593         i_coord_offset   = DIM*inr;
1594
1595         /* Load i particle coords and add shift vector */
1596         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1597                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1598         
1599         fix0             = _mm_setzero_ps();
1600         fiy0             = _mm_setzero_ps();
1601         fiz0             = _mm_setzero_ps();
1602         fix1             = _mm_setzero_ps();
1603         fiy1             = _mm_setzero_ps();
1604         fiz1             = _mm_setzero_ps();
1605         fix2             = _mm_setzero_ps();
1606         fiy2             = _mm_setzero_ps();
1607         fiz2             = _mm_setzero_ps();
1608
1609         /* Start inner kernel loop */
1610         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1611         {
1612
1613             /* Get j neighbor index, and coordinate index */
1614             jnrA             = jjnr[jidx];
1615             jnrB             = jjnr[jidx+1];
1616             jnrC             = jjnr[jidx+2];
1617             jnrD             = jjnr[jidx+3];
1618             j_coord_offsetA  = DIM*jnrA;
1619             j_coord_offsetB  = DIM*jnrB;
1620             j_coord_offsetC  = DIM*jnrC;
1621             j_coord_offsetD  = DIM*jnrD;
1622
1623             /* load j atom coordinates */
1624             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1625                                               x+j_coord_offsetC,x+j_coord_offsetD,
1626                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1627
1628             /* Calculate displacement vector */
1629             dx00             = _mm_sub_ps(ix0,jx0);
1630             dy00             = _mm_sub_ps(iy0,jy0);
1631             dz00             = _mm_sub_ps(iz0,jz0);
1632             dx01             = _mm_sub_ps(ix0,jx1);
1633             dy01             = _mm_sub_ps(iy0,jy1);
1634             dz01             = _mm_sub_ps(iz0,jz1);
1635             dx02             = _mm_sub_ps(ix0,jx2);
1636             dy02             = _mm_sub_ps(iy0,jy2);
1637             dz02             = _mm_sub_ps(iz0,jz2);
1638             dx10             = _mm_sub_ps(ix1,jx0);
1639             dy10             = _mm_sub_ps(iy1,jy0);
1640             dz10             = _mm_sub_ps(iz1,jz0);
1641             dx11             = _mm_sub_ps(ix1,jx1);
1642             dy11             = _mm_sub_ps(iy1,jy1);
1643             dz11             = _mm_sub_ps(iz1,jz1);
1644             dx12             = _mm_sub_ps(ix1,jx2);
1645             dy12             = _mm_sub_ps(iy1,jy2);
1646             dz12             = _mm_sub_ps(iz1,jz2);
1647             dx20             = _mm_sub_ps(ix2,jx0);
1648             dy20             = _mm_sub_ps(iy2,jy0);
1649             dz20             = _mm_sub_ps(iz2,jz0);
1650             dx21             = _mm_sub_ps(ix2,jx1);
1651             dy21             = _mm_sub_ps(iy2,jy1);
1652             dz21             = _mm_sub_ps(iz2,jz1);
1653             dx22             = _mm_sub_ps(ix2,jx2);
1654             dy22             = _mm_sub_ps(iy2,jy2);
1655             dz22             = _mm_sub_ps(iz2,jz2);
1656
1657             /* Calculate squared distance and things based on it */
1658             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1659             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1660             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1661             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1662             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1663             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1664             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1665             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1666             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1667
1668             rinv00           = sse2_invsqrt_f(rsq00);
1669             rinv01           = sse2_invsqrt_f(rsq01);
1670             rinv02           = sse2_invsqrt_f(rsq02);
1671             rinv10           = sse2_invsqrt_f(rsq10);
1672             rinv11           = sse2_invsqrt_f(rsq11);
1673             rinv12           = sse2_invsqrt_f(rsq12);
1674             rinv20           = sse2_invsqrt_f(rsq20);
1675             rinv21           = sse2_invsqrt_f(rsq21);
1676             rinv22           = sse2_invsqrt_f(rsq22);
1677
1678             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1679             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1680             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1681             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1682             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1683             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1684             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1685             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1686             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1687
1688             fjx0             = _mm_setzero_ps();
1689             fjy0             = _mm_setzero_ps();
1690             fjz0             = _mm_setzero_ps();
1691             fjx1             = _mm_setzero_ps();
1692             fjy1             = _mm_setzero_ps();
1693             fjz1             = _mm_setzero_ps();
1694             fjx2             = _mm_setzero_ps();
1695             fjy2             = _mm_setzero_ps();
1696             fjz2             = _mm_setzero_ps();
1697
1698             /**************************
1699              * CALCULATE INTERACTIONS *
1700              **************************/
1701
1702             if (gmx_mm_any_lt(rsq00,rcutoff2))
1703             {
1704
1705             r00              = _mm_mul_ps(rsq00,rinv00);
1706
1707             /* EWALD ELECTROSTATICS */
1708
1709             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1710             ewrt             = _mm_mul_ps(r00,ewtabscale);
1711             ewitab           = _mm_cvttps_epi32(ewrt);
1712             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1713             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1714                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1715                                          &ewtabF,&ewtabFn);
1716             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1717             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1718
1719             /* LENNARD-JONES DISPERSION/REPULSION */
1720
1721             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1722             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1723
1724             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1725
1726             fscal            = _mm_add_ps(felec,fvdw);
1727
1728             fscal            = _mm_and_ps(fscal,cutoff_mask);
1729
1730             /* Calculate temporary vectorial force */
1731             tx               = _mm_mul_ps(fscal,dx00);
1732             ty               = _mm_mul_ps(fscal,dy00);
1733             tz               = _mm_mul_ps(fscal,dz00);
1734
1735             /* Update vectorial force */
1736             fix0             = _mm_add_ps(fix0,tx);
1737             fiy0             = _mm_add_ps(fiy0,ty);
1738             fiz0             = _mm_add_ps(fiz0,tz);
1739
1740             fjx0             = _mm_add_ps(fjx0,tx);
1741             fjy0             = _mm_add_ps(fjy0,ty);
1742             fjz0             = _mm_add_ps(fjz0,tz);
1743             
1744             }
1745
1746             /**************************
1747              * CALCULATE INTERACTIONS *
1748              **************************/
1749
1750             if (gmx_mm_any_lt(rsq01,rcutoff2))
1751             {
1752
1753             r01              = _mm_mul_ps(rsq01,rinv01);
1754
1755             /* EWALD ELECTROSTATICS */
1756
1757             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1758             ewrt             = _mm_mul_ps(r01,ewtabscale);
1759             ewitab           = _mm_cvttps_epi32(ewrt);
1760             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1761             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1762                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1763                                          &ewtabF,&ewtabFn);
1764             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1765             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1766
1767             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1768
1769             fscal            = felec;
1770
1771             fscal            = _mm_and_ps(fscal,cutoff_mask);
1772
1773             /* Calculate temporary vectorial force */
1774             tx               = _mm_mul_ps(fscal,dx01);
1775             ty               = _mm_mul_ps(fscal,dy01);
1776             tz               = _mm_mul_ps(fscal,dz01);
1777
1778             /* Update vectorial force */
1779             fix0             = _mm_add_ps(fix0,tx);
1780             fiy0             = _mm_add_ps(fiy0,ty);
1781             fiz0             = _mm_add_ps(fiz0,tz);
1782
1783             fjx1             = _mm_add_ps(fjx1,tx);
1784             fjy1             = _mm_add_ps(fjy1,ty);
1785             fjz1             = _mm_add_ps(fjz1,tz);
1786             
1787             }
1788
1789             /**************************
1790              * CALCULATE INTERACTIONS *
1791              **************************/
1792
1793             if (gmx_mm_any_lt(rsq02,rcutoff2))
1794             {
1795
1796             r02              = _mm_mul_ps(rsq02,rinv02);
1797
1798             /* EWALD ELECTROSTATICS */
1799
1800             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1801             ewrt             = _mm_mul_ps(r02,ewtabscale);
1802             ewitab           = _mm_cvttps_epi32(ewrt);
1803             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1804             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1805                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1806                                          &ewtabF,&ewtabFn);
1807             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1808             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1809
1810             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1811
1812             fscal            = felec;
1813
1814             fscal            = _mm_and_ps(fscal,cutoff_mask);
1815
1816             /* Calculate temporary vectorial force */
1817             tx               = _mm_mul_ps(fscal,dx02);
1818             ty               = _mm_mul_ps(fscal,dy02);
1819             tz               = _mm_mul_ps(fscal,dz02);
1820
1821             /* Update vectorial force */
1822             fix0             = _mm_add_ps(fix0,tx);
1823             fiy0             = _mm_add_ps(fiy0,ty);
1824             fiz0             = _mm_add_ps(fiz0,tz);
1825
1826             fjx2             = _mm_add_ps(fjx2,tx);
1827             fjy2             = _mm_add_ps(fjy2,ty);
1828             fjz2             = _mm_add_ps(fjz2,tz);
1829             
1830             }
1831
1832             /**************************
1833              * CALCULATE INTERACTIONS *
1834              **************************/
1835
1836             if (gmx_mm_any_lt(rsq10,rcutoff2))
1837             {
1838
1839             r10              = _mm_mul_ps(rsq10,rinv10);
1840
1841             /* EWALD ELECTROSTATICS */
1842
1843             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1844             ewrt             = _mm_mul_ps(r10,ewtabscale);
1845             ewitab           = _mm_cvttps_epi32(ewrt);
1846             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1847             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1848                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1849                                          &ewtabF,&ewtabFn);
1850             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1851             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1852
1853             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1854
1855             fscal            = felec;
1856
1857             fscal            = _mm_and_ps(fscal,cutoff_mask);
1858
1859             /* Calculate temporary vectorial force */
1860             tx               = _mm_mul_ps(fscal,dx10);
1861             ty               = _mm_mul_ps(fscal,dy10);
1862             tz               = _mm_mul_ps(fscal,dz10);
1863
1864             /* Update vectorial force */
1865             fix1             = _mm_add_ps(fix1,tx);
1866             fiy1             = _mm_add_ps(fiy1,ty);
1867             fiz1             = _mm_add_ps(fiz1,tz);
1868
1869             fjx0             = _mm_add_ps(fjx0,tx);
1870             fjy0             = _mm_add_ps(fjy0,ty);
1871             fjz0             = _mm_add_ps(fjz0,tz);
1872             
1873             }
1874
1875             /**************************
1876              * CALCULATE INTERACTIONS *
1877              **************************/
1878
1879             if (gmx_mm_any_lt(rsq11,rcutoff2))
1880             {
1881
1882             r11              = _mm_mul_ps(rsq11,rinv11);
1883
1884             /* EWALD ELECTROSTATICS */
1885
1886             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1887             ewrt             = _mm_mul_ps(r11,ewtabscale);
1888             ewitab           = _mm_cvttps_epi32(ewrt);
1889             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1890             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1891                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1892                                          &ewtabF,&ewtabFn);
1893             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1894             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1895
1896             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1897
1898             fscal            = felec;
1899
1900             fscal            = _mm_and_ps(fscal,cutoff_mask);
1901
1902             /* Calculate temporary vectorial force */
1903             tx               = _mm_mul_ps(fscal,dx11);
1904             ty               = _mm_mul_ps(fscal,dy11);
1905             tz               = _mm_mul_ps(fscal,dz11);
1906
1907             /* Update vectorial force */
1908             fix1             = _mm_add_ps(fix1,tx);
1909             fiy1             = _mm_add_ps(fiy1,ty);
1910             fiz1             = _mm_add_ps(fiz1,tz);
1911
1912             fjx1             = _mm_add_ps(fjx1,tx);
1913             fjy1             = _mm_add_ps(fjy1,ty);
1914             fjz1             = _mm_add_ps(fjz1,tz);
1915             
1916             }
1917
1918             /**************************
1919              * CALCULATE INTERACTIONS *
1920              **************************/
1921
1922             if (gmx_mm_any_lt(rsq12,rcutoff2))
1923             {
1924
1925             r12              = _mm_mul_ps(rsq12,rinv12);
1926
1927             /* EWALD ELECTROSTATICS */
1928
1929             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1930             ewrt             = _mm_mul_ps(r12,ewtabscale);
1931             ewitab           = _mm_cvttps_epi32(ewrt);
1932             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1933             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1934                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1935                                          &ewtabF,&ewtabFn);
1936             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1937             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1938
1939             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1940
1941             fscal            = felec;
1942
1943             fscal            = _mm_and_ps(fscal,cutoff_mask);
1944
1945             /* Calculate temporary vectorial force */
1946             tx               = _mm_mul_ps(fscal,dx12);
1947             ty               = _mm_mul_ps(fscal,dy12);
1948             tz               = _mm_mul_ps(fscal,dz12);
1949
1950             /* Update vectorial force */
1951             fix1             = _mm_add_ps(fix1,tx);
1952             fiy1             = _mm_add_ps(fiy1,ty);
1953             fiz1             = _mm_add_ps(fiz1,tz);
1954
1955             fjx2             = _mm_add_ps(fjx2,tx);
1956             fjy2             = _mm_add_ps(fjy2,ty);
1957             fjz2             = _mm_add_ps(fjz2,tz);
1958             
1959             }
1960
1961             /**************************
1962              * CALCULATE INTERACTIONS *
1963              **************************/
1964
1965             if (gmx_mm_any_lt(rsq20,rcutoff2))
1966             {
1967
1968             r20              = _mm_mul_ps(rsq20,rinv20);
1969
1970             /* EWALD ELECTROSTATICS */
1971
1972             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1973             ewrt             = _mm_mul_ps(r20,ewtabscale);
1974             ewitab           = _mm_cvttps_epi32(ewrt);
1975             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1976             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1977                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1978                                          &ewtabF,&ewtabFn);
1979             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1980             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1981
1982             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1983
1984             fscal            = felec;
1985
1986             fscal            = _mm_and_ps(fscal,cutoff_mask);
1987
1988             /* Calculate temporary vectorial force */
1989             tx               = _mm_mul_ps(fscal,dx20);
1990             ty               = _mm_mul_ps(fscal,dy20);
1991             tz               = _mm_mul_ps(fscal,dz20);
1992
1993             /* Update vectorial force */
1994             fix2             = _mm_add_ps(fix2,tx);
1995             fiy2             = _mm_add_ps(fiy2,ty);
1996             fiz2             = _mm_add_ps(fiz2,tz);
1997
1998             fjx0             = _mm_add_ps(fjx0,tx);
1999             fjy0             = _mm_add_ps(fjy0,ty);
2000             fjz0             = _mm_add_ps(fjz0,tz);
2001             
2002             }
2003
2004             /**************************
2005              * CALCULATE INTERACTIONS *
2006              **************************/
2007
2008             if (gmx_mm_any_lt(rsq21,rcutoff2))
2009             {
2010
2011             r21              = _mm_mul_ps(rsq21,rinv21);
2012
2013             /* EWALD ELECTROSTATICS */
2014
2015             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2016             ewrt             = _mm_mul_ps(r21,ewtabscale);
2017             ewitab           = _mm_cvttps_epi32(ewrt);
2018             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2019             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2020                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2021                                          &ewtabF,&ewtabFn);
2022             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2023             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2024
2025             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2026
2027             fscal            = felec;
2028
2029             fscal            = _mm_and_ps(fscal,cutoff_mask);
2030
2031             /* Calculate temporary vectorial force */
2032             tx               = _mm_mul_ps(fscal,dx21);
2033             ty               = _mm_mul_ps(fscal,dy21);
2034             tz               = _mm_mul_ps(fscal,dz21);
2035
2036             /* Update vectorial force */
2037             fix2             = _mm_add_ps(fix2,tx);
2038             fiy2             = _mm_add_ps(fiy2,ty);
2039             fiz2             = _mm_add_ps(fiz2,tz);
2040
2041             fjx1             = _mm_add_ps(fjx1,tx);
2042             fjy1             = _mm_add_ps(fjy1,ty);
2043             fjz1             = _mm_add_ps(fjz1,tz);
2044             
2045             }
2046
2047             /**************************
2048              * CALCULATE INTERACTIONS *
2049              **************************/
2050
2051             if (gmx_mm_any_lt(rsq22,rcutoff2))
2052             {
2053
2054             r22              = _mm_mul_ps(rsq22,rinv22);
2055
2056             /* EWALD ELECTROSTATICS */
2057
2058             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2059             ewrt             = _mm_mul_ps(r22,ewtabscale);
2060             ewitab           = _mm_cvttps_epi32(ewrt);
2061             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2062             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2063                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2064                                          &ewtabF,&ewtabFn);
2065             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2066             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2067
2068             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2069
2070             fscal            = felec;
2071
2072             fscal            = _mm_and_ps(fscal,cutoff_mask);
2073
2074             /* Calculate temporary vectorial force */
2075             tx               = _mm_mul_ps(fscal,dx22);
2076             ty               = _mm_mul_ps(fscal,dy22);
2077             tz               = _mm_mul_ps(fscal,dz22);
2078
2079             /* Update vectorial force */
2080             fix2             = _mm_add_ps(fix2,tx);
2081             fiy2             = _mm_add_ps(fiy2,ty);
2082             fiz2             = _mm_add_ps(fiz2,tz);
2083
2084             fjx2             = _mm_add_ps(fjx2,tx);
2085             fjy2             = _mm_add_ps(fjy2,ty);
2086             fjz2             = _mm_add_ps(fjz2,tz);
2087             
2088             }
2089
2090             fjptrA             = f+j_coord_offsetA;
2091             fjptrB             = f+j_coord_offsetB;
2092             fjptrC             = f+j_coord_offsetC;
2093             fjptrD             = f+j_coord_offsetD;
2094
2095             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2096                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2097
2098             /* Inner loop uses 358 flops */
2099         }
2100
2101         if(jidx<j_index_end)
2102         {
2103
2104             /* Get j neighbor index, and coordinate index */
2105             jnrlistA         = jjnr[jidx];
2106             jnrlistB         = jjnr[jidx+1];
2107             jnrlistC         = jjnr[jidx+2];
2108             jnrlistD         = jjnr[jidx+3];
2109             /* Sign of each element will be negative for non-real atoms.
2110              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2111              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2112              */
2113             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2114             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2115             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2116             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2117             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2118             j_coord_offsetA  = DIM*jnrA;
2119             j_coord_offsetB  = DIM*jnrB;
2120             j_coord_offsetC  = DIM*jnrC;
2121             j_coord_offsetD  = DIM*jnrD;
2122
2123             /* load j atom coordinates */
2124             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2125                                               x+j_coord_offsetC,x+j_coord_offsetD,
2126                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2127
2128             /* Calculate displacement vector */
2129             dx00             = _mm_sub_ps(ix0,jx0);
2130             dy00             = _mm_sub_ps(iy0,jy0);
2131             dz00             = _mm_sub_ps(iz0,jz0);
2132             dx01             = _mm_sub_ps(ix0,jx1);
2133             dy01             = _mm_sub_ps(iy0,jy1);
2134             dz01             = _mm_sub_ps(iz0,jz1);
2135             dx02             = _mm_sub_ps(ix0,jx2);
2136             dy02             = _mm_sub_ps(iy0,jy2);
2137             dz02             = _mm_sub_ps(iz0,jz2);
2138             dx10             = _mm_sub_ps(ix1,jx0);
2139             dy10             = _mm_sub_ps(iy1,jy0);
2140             dz10             = _mm_sub_ps(iz1,jz0);
2141             dx11             = _mm_sub_ps(ix1,jx1);
2142             dy11             = _mm_sub_ps(iy1,jy1);
2143             dz11             = _mm_sub_ps(iz1,jz1);
2144             dx12             = _mm_sub_ps(ix1,jx2);
2145             dy12             = _mm_sub_ps(iy1,jy2);
2146             dz12             = _mm_sub_ps(iz1,jz2);
2147             dx20             = _mm_sub_ps(ix2,jx0);
2148             dy20             = _mm_sub_ps(iy2,jy0);
2149             dz20             = _mm_sub_ps(iz2,jz0);
2150             dx21             = _mm_sub_ps(ix2,jx1);
2151             dy21             = _mm_sub_ps(iy2,jy1);
2152             dz21             = _mm_sub_ps(iz2,jz1);
2153             dx22             = _mm_sub_ps(ix2,jx2);
2154             dy22             = _mm_sub_ps(iy2,jy2);
2155             dz22             = _mm_sub_ps(iz2,jz2);
2156
2157             /* Calculate squared distance and things based on it */
2158             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2159             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2160             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2161             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2162             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2163             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2164             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2165             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2166             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2167
2168             rinv00           = sse2_invsqrt_f(rsq00);
2169             rinv01           = sse2_invsqrt_f(rsq01);
2170             rinv02           = sse2_invsqrt_f(rsq02);
2171             rinv10           = sse2_invsqrt_f(rsq10);
2172             rinv11           = sse2_invsqrt_f(rsq11);
2173             rinv12           = sse2_invsqrt_f(rsq12);
2174             rinv20           = sse2_invsqrt_f(rsq20);
2175             rinv21           = sse2_invsqrt_f(rsq21);
2176             rinv22           = sse2_invsqrt_f(rsq22);
2177
2178             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2179             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
2180             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
2181             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
2182             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2183             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2184             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
2185             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2186             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2187
2188             fjx0             = _mm_setzero_ps();
2189             fjy0             = _mm_setzero_ps();
2190             fjz0             = _mm_setzero_ps();
2191             fjx1             = _mm_setzero_ps();
2192             fjy1             = _mm_setzero_ps();
2193             fjz1             = _mm_setzero_ps();
2194             fjx2             = _mm_setzero_ps();
2195             fjy2             = _mm_setzero_ps();
2196             fjz2             = _mm_setzero_ps();
2197
2198             /**************************
2199              * CALCULATE INTERACTIONS *
2200              **************************/
2201
2202             if (gmx_mm_any_lt(rsq00,rcutoff2))
2203             {
2204
2205             r00              = _mm_mul_ps(rsq00,rinv00);
2206             r00              = _mm_andnot_ps(dummy_mask,r00);
2207
2208             /* EWALD ELECTROSTATICS */
2209
2210             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2211             ewrt             = _mm_mul_ps(r00,ewtabscale);
2212             ewitab           = _mm_cvttps_epi32(ewrt);
2213             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2214             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2215                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2216                                          &ewtabF,&ewtabFn);
2217             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2218             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2219
2220             /* LENNARD-JONES DISPERSION/REPULSION */
2221
2222             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2223             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2224
2225             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2226
2227             fscal            = _mm_add_ps(felec,fvdw);
2228
2229             fscal            = _mm_and_ps(fscal,cutoff_mask);
2230
2231             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2232
2233             /* Calculate temporary vectorial force */
2234             tx               = _mm_mul_ps(fscal,dx00);
2235             ty               = _mm_mul_ps(fscal,dy00);
2236             tz               = _mm_mul_ps(fscal,dz00);
2237
2238             /* Update vectorial force */
2239             fix0             = _mm_add_ps(fix0,tx);
2240             fiy0             = _mm_add_ps(fiy0,ty);
2241             fiz0             = _mm_add_ps(fiz0,tz);
2242
2243             fjx0             = _mm_add_ps(fjx0,tx);
2244             fjy0             = _mm_add_ps(fjy0,ty);
2245             fjz0             = _mm_add_ps(fjz0,tz);
2246             
2247             }
2248
2249             /**************************
2250              * CALCULATE INTERACTIONS *
2251              **************************/
2252
2253             if (gmx_mm_any_lt(rsq01,rcutoff2))
2254             {
2255
2256             r01              = _mm_mul_ps(rsq01,rinv01);
2257             r01              = _mm_andnot_ps(dummy_mask,r01);
2258
2259             /* EWALD ELECTROSTATICS */
2260
2261             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2262             ewrt             = _mm_mul_ps(r01,ewtabscale);
2263             ewitab           = _mm_cvttps_epi32(ewrt);
2264             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2265             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2266                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2267                                          &ewtabF,&ewtabFn);
2268             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2269             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2270
2271             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
2272
2273             fscal            = felec;
2274
2275             fscal            = _mm_and_ps(fscal,cutoff_mask);
2276
2277             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2278
2279             /* Calculate temporary vectorial force */
2280             tx               = _mm_mul_ps(fscal,dx01);
2281             ty               = _mm_mul_ps(fscal,dy01);
2282             tz               = _mm_mul_ps(fscal,dz01);
2283
2284             /* Update vectorial force */
2285             fix0             = _mm_add_ps(fix0,tx);
2286             fiy0             = _mm_add_ps(fiy0,ty);
2287             fiz0             = _mm_add_ps(fiz0,tz);
2288
2289             fjx1             = _mm_add_ps(fjx1,tx);
2290             fjy1             = _mm_add_ps(fjy1,ty);
2291             fjz1             = _mm_add_ps(fjz1,tz);
2292             
2293             }
2294
2295             /**************************
2296              * CALCULATE INTERACTIONS *
2297              **************************/
2298
2299             if (gmx_mm_any_lt(rsq02,rcutoff2))
2300             {
2301
2302             r02              = _mm_mul_ps(rsq02,rinv02);
2303             r02              = _mm_andnot_ps(dummy_mask,r02);
2304
2305             /* EWALD ELECTROSTATICS */
2306
2307             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2308             ewrt             = _mm_mul_ps(r02,ewtabscale);
2309             ewitab           = _mm_cvttps_epi32(ewrt);
2310             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2311             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2312                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2313                                          &ewtabF,&ewtabFn);
2314             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2315             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2316
2317             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2318
2319             fscal            = felec;
2320
2321             fscal            = _mm_and_ps(fscal,cutoff_mask);
2322
2323             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2324
2325             /* Calculate temporary vectorial force */
2326             tx               = _mm_mul_ps(fscal,dx02);
2327             ty               = _mm_mul_ps(fscal,dy02);
2328             tz               = _mm_mul_ps(fscal,dz02);
2329
2330             /* Update vectorial force */
2331             fix0             = _mm_add_ps(fix0,tx);
2332             fiy0             = _mm_add_ps(fiy0,ty);
2333             fiz0             = _mm_add_ps(fiz0,tz);
2334
2335             fjx2             = _mm_add_ps(fjx2,tx);
2336             fjy2             = _mm_add_ps(fjy2,ty);
2337             fjz2             = _mm_add_ps(fjz2,tz);
2338             
2339             }
2340
2341             /**************************
2342              * CALCULATE INTERACTIONS *
2343              **************************/
2344
2345             if (gmx_mm_any_lt(rsq10,rcutoff2))
2346             {
2347
2348             r10              = _mm_mul_ps(rsq10,rinv10);
2349             r10              = _mm_andnot_ps(dummy_mask,r10);
2350
2351             /* EWALD ELECTROSTATICS */
2352
2353             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2354             ewrt             = _mm_mul_ps(r10,ewtabscale);
2355             ewitab           = _mm_cvttps_epi32(ewrt);
2356             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2357             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2358                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2359                                          &ewtabF,&ewtabFn);
2360             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2361             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2362
2363             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2364
2365             fscal            = felec;
2366
2367             fscal            = _mm_and_ps(fscal,cutoff_mask);
2368
2369             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2370
2371             /* Calculate temporary vectorial force */
2372             tx               = _mm_mul_ps(fscal,dx10);
2373             ty               = _mm_mul_ps(fscal,dy10);
2374             tz               = _mm_mul_ps(fscal,dz10);
2375
2376             /* Update vectorial force */
2377             fix1             = _mm_add_ps(fix1,tx);
2378             fiy1             = _mm_add_ps(fiy1,ty);
2379             fiz1             = _mm_add_ps(fiz1,tz);
2380
2381             fjx0             = _mm_add_ps(fjx0,tx);
2382             fjy0             = _mm_add_ps(fjy0,ty);
2383             fjz0             = _mm_add_ps(fjz0,tz);
2384             
2385             }
2386
2387             /**************************
2388              * CALCULATE INTERACTIONS *
2389              **************************/
2390
2391             if (gmx_mm_any_lt(rsq11,rcutoff2))
2392             {
2393
2394             r11              = _mm_mul_ps(rsq11,rinv11);
2395             r11              = _mm_andnot_ps(dummy_mask,r11);
2396
2397             /* EWALD ELECTROSTATICS */
2398
2399             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2400             ewrt             = _mm_mul_ps(r11,ewtabscale);
2401             ewitab           = _mm_cvttps_epi32(ewrt);
2402             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2403             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2404                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2405                                          &ewtabF,&ewtabFn);
2406             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2407             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2408
2409             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2410
2411             fscal            = felec;
2412
2413             fscal            = _mm_and_ps(fscal,cutoff_mask);
2414
2415             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2416
2417             /* Calculate temporary vectorial force */
2418             tx               = _mm_mul_ps(fscal,dx11);
2419             ty               = _mm_mul_ps(fscal,dy11);
2420             tz               = _mm_mul_ps(fscal,dz11);
2421
2422             /* Update vectorial force */
2423             fix1             = _mm_add_ps(fix1,tx);
2424             fiy1             = _mm_add_ps(fiy1,ty);
2425             fiz1             = _mm_add_ps(fiz1,tz);
2426
2427             fjx1             = _mm_add_ps(fjx1,tx);
2428             fjy1             = _mm_add_ps(fjy1,ty);
2429             fjz1             = _mm_add_ps(fjz1,tz);
2430             
2431             }
2432
2433             /**************************
2434              * CALCULATE INTERACTIONS *
2435              **************************/
2436
2437             if (gmx_mm_any_lt(rsq12,rcutoff2))
2438             {
2439
2440             r12              = _mm_mul_ps(rsq12,rinv12);
2441             r12              = _mm_andnot_ps(dummy_mask,r12);
2442
2443             /* EWALD ELECTROSTATICS */
2444
2445             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2446             ewrt             = _mm_mul_ps(r12,ewtabscale);
2447             ewitab           = _mm_cvttps_epi32(ewrt);
2448             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2449             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2450                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2451                                          &ewtabF,&ewtabFn);
2452             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2453             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2454
2455             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2456
2457             fscal            = felec;
2458
2459             fscal            = _mm_and_ps(fscal,cutoff_mask);
2460
2461             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2462
2463             /* Calculate temporary vectorial force */
2464             tx               = _mm_mul_ps(fscal,dx12);
2465             ty               = _mm_mul_ps(fscal,dy12);
2466             tz               = _mm_mul_ps(fscal,dz12);
2467
2468             /* Update vectorial force */
2469             fix1             = _mm_add_ps(fix1,tx);
2470             fiy1             = _mm_add_ps(fiy1,ty);
2471             fiz1             = _mm_add_ps(fiz1,tz);
2472
2473             fjx2             = _mm_add_ps(fjx2,tx);
2474             fjy2             = _mm_add_ps(fjy2,ty);
2475             fjz2             = _mm_add_ps(fjz2,tz);
2476             
2477             }
2478
2479             /**************************
2480              * CALCULATE INTERACTIONS *
2481              **************************/
2482
2483             if (gmx_mm_any_lt(rsq20,rcutoff2))
2484             {
2485
2486             r20              = _mm_mul_ps(rsq20,rinv20);
2487             r20              = _mm_andnot_ps(dummy_mask,r20);
2488
2489             /* EWALD ELECTROSTATICS */
2490
2491             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2492             ewrt             = _mm_mul_ps(r20,ewtabscale);
2493             ewitab           = _mm_cvttps_epi32(ewrt);
2494             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2495             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2496                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2497                                          &ewtabF,&ewtabFn);
2498             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2499             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2500
2501             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2502
2503             fscal            = felec;
2504
2505             fscal            = _mm_and_ps(fscal,cutoff_mask);
2506
2507             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2508
2509             /* Calculate temporary vectorial force */
2510             tx               = _mm_mul_ps(fscal,dx20);
2511             ty               = _mm_mul_ps(fscal,dy20);
2512             tz               = _mm_mul_ps(fscal,dz20);
2513
2514             /* Update vectorial force */
2515             fix2             = _mm_add_ps(fix2,tx);
2516             fiy2             = _mm_add_ps(fiy2,ty);
2517             fiz2             = _mm_add_ps(fiz2,tz);
2518
2519             fjx0             = _mm_add_ps(fjx0,tx);
2520             fjy0             = _mm_add_ps(fjy0,ty);
2521             fjz0             = _mm_add_ps(fjz0,tz);
2522             
2523             }
2524
2525             /**************************
2526              * CALCULATE INTERACTIONS *
2527              **************************/
2528
2529             if (gmx_mm_any_lt(rsq21,rcutoff2))
2530             {
2531
2532             r21              = _mm_mul_ps(rsq21,rinv21);
2533             r21              = _mm_andnot_ps(dummy_mask,r21);
2534
2535             /* EWALD ELECTROSTATICS */
2536
2537             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2538             ewrt             = _mm_mul_ps(r21,ewtabscale);
2539             ewitab           = _mm_cvttps_epi32(ewrt);
2540             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2541             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2542                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2543                                          &ewtabF,&ewtabFn);
2544             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2545             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2546
2547             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2548
2549             fscal            = felec;
2550
2551             fscal            = _mm_and_ps(fscal,cutoff_mask);
2552
2553             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2554
2555             /* Calculate temporary vectorial force */
2556             tx               = _mm_mul_ps(fscal,dx21);
2557             ty               = _mm_mul_ps(fscal,dy21);
2558             tz               = _mm_mul_ps(fscal,dz21);
2559
2560             /* Update vectorial force */
2561             fix2             = _mm_add_ps(fix2,tx);
2562             fiy2             = _mm_add_ps(fiy2,ty);
2563             fiz2             = _mm_add_ps(fiz2,tz);
2564
2565             fjx1             = _mm_add_ps(fjx1,tx);
2566             fjy1             = _mm_add_ps(fjy1,ty);
2567             fjz1             = _mm_add_ps(fjz1,tz);
2568             
2569             }
2570
2571             /**************************
2572              * CALCULATE INTERACTIONS *
2573              **************************/
2574
2575             if (gmx_mm_any_lt(rsq22,rcutoff2))
2576             {
2577
2578             r22              = _mm_mul_ps(rsq22,rinv22);
2579             r22              = _mm_andnot_ps(dummy_mask,r22);
2580
2581             /* EWALD ELECTROSTATICS */
2582
2583             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2584             ewrt             = _mm_mul_ps(r22,ewtabscale);
2585             ewitab           = _mm_cvttps_epi32(ewrt);
2586             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2587             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2588                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2589                                          &ewtabF,&ewtabFn);
2590             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2591             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2592
2593             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2594
2595             fscal            = felec;
2596
2597             fscal            = _mm_and_ps(fscal,cutoff_mask);
2598
2599             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2600
2601             /* Calculate temporary vectorial force */
2602             tx               = _mm_mul_ps(fscal,dx22);
2603             ty               = _mm_mul_ps(fscal,dy22);
2604             tz               = _mm_mul_ps(fscal,dz22);
2605
2606             /* Update vectorial force */
2607             fix2             = _mm_add_ps(fix2,tx);
2608             fiy2             = _mm_add_ps(fiy2,ty);
2609             fiz2             = _mm_add_ps(fiz2,tz);
2610
2611             fjx2             = _mm_add_ps(fjx2,tx);
2612             fjy2             = _mm_add_ps(fjy2,ty);
2613             fjz2             = _mm_add_ps(fjz2,tz);
2614             
2615             }
2616
2617             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2618             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2619             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2620             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2621
2622             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2623                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2624
2625             /* Inner loop uses 367 flops */
2626         }
2627
2628         /* End of innermost loop */
2629
2630         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2631                                               f+i_coord_offset,fshift+i_shift_offset);
2632
2633         /* Increment number of inner iterations */
2634         inneriter                  += j_index_end - j_index_start;
2635
2636         /* Outer loop uses 18 flops */
2637     }
2638
2639     /* Increment number of outer iterations */
2640     outeriter        += nri;
2641
2642     /* Update outer/inner flops */
2643
2644     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*367);
2645 }