Created SIMD module
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEw_VdwLJ_GeomW3W3_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse2_single.h"
50 #include "kernelutil_x86_sse2_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     int              nvdwtype;
109     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110     int              *vdwtype;
111     real             *vdwparam;
112     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
113     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
114     __m128i          ewitab;
115     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
116     real             *ewtab;
117     __m128           dummy_mask,cutoff_mask;
118     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
119     __m128           one     = _mm_set1_ps(1.0);
120     __m128           two     = _mm_set1_ps(2.0);
121     x                = xx[0];
122     f                = ff[0];
123
124     nri              = nlist->nri;
125     iinr             = nlist->iinr;
126     jindex           = nlist->jindex;
127     jjnr             = nlist->jjnr;
128     shiftidx         = nlist->shift;
129     gid              = nlist->gid;
130     shiftvec         = fr->shift_vec[0];
131     fshift           = fr->fshift[0];
132     facel            = _mm_set1_ps(fr->epsfac);
133     charge           = mdatoms->chargeA;
134     nvdwtype         = fr->ntype;
135     vdwparam         = fr->nbfp;
136     vdwtype          = mdatoms->typeA;
137
138     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
139     ewtab            = fr->ic->tabq_coul_FDV0;
140     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
141     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
142
143     /* Setup water-specific parameters */
144     inr              = nlist->iinr[0];
145     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
146     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
147     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
148     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
149
150     jq0              = _mm_set1_ps(charge[inr+0]);
151     jq1              = _mm_set1_ps(charge[inr+1]);
152     jq2              = _mm_set1_ps(charge[inr+2]);
153     vdwjidx0A        = 2*vdwtype[inr+0];
154     qq00             = _mm_mul_ps(iq0,jq0);
155     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
156     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
157     qq01             = _mm_mul_ps(iq0,jq1);
158     qq02             = _mm_mul_ps(iq0,jq2);
159     qq10             = _mm_mul_ps(iq1,jq0);
160     qq11             = _mm_mul_ps(iq1,jq1);
161     qq12             = _mm_mul_ps(iq1,jq2);
162     qq20             = _mm_mul_ps(iq2,jq0);
163     qq21             = _mm_mul_ps(iq2,jq1);
164     qq22             = _mm_mul_ps(iq2,jq2);
165
166     /* Avoid stupid compiler warnings */
167     jnrA = jnrB = jnrC = jnrD = 0;
168     j_coord_offsetA = 0;
169     j_coord_offsetB = 0;
170     j_coord_offsetC = 0;
171     j_coord_offsetD = 0;
172
173     outeriter        = 0;
174     inneriter        = 0;
175
176     for(iidx=0;iidx<4*DIM;iidx++)
177     {
178         scratch[iidx] = 0.0;
179     }  
180
181     /* Start outer loop over neighborlists */
182     for(iidx=0; iidx<nri; iidx++)
183     {
184         /* Load shift vector for this list */
185         i_shift_offset   = DIM*shiftidx[iidx];
186
187         /* Load limits for loop over neighbors */
188         j_index_start    = jindex[iidx];
189         j_index_end      = jindex[iidx+1];
190
191         /* Get outer coordinate index */
192         inr              = iinr[iidx];
193         i_coord_offset   = DIM*inr;
194
195         /* Load i particle coords and add shift vector */
196         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
197                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
198         
199         fix0             = _mm_setzero_ps();
200         fiy0             = _mm_setzero_ps();
201         fiz0             = _mm_setzero_ps();
202         fix1             = _mm_setzero_ps();
203         fiy1             = _mm_setzero_ps();
204         fiz1             = _mm_setzero_ps();
205         fix2             = _mm_setzero_ps();
206         fiy2             = _mm_setzero_ps();
207         fiz2             = _mm_setzero_ps();
208
209         /* Reset potential sums */
210         velecsum         = _mm_setzero_ps();
211         vvdwsum          = _mm_setzero_ps();
212
213         /* Start inner kernel loop */
214         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
215         {
216
217             /* Get j neighbor index, and coordinate index */
218             jnrA             = jjnr[jidx];
219             jnrB             = jjnr[jidx+1];
220             jnrC             = jjnr[jidx+2];
221             jnrD             = jjnr[jidx+3];
222             j_coord_offsetA  = DIM*jnrA;
223             j_coord_offsetB  = DIM*jnrB;
224             j_coord_offsetC  = DIM*jnrC;
225             j_coord_offsetD  = DIM*jnrD;
226
227             /* load j atom coordinates */
228             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
229                                               x+j_coord_offsetC,x+j_coord_offsetD,
230                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
231
232             /* Calculate displacement vector */
233             dx00             = _mm_sub_ps(ix0,jx0);
234             dy00             = _mm_sub_ps(iy0,jy0);
235             dz00             = _mm_sub_ps(iz0,jz0);
236             dx01             = _mm_sub_ps(ix0,jx1);
237             dy01             = _mm_sub_ps(iy0,jy1);
238             dz01             = _mm_sub_ps(iz0,jz1);
239             dx02             = _mm_sub_ps(ix0,jx2);
240             dy02             = _mm_sub_ps(iy0,jy2);
241             dz02             = _mm_sub_ps(iz0,jz2);
242             dx10             = _mm_sub_ps(ix1,jx0);
243             dy10             = _mm_sub_ps(iy1,jy0);
244             dz10             = _mm_sub_ps(iz1,jz0);
245             dx11             = _mm_sub_ps(ix1,jx1);
246             dy11             = _mm_sub_ps(iy1,jy1);
247             dz11             = _mm_sub_ps(iz1,jz1);
248             dx12             = _mm_sub_ps(ix1,jx2);
249             dy12             = _mm_sub_ps(iy1,jy2);
250             dz12             = _mm_sub_ps(iz1,jz2);
251             dx20             = _mm_sub_ps(ix2,jx0);
252             dy20             = _mm_sub_ps(iy2,jy0);
253             dz20             = _mm_sub_ps(iz2,jz0);
254             dx21             = _mm_sub_ps(ix2,jx1);
255             dy21             = _mm_sub_ps(iy2,jy1);
256             dz21             = _mm_sub_ps(iz2,jz1);
257             dx22             = _mm_sub_ps(ix2,jx2);
258             dy22             = _mm_sub_ps(iy2,jy2);
259             dz22             = _mm_sub_ps(iz2,jz2);
260
261             /* Calculate squared distance and things based on it */
262             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
263             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
264             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
265             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
266             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
267             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
268             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
269             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
270             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
271
272             rinv00           = gmx_mm_invsqrt_ps(rsq00);
273             rinv01           = gmx_mm_invsqrt_ps(rsq01);
274             rinv02           = gmx_mm_invsqrt_ps(rsq02);
275             rinv10           = gmx_mm_invsqrt_ps(rsq10);
276             rinv11           = gmx_mm_invsqrt_ps(rsq11);
277             rinv12           = gmx_mm_invsqrt_ps(rsq12);
278             rinv20           = gmx_mm_invsqrt_ps(rsq20);
279             rinv21           = gmx_mm_invsqrt_ps(rsq21);
280             rinv22           = gmx_mm_invsqrt_ps(rsq22);
281
282             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
283             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
284             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
285             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
286             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
287             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
288             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
289             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
290             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
291
292             fjx0             = _mm_setzero_ps();
293             fjy0             = _mm_setzero_ps();
294             fjz0             = _mm_setzero_ps();
295             fjx1             = _mm_setzero_ps();
296             fjy1             = _mm_setzero_ps();
297             fjz1             = _mm_setzero_ps();
298             fjx2             = _mm_setzero_ps();
299             fjy2             = _mm_setzero_ps();
300             fjz2             = _mm_setzero_ps();
301
302             /**************************
303              * CALCULATE INTERACTIONS *
304              **************************/
305
306             r00              = _mm_mul_ps(rsq00,rinv00);
307
308             /* EWALD ELECTROSTATICS */
309
310             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
311             ewrt             = _mm_mul_ps(r00,ewtabscale);
312             ewitab           = _mm_cvttps_epi32(ewrt);
313             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
314             ewitab           = _mm_slli_epi32(ewitab,2);
315             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
316             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
317             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
318             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
319             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
320             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
321             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
322             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
323             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
324
325             /* LENNARD-JONES DISPERSION/REPULSION */
326
327             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
328             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
329             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
330             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
331             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
332
333             /* Update potential sum for this i atom from the interaction with this j atom. */
334             velecsum         = _mm_add_ps(velecsum,velec);
335             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
336
337             fscal            = _mm_add_ps(felec,fvdw);
338
339             /* Calculate temporary vectorial force */
340             tx               = _mm_mul_ps(fscal,dx00);
341             ty               = _mm_mul_ps(fscal,dy00);
342             tz               = _mm_mul_ps(fscal,dz00);
343
344             /* Update vectorial force */
345             fix0             = _mm_add_ps(fix0,tx);
346             fiy0             = _mm_add_ps(fiy0,ty);
347             fiz0             = _mm_add_ps(fiz0,tz);
348
349             fjx0             = _mm_add_ps(fjx0,tx);
350             fjy0             = _mm_add_ps(fjy0,ty);
351             fjz0             = _mm_add_ps(fjz0,tz);
352             
353             /**************************
354              * CALCULATE INTERACTIONS *
355              **************************/
356
357             r01              = _mm_mul_ps(rsq01,rinv01);
358
359             /* EWALD ELECTROSTATICS */
360
361             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
362             ewrt             = _mm_mul_ps(r01,ewtabscale);
363             ewitab           = _mm_cvttps_epi32(ewrt);
364             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
365             ewitab           = _mm_slli_epi32(ewitab,2);
366             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
367             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
368             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
369             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
370             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
371             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
372             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
373             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
374             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
375
376             /* Update potential sum for this i atom from the interaction with this j atom. */
377             velecsum         = _mm_add_ps(velecsum,velec);
378
379             fscal            = felec;
380
381             /* Calculate temporary vectorial force */
382             tx               = _mm_mul_ps(fscal,dx01);
383             ty               = _mm_mul_ps(fscal,dy01);
384             tz               = _mm_mul_ps(fscal,dz01);
385
386             /* Update vectorial force */
387             fix0             = _mm_add_ps(fix0,tx);
388             fiy0             = _mm_add_ps(fiy0,ty);
389             fiz0             = _mm_add_ps(fiz0,tz);
390
391             fjx1             = _mm_add_ps(fjx1,tx);
392             fjy1             = _mm_add_ps(fjy1,ty);
393             fjz1             = _mm_add_ps(fjz1,tz);
394             
395             /**************************
396              * CALCULATE INTERACTIONS *
397              **************************/
398
399             r02              = _mm_mul_ps(rsq02,rinv02);
400
401             /* EWALD ELECTROSTATICS */
402
403             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
404             ewrt             = _mm_mul_ps(r02,ewtabscale);
405             ewitab           = _mm_cvttps_epi32(ewrt);
406             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
407             ewitab           = _mm_slli_epi32(ewitab,2);
408             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
409             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
410             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
411             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
412             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
413             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
414             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
415             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
416             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
417
418             /* Update potential sum for this i atom from the interaction with this j atom. */
419             velecsum         = _mm_add_ps(velecsum,velec);
420
421             fscal            = felec;
422
423             /* Calculate temporary vectorial force */
424             tx               = _mm_mul_ps(fscal,dx02);
425             ty               = _mm_mul_ps(fscal,dy02);
426             tz               = _mm_mul_ps(fscal,dz02);
427
428             /* Update vectorial force */
429             fix0             = _mm_add_ps(fix0,tx);
430             fiy0             = _mm_add_ps(fiy0,ty);
431             fiz0             = _mm_add_ps(fiz0,tz);
432
433             fjx2             = _mm_add_ps(fjx2,tx);
434             fjy2             = _mm_add_ps(fjy2,ty);
435             fjz2             = _mm_add_ps(fjz2,tz);
436             
437             /**************************
438              * CALCULATE INTERACTIONS *
439              **************************/
440
441             r10              = _mm_mul_ps(rsq10,rinv10);
442
443             /* EWALD ELECTROSTATICS */
444
445             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
446             ewrt             = _mm_mul_ps(r10,ewtabscale);
447             ewitab           = _mm_cvttps_epi32(ewrt);
448             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
449             ewitab           = _mm_slli_epi32(ewitab,2);
450             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
451             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
452             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
453             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
454             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
455             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
456             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
457             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
458             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
459
460             /* Update potential sum for this i atom from the interaction with this j atom. */
461             velecsum         = _mm_add_ps(velecsum,velec);
462
463             fscal            = felec;
464
465             /* Calculate temporary vectorial force */
466             tx               = _mm_mul_ps(fscal,dx10);
467             ty               = _mm_mul_ps(fscal,dy10);
468             tz               = _mm_mul_ps(fscal,dz10);
469
470             /* Update vectorial force */
471             fix1             = _mm_add_ps(fix1,tx);
472             fiy1             = _mm_add_ps(fiy1,ty);
473             fiz1             = _mm_add_ps(fiz1,tz);
474
475             fjx0             = _mm_add_ps(fjx0,tx);
476             fjy0             = _mm_add_ps(fjy0,ty);
477             fjz0             = _mm_add_ps(fjz0,tz);
478             
479             /**************************
480              * CALCULATE INTERACTIONS *
481              **************************/
482
483             r11              = _mm_mul_ps(rsq11,rinv11);
484
485             /* EWALD ELECTROSTATICS */
486
487             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
488             ewrt             = _mm_mul_ps(r11,ewtabscale);
489             ewitab           = _mm_cvttps_epi32(ewrt);
490             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
491             ewitab           = _mm_slli_epi32(ewitab,2);
492             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
493             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
494             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
495             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
496             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
497             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
498             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
499             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
500             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
501
502             /* Update potential sum for this i atom from the interaction with this j atom. */
503             velecsum         = _mm_add_ps(velecsum,velec);
504
505             fscal            = felec;
506
507             /* Calculate temporary vectorial force */
508             tx               = _mm_mul_ps(fscal,dx11);
509             ty               = _mm_mul_ps(fscal,dy11);
510             tz               = _mm_mul_ps(fscal,dz11);
511
512             /* Update vectorial force */
513             fix1             = _mm_add_ps(fix1,tx);
514             fiy1             = _mm_add_ps(fiy1,ty);
515             fiz1             = _mm_add_ps(fiz1,tz);
516
517             fjx1             = _mm_add_ps(fjx1,tx);
518             fjy1             = _mm_add_ps(fjy1,ty);
519             fjz1             = _mm_add_ps(fjz1,tz);
520             
521             /**************************
522              * CALCULATE INTERACTIONS *
523              **************************/
524
525             r12              = _mm_mul_ps(rsq12,rinv12);
526
527             /* EWALD ELECTROSTATICS */
528
529             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
530             ewrt             = _mm_mul_ps(r12,ewtabscale);
531             ewitab           = _mm_cvttps_epi32(ewrt);
532             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
533             ewitab           = _mm_slli_epi32(ewitab,2);
534             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
535             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
536             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
537             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
538             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
539             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
540             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
541             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
542             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
543
544             /* Update potential sum for this i atom from the interaction with this j atom. */
545             velecsum         = _mm_add_ps(velecsum,velec);
546
547             fscal            = felec;
548
549             /* Calculate temporary vectorial force */
550             tx               = _mm_mul_ps(fscal,dx12);
551             ty               = _mm_mul_ps(fscal,dy12);
552             tz               = _mm_mul_ps(fscal,dz12);
553
554             /* Update vectorial force */
555             fix1             = _mm_add_ps(fix1,tx);
556             fiy1             = _mm_add_ps(fiy1,ty);
557             fiz1             = _mm_add_ps(fiz1,tz);
558
559             fjx2             = _mm_add_ps(fjx2,tx);
560             fjy2             = _mm_add_ps(fjy2,ty);
561             fjz2             = _mm_add_ps(fjz2,tz);
562             
563             /**************************
564              * CALCULATE INTERACTIONS *
565              **************************/
566
567             r20              = _mm_mul_ps(rsq20,rinv20);
568
569             /* EWALD ELECTROSTATICS */
570
571             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
572             ewrt             = _mm_mul_ps(r20,ewtabscale);
573             ewitab           = _mm_cvttps_epi32(ewrt);
574             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
575             ewitab           = _mm_slli_epi32(ewitab,2);
576             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
577             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
578             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
579             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
580             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
581             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
582             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
583             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
584             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
585
586             /* Update potential sum for this i atom from the interaction with this j atom. */
587             velecsum         = _mm_add_ps(velecsum,velec);
588
589             fscal            = felec;
590
591             /* Calculate temporary vectorial force */
592             tx               = _mm_mul_ps(fscal,dx20);
593             ty               = _mm_mul_ps(fscal,dy20);
594             tz               = _mm_mul_ps(fscal,dz20);
595
596             /* Update vectorial force */
597             fix2             = _mm_add_ps(fix2,tx);
598             fiy2             = _mm_add_ps(fiy2,ty);
599             fiz2             = _mm_add_ps(fiz2,tz);
600
601             fjx0             = _mm_add_ps(fjx0,tx);
602             fjy0             = _mm_add_ps(fjy0,ty);
603             fjz0             = _mm_add_ps(fjz0,tz);
604             
605             /**************************
606              * CALCULATE INTERACTIONS *
607              **************************/
608
609             r21              = _mm_mul_ps(rsq21,rinv21);
610
611             /* EWALD ELECTROSTATICS */
612
613             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
614             ewrt             = _mm_mul_ps(r21,ewtabscale);
615             ewitab           = _mm_cvttps_epi32(ewrt);
616             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
617             ewitab           = _mm_slli_epi32(ewitab,2);
618             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
619             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
620             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
621             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
622             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
623             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
624             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
625             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
626             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
627
628             /* Update potential sum for this i atom from the interaction with this j atom. */
629             velecsum         = _mm_add_ps(velecsum,velec);
630
631             fscal            = felec;
632
633             /* Calculate temporary vectorial force */
634             tx               = _mm_mul_ps(fscal,dx21);
635             ty               = _mm_mul_ps(fscal,dy21);
636             tz               = _mm_mul_ps(fscal,dz21);
637
638             /* Update vectorial force */
639             fix2             = _mm_add_ps(fix2,tx);
640             fiy2             = _mm_add_ps(fiy2,ty);
641             fiz2             = _mm_add_ps(fiz2,tz);
642
643             fjx1             = _mm_add_ps(fjx1,tx);
644             fjy1             = _mm_add_ps(fjy1,ty);
645             fjz1             = _mm_add_ps(fjz1,tz);
646             
647             /**************************
648              * CALCULATE INTERACTIONS *
649              **************************/
650
651             r22              = _mm_mul_ps(rsq22,rinv22);
652
653             /* EWALD ELECTROSTATICS */
654
655             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
656             ewrt             = _mm_mul_ps(r22,ewtabscale);
657             ewitab           = _mm_cvttps_epi32(ewrt);
658             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
659             ewitab           = _mm_slli_epi32(ewitab,2);
660             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
661             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
662             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
663             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
664             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
665             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
666             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
667             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
668             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
669
670             /* Update potential sum for this i atom from the interaction with this j atom. */
671             velecsum         = _mm_add_ps(velecsum,velec);
672
673             fscal            = felec;
674
675             /* Calculate temporary vectorial force */
676             tx               = _mm_mul_ps(fscal,dx22);
677             ty               = _mm_mul_ps(fscal,dy22);
678             tz               = _mm_mul_ps(fscal,dz22);
679
680             /* Update vectorial force */
681             fix2             = _mm_add_ps(fix2,tx);
682             fiy2             = _mm_add_ps(fiy2,ty);
683             fiz2             = _mm_add_ps(fiz2,tz);
684
685             fjx2             = _mm_add_ps(fjx2,tx);
686             fjy2             = _mm_add_ps(fjy2,ty);
687             fjz2             = _mm_add_ps(fjz2,tz);
688             
689             fjptrA             = f+j_coord_offsetA;
690             fjptrB             = f+j_coord_offsetB;
691             fjptrC             = f+j_coord_offsetC;
692             fjptrD             = f+j_coord_offsetD;
693
694             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
695                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
696
697             /* Inner loop uses 381 flops */
698         }
699
700         if(jidx<j_index_end)
701         {
702
703             /* Get j neighbor index, and coordinate index */
704             jnrlistA         = jjnr[jidx];
705             jnrlistB         = jjnr[jidx+1];
706             jnrlistC         = jjnr[jidx+2];
707             jnrlistD         = jjnr[jidx+3];
708             /* Sign of each element will be negative for non-real atoms.
709              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
710              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
711              */
712             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
713             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
714             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
715             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
716             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
717             j_coord_offsetA  = DIM*jnrA;
718             j_coord_offsetB  = DIM*jnrB;
719             j_coord_offsetC  = DIM*jnrC;
720             j_coord_offsetD  = DIM*jnrD;
721
722             /* load j atom coordinates */
723             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
724                                               x+j_coord_offsetC,x+j_coord_offsetD,
725                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
726
727             /* Calculate displacement vector */
728             dx00             = _mm_sub_ps(ix0,jx0);
729             dy00             = _mm_sub_ps(iy0,jy0);
730             dz00             = _mm_sub_ps(iz0,jz0);
731             dx01             = _mm_sub_ps(ix0,jx1);
732             dy01             = _mm_sub_ps(iy0,jy1);
733             dz01             = _mm_sub_ps(iz0,jz1);
734             dx02             = _mm_sub_ps(ix0,jx2);
735             dy02             = _mm_sub_ps(iy0,jy2);
736             dz02             = _mm_sub_ps(iz0,jz2);
737             dx10             = _mm_sub_ps(ix1,jx0);
738             dy10             = _mm_sub_ps(iy1,jy0);
739             dz10             = _mm_sub_ps(iz1,jz0);
740             dx11             = _mm_sub_ps(ix1,jx1);
741             dy11             = _mm_sub_ps(iy1,jy1);
742             dz11             = _mm_sub_ps(iz1,jz1);
743             dx12             = _mm_sub_ps(ix1,jx2);
744             dy12             = _mm_sub_ps(iy1,jy2);
745             dz12             = _mm_sub_ps(iz1,jz2);
746             dx20             = _mm_sub_ps(ix2,jx0);
747             dy20             = _mm_sub_ps(iy2,jy0);
748             dz20             = _mm_sub_ps(iz2,jz0);
749             dx21             = _mm_sub_ps(ix2,jx1);
750             dy21             = _mm_sub_ps(iy2,jy1);
751             dz21             = _mm_sub_ps(iz2,jz1);
752             dx22             = _mm_sub_ps(ix2,jx2);
753             dy22             = _mm_sub_ps(iy2,jy2);
754             dz22             = _mm_sub_ps(iz2,jz2);
755
756             /* Calculate squared distance and things based on it */
757             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
758             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
759             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
760             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
761             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
762             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
763             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
764             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
765             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
766
767             rinv00           = gmx_mm_invsqrt_ps(rsq00);
768             rinv01           = gmx_mm_invsqrt_ps(rsq01);
769             rinv02           = gmx_mm_invsqrt_ps(rsq02);
770             rinv10           = gmx_mm_invsqrt_ps(rsq10);
771             rinv11           = gmx_mm_invsqrt_ps(rsq11);
772             rinv12           = gmx_mm_invsqrt_ps(rsq12);
773             rinv20           = gmx_mm_invsqrt_ps(rsq20);
774             rinv21           = gmx_mm_invsqrt_ps(rsq21);
775             rinv22           = gmx_mm_invsqrt_ps(rsq22);
776
777             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
778             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
779             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
780             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
781             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
782             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
783             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
784             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
785             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
786
787             fjx0             = _mm_setzero_ps();
788             fjy0             = _mm_setzero_ps();
789             fjz0             = _mm_setzero_ps();
790             fjx1             = _mm_setzero_ps();
791             fjy1             = _mm_setzero_ps();
792             fjz1             = _mm_setzero_ps();
793             fjx2             = _mm_setzero_ps();
794             fjy2             = _mm_setzero_ps();
795             fjz2             = _mm_setzero_ps();
796
797             /**************************
798              * CALCULATE INTERACTIONS *
799              **************************/
800
801             r00              = _mm_mul_ps(rsq00,rinv00);
802             r00              = _mm_andnot_ps(dummy_mask,r00);
803
804             /* EWALD ELECTROSTATICS */
805
806             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
807             ewrt             = _mm_mul_ps(r00,ewtabscale);
808             ewitab           = _mm_cvttps_epi32(ewrt);
809             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
810             ewitab           = _mm_slli_epi32(ewitab,2);
811             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
812             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
813             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
814             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
815             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
816             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
817             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
818             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
819             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
820
821             /* LENNARD-JONES DISPERSION/REPULSION */
822
823             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
824             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
825             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
826             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
827             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
828
829             /* Update potential sum for this i atom from the interaction with this j atom. */
830             velec            = _mm_andnot_ps(dummy_mask,velec);
831             velecsum         = _mm_add_ps(velecsum,velec);
832             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
833             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
834
835             fscal            = _mm_add_ps(felec,fvdw);
836
837             fscal            = _mm_andnot_ps(dummy_mask,fscal);
838
839             /* Calculate temporary vectorial force */
840             tx               = _mm_mul_ps(fscal,dx00);
841             ty               = _mm_mul_ps(fscal,dy00);
842             tz               = _mm_mul_ps(fscal,dz00);
843
844             /* Update vectorial force */
845             fix0             = _mm_add_ps(fix0,tx);
846             fiy0             = _mm_add_ps(fiy0,ty);
847             fiz0             = _mm_add_ps(fiz0,tz);
848
849             fjx0             = _mm_add_ps(fjx0,tx);
850             fjy0             = _mm_add_ps(fjy0,ty);
851             fjz0             = _mm_add_ps(fjz0,tz);
852             
853             /**************************
854              * CALCULATE INTERACTIONS *
855              **************************/
856
857             r01              = _mm_mul_ps(rsq01,rinv01);
858             r01              = _mm_andnot_ps(dummy_mask,r01);
859
860             /* EWALD ELECTROSTATICS */
861
862             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
863             ewrt             = _mm_mul_ps(r01,ewtabscale);
864             ewitab           = _mm_cvttps_epi32(ewrt);
865             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
866             ewitab           = _mm_slli_epi32(ewitab,2);
867             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
868             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
869             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
870             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
871             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
872             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
873             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
874             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
875             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
876
877             /* Update potential sum for this i atom from the interaction with this j atom. */
878             velec            = _mm_andnot_ps(dummy_mask,velec);
879             velecsum         = _mm_add_ps(velecsum,velec);
880
881             fscal            = felec;
882
883             fscal            = _mm_andnot_ps(dummy_mask,fscal);
884
885             /* Calculate temporary vectorial force */
886             tx               = _mm_mul_ps(fscal,dx01);
887             ty               = _mm_mul_ps(fscal,dy01);
888             tz               = _mm_mul_ps(fscal,dz01);
889
890             /* Update vectorial force */
891             fix0             = _mm_add_ps(fix0,tx);
892             fiy0             = _mm_add_ps(fiy0,ty);
893             fiz0             = _mm_add_ps(fiz0,tz);
894
895             fjx1             = _mm_add_ps(fjx1,tx);
896             fjy1             = _mm_add_ps(fjy1,ty);
897             fjz1             = _mm_add_ps(fjz1,tz);
898             
899             /**************************
900              * CALCULATE INTERACTIONS *
901              **************************/
902
903             r02              = _mm_mul_ps(rsq02,rinv02);
904             r02              = _mm_andnot_ps(dummy_mask,r02);
905
906             /* EWALD ELECTROSTATICS */
907
908             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
909             ewrt             = _mm_mul_ps(r02,ewtabscale);
910             ewitab           = _mm_cvttps_epi32(ewrt);
911             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
912             ewitab           = _mm_slli_epi32(ewitab,2);
913             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
914             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
915             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
916             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
917             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
918             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
919             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
920             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
921             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
922
923             /* Update potential sum for this i atom from the interaction with this j atom. */
924             velec            = _mm_andnot_ps(dummy_mask,velec);
925             velecsum         = _mm_add_ps(velecsum,velec);
926
927             fscal            = felec;
928
929             fscal            = _mm_andnot_ps(dummy_mask,fscal);
930
931             /* Calculate temporary vectorial force */
932             tx               = _mm_mul_ps(fscal,dx02);
933             ty               = _mm_mul_ps(fscal,dy02);
934             tz               = _mm_mul_ps(fscal,dz02);
935
936             /* Update vectorial force */
937             fix0             = _mm_add_ps(fix0,tx);
938             fiy0             = _mm_add_ps(fiy0,ty);
939             fiz0             = _mm_add_ps(fiz0,tz);
940
941             fjx2             = _mm_add_ps(fjx2,tx);
942             fjy2             = _mm_add_ps(fjy2,ty);
943             fjz2             = _mm_add_ps(fjz2,tz);
944             
945             /**************************
946              * CALCULATE INTERACTIONS *
947              **************************/
948
949             r10              = _mm_mul_ps(rsq10,rinv10);
950             r10              = _mm_andnot_ps(dummy_mask,r10);
951
952             /* EWALD ELECTROSTATICS */
953
954             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
955             ewrt             = _mm_mul_ps(r10,ewtabscale);
956             ewitab           = _mm_cvttps_epi32(ewrt);
957             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
958             ewitab           = _mm_slli_epi32(ewitab,2);
959             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
960             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
961             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
962             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
963             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
964             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
965             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
966             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
967             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
968
969             /* Update potential sum for this i atom from the interaction with this j atom. */
970             velec            = _mm_andnot_ps(dummy_mask,velec);
971             velecsum         = _mm_add_ps(velecsum,velec);
972
973             fscal            = felec;
974
975             fscal            = _mm_andnot_ps(dummy_mask,fscal);
976
977             /* Calculate temporary vectorial force */
978             tx               = _mm_mul_ps(fscal,dx10);
979             ty               = _mm_mul_ps(fscal,dy10);
980             tz               = _mm_mul_ps(fscal,dz10);
981
982             /* Update vectorial force */
983             fix1             = _mm_add_ps(fix1,tx);
984             fiy1             = _mm_add_ps(fiy1,ty);
985             fiz1             = _mm_add_ps(fiz1,tz);
986
987             fjx0             = _mm_add_ps(fjx0,tx);
988             fjy0             = _mm_add_ps(fjy0,ty);
989             fjz0             = _mm_add_ps(fjz0,tz);
990             
991             /**************************
992              * CALCULATE INTERACTIONS *
993              **************************/
994
995             r11              = _mm_mul_ps(rsq11,rinv11);
996             r11              = _mm_andnot_ps(dummy_mask,r11);
997
998             /* EWALD ELECTROSTATICS */
999
1000             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1001             ewrt             = _mm_mul_ps(r11,ewtabscale);
1002             ewitab           = _mm_cvttps_epi32(ewrt);
1003             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1004             ewitab           = _mm_slli_epi32(ewitab,2);
1005             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1006             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1007             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1008             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1009             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1010             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1011             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1012             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1013             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1014
1015             /* Update potential sum for this i atom from the interaction with this j atom. */
1016             velec            = _mm_andnot_ps(dummy_mask,velec);
1017             velecsum         = _mm_add_ps(velecsum,velec);
1018
1019             fscal            = felec;
1020
1021             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1022
1023             /* Calculate temporary vectorial force */
1024             tx               = _mm_mul_ps(fscal,dx11);
1025             ty               = _mm_mul_ps(fscal,dy11);
1026             tz               = _mm_mul_ps(fscal,dz11);
1027
1028             /* Update vectorial force */
1029             fix1             = _mm_add_ps(fix1,tx);
1030             fiy1             = _mm_add_ps(fiy1,ty);
1031             fiz1             = _mm_add_ps(fiz1,tz);
1032
1033             fjx1             = _mm_add_ps(fjx1,tx);
1034             fjy1             = _mm_add_ps(fjy1,ty);
1035             fjz1             = _mm_add_ps(fjz1,tz);
1036             
1037             /**************************
1038              * CALCULATE INTERACTIONS *
1039              **************************/
1040
1041             r12              = _mm_mul_ps(rsq12,rinv12);
1042             r12              = _mm_andnot_ps(dummy_mask,r12);
1043
1044             /* EWALD ELECTROSTATICS */
1045
1046             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1047             ewrt             = _mm_mul_ps(r12,ewtabscale);
1048             ewitab           = _mm_cvttps_epi32(ewrt);
1049             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1050             ewitab           = _mm_slli_epi32(ewitab,2);
1051             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1052             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1053             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1054             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1055             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1056             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1057             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1058             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1059             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1060
1061             /* Update potential sum for this i atom from the interaction with this j atom. */
1062             velec            = _mm_andnot_ps(dummy_mask,velec);
1063             velecsum         = _mm_add_ps(velecsum,velec);
1064
1065             fscal            = felec;
1066
1067             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1068
1069             /* Calculate temporary vectorial force */
1070             tx               = _mm_mul_ps(fscal,dx12);
1071             ty               = _mm_mul_ps(fscal,dy12);
1072             tz               = _mm_mul_ps(fscal,dz12);
1073
1074             /* Update vectorial force */
1075             fix1             = _mm_add_ps(fix1,tx);
1076             fiy1             = _mm_add_ps(fiy1,ty);
1077             fiz1             = _mm_add_ps(fiz1,tz);
1078
1079             fjx2             = _mm_add_ps(fjx2,tx);
1080             fjy2             = _mm_add_ps(fjy2,ty);
1081             fjz2             = _mm_add_ps(fjz2,tz);
1082             
1083             /**************************
1084              * CALCULATE INTERACTIONS *
1085              **************************/
1086
1087             r20              = _mm_mul_ps(rsq20,rinv20);
1088             r20              = _mm_andnot_ps(dummy_mask,r20);
1089
1090             /* EWALD ELECTROSTATICS */
1091
1092             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1093             ewrt             = _mm_mul_ps(r20,ewtabscale);
1094             ewitab           = _mm_cvttps_epi32(ewrt);
1095             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1096             ewitab           = _mm_slli_epi32(ewitab,2);
1097             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1098             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1099             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1100             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1101             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1102             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1103             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1104             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1105             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1106
1107             /* Update potential sum for this i atom from the interaction with this j atom. */
1108             velec            = _mm_andnot_ps(dummy_mask,velec);
1109             velecsum         = _mm_add_ps(velecsum,velec);
1110
1111             fscal            = felec;
1112
1113             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1114
1115             /* Calculate temporary vectorial force */
1116             tx               = _mm_mul_ps(fscal,dx20);
1117             ty               = _mm_mul_ps(fscal,dy20);
1118             tz               = _mm_mul_ps(fscal,dz20);
1119
1120             /* Update vectorial force */
1121             fix2             = _mm_add_ps(fix2,tx);
1122             fiy2             = _mm_add_ps(fiy2,ty);
1123             fiz2             = _mm_add_ps(fiz2,tz);
1124
1125             fjx0             = _mm_add_ps(fjx0,tx);
1126             fjy0             = _mm_add_ps(fjy0,ty);
1127             fjz0             = _mm_add_ps(fjz0,tz);
1128             
1129             /**************************
1130              * CALCULATE INTERACTIONS *
1131              **************************/
1132
1133             r21              = _mm_mul_ps(rsq21,rinv21);
1134             r21              = _mm_andnot_ps(dummy_mask,r21);
1135
1136             /* EWALD ELECTROSTATICS */
1137
1138             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1139             ewrt             = _mm_mul_ps(r21,ewtabscale);
1140             ewitab           = _mm_cvttps_epi32(ewrt);
1141             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1142             ewitab           = _mm_slli_epi32(ewitab,2);
1143             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1144             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1145             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1146             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1147             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1148             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1149             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1150             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1151             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1152
1153             /* Update potential sum for this i atom from the interaction with this j atom. */
1154             velec            = _mm_andnot_ps(dummy_mask,velec);
1155             velecsum         = _mm_add_ps(velecsum,velec);
1156
1157             fscal            = felec;
1158
1159             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1160
1161             /* Calculate temporary vectorial force */
1162             tx               = _mm_mul_ps(fscal,dx21);
1163             ty               = _mm_mul_ps(fscal,dy21);
1164             tz               = _mm_mul_ps(fscal,dz21);
1165
1166             /* Update vectorial force */
1167             fix2             = _mm_add_ps(fix2,tx);
1168             fiy2             = _mm_add_ps(fiy2,ty);
1169             fiz2             = _mm_add_ps(fiz2,tz);
1170
1171             fjx1             = _mm_add_ps(fjx1,tx);
1172             fjy1             = _mm_add_ps(fjy1,ty);
1173             fjz1             = _mm_add_ps(fjz1,tz);
1174             
1175             /**************************
1176              * CALCULATE INTERACTIONS *
1177              **************************/
1178
1179             r22              = _mm_mul_ps(rsq22,rinv22);
1180             r22              = _mm_andnot_ps(dummy_mask,r22);
1181
1182             /* EWALD ELECTROSTATICS */
1183
1184             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1185             ewrt             = _mm_mul_ps(r22,ewtabscale);
1186             ewitab           = _mm_cvttps_epi32(ewrt);
1187             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1188             ewitab           = _mm_slli_epi32(ewitab,2);
1189             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1190             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1191             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1192             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1193             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1194             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1195             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1196             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1197             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1198
1199             /* Update potential sum for this i atom from the interaction with this j atom. */
1200             velec            = _mm_andnot_ps(dummy_mask,velec);
1201             velecsum         = _mm_add_ps(velecsum,velec);
1202
1203             fscal            = felec;
1204
1205             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1206
1207             /* Calculate temporary vectorial force */
1208             tx               = _mm_mul_ps(fscal,dx22);
1209             ty               = _mm_mul_ps(fscal,dy22);
1210             tz               = _mm_mul_ps(fscal,dz22);
1211
1212             /* Update vectorial force */
1213             fix2             = _mm_add_ps(fix2,tx);
1214             fiy2             = _mm_add_ps(fiy2,ty);
1215             fiz2             = _mm_add_ps(fiz2,tz);
1216
1217             fjx2             = _mm_add_ps(fjx2,tx);
1218             fjy2             = _mm_add_ps(fjy2,ty);
1219             fjz2             = _mm_add_ps(fjz2,tz);
1220             
1221             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1222             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1223             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1224             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1225
1226             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1227                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1228
1229             /* Inner loop uses 390 flops */
1230         }
1231
1232         /* End of innermost loop */
1233
1234         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1235                                               f+i_coord_offset,fshift+i_shift_offset);
1236
1237         ggid                        = gid[iidx];
1238         /* Update potential energies */
1239         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1240         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1241
1242         /* Increment number of inner iterations */
1243         inneriter                  += j_index_end - j_index_start;
1244
1245         /* Outer loop uses 20 flops */
1246     }
1247
1248     /* Increment number of outer iterations */
1249     outeriter        += nri;
1250
1251     /* Update outer/inner flops */
1252
1253     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*390);
1254 }
1255 /*
1256  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single
1257  * Electrostatics interaction: Ewald
1258  * VdW interaction:            LennardJones
1259  * Geometry:                   Water3-Water3
1260  * Calculate force/pot:        Force
1261  */
1262 void
1263 nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single
1264                     (t_nblist                    * gmx_restrict       nlist,
1265                      rvec                        * gmx_restrict          xx,
1266                      rvec                        * gmx_restrict          ff,
1267                      t_forcerec                  * gmx_restrict          fr,
1268                      t_mdatoms                   * gmx_restrict     mdatoms,
1269                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1270                      t_nrnb                      * gmx_restrict        nrnb)
1271 {
1272     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1273      * just 0 for non-waters.
1274      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1275      * jnr indices corresponding to data put in the four positions in the SIMD register.
1276      */
1277     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1278     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1279     int              jnrA,jnrB,jnrC,jnrD;
1280     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1281     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1282     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1283     real             rcutoff_scalar;
1284     real             *shiftvec,*fshift,*x,*f;
1285     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1286     real             scratch[4*DIM];
1287     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1288     int              vdwioffset0;
1289     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1290     int              vdwioffset1;
1291     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1292     int              vdwioffset2;
1293     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1294     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1295     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1296     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1297     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1298     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1299     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1300     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1301     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1302     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1303     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1304     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1305     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1306     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1307     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1308     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1309     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1310     real             *charge;
1311     int              nvdwtype;
1312     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1313     int              *vdwtype;
1314     real             *vdwparam;
1315     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1316     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1317     __m128i          ewitab;
1318     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1319     real             *ewtab;
1320     __m128           dummy_mask,cutoff_mask;
1321     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1322     __m128           one     = _mm_set1_ps(1.0);
1323     __m128           two     = _mm_set1_ps(2.0);
1324     x                = xx[0];
1325     f                = ff[0];
1326
1327     nri              = nlist->nri;
1328     iinr             = nlist->iinr;
1329     jindex           = nlist->jindex;
1330     jjnr             = nlist->jjnr;
1331     shiftidx         = nlist->shift;
1332     gid              = nlist->gid;
1333     shiftvec         = fr->shift_vec[0];
1334     fshift           = fr->fshift[0];
1335     facel            = _mm_set1_ps(fr->epsfac);
1336     charge           = mdatoms->chargeA;
1337     nvdwtype         = fr->ntype;
1338     vdwparam         = fr->nbfp;
1339     vdwtype          = mdatoms->typeA;
1340
1341     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1342     ewtab            = fr->ic->tabq_coul_F;
1343     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1344     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1345
1346     /* Setup water-specific parameters */
1347     inr              = nlist->iinr[0];
1348     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1349     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1350     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1351     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1352
1353     jq0              = _mm_set1_ps(charge[inr+0]);
1354     jq1              = _mm_set1_ps(charge[inr+1]);
1355     jq2              = _mm_set1_ps(charge[inr+2]);
1356     vdwjidx0A        = 2*vdwtype[inr+0];
1357     qq00             = _mm_mul_ps(iq0,jq0);
1358     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1359     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1360     qq01             = _mm_mul_ps(iq0,jq1);
1361     qq02             = _mm_mul_ps(iq0,jq2);
1362     qq10             = _mm_mul_ps(iq1,jq0);
1363     qq11             = _mm_mul_ps(iq1,jq1);
1364     qq12             = _mm_mul_ps(iq1,jq2);
1365     qq20             = _mm_mul_ps(iq2,jq0);
1366     qq21             = _mm_mul_ps(iq2,jq1);
1367     qq22             = _mm_mul_ps(iq2,jq2);
1368
1369     /* Avoid stupid compiler warnings */
1370     jnrA = jnrB = jnrC = jnrD = 0;
1371     j_coord_offsetA = 0;
1372     j_coord_offsetB = 0;
1373     j_coord_offsetC = 0;
1374     j_coord_offsetD = 0;
1375
1376     outeriter        = 0;
1377     inneriter        = 0;
1378
1379     for(iidx=0;iidx<4*DIM;iidx++)
1380     {
1381         scratch[iidx] = 0.0;
1382     }  
1383
1384     /* Start outer loop over neighborlists */
1385     for(iidx=0; iidx<nri; iidx++)
1386     {
1387         /* Load shift vector for this list */
1388         i_shift_offset   = DIM*shiftidx[iidx];
1389
1390         /* Load limits for loop over neighbors */
1391         j_index_start    = jindex[iidx];
1392         j_index_end      = jindex[iidx+1];
1393
1394         /* Get outer coordinate index */
1395         inr              = iinr[iidx];
1396         i_coord_offset   = DIM*inr;
1397
1398         /* Load i particle coords and add shift vector */
1399         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1400                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1401         
1402         fix0             = _mm_setzero_ps();
1403         fiy0             = _mm_setzero_ps();
1404         fiz0             = _mm_setzero_ps();
1405         fix1             = _mm_setzero_ps();
1406         fiy1             = _mm_setzero_ps();
1407         fiz1             = _mm_setzero_ps();
1408         fix2             = _mm_setzero_ps();
1409         fiy2             = _mm_setzero_ps();
1410         fiz2             = _mm_setzero_ps();
1411
1412         /* Start inner kernel loop */
1413         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1414         {
1415
1416             /* Get j neighbor index, and coordinate index */
1417             jnrA             = jjnr[jidx];
1418             jnrB             = jjnr[jidx+1];
1419             jnrC             = jjnr[jidx+2];
1420             jnrD             = jjnr[jidx+3];
1421             j_coord_offsetA  = DIM*jnrA;
1422             j_coord_offsetB  = DIM*jnrB;
1423             j_coord_offsetC  = DIM*jnrC;
1424             j_coord_offsetD  = DIM*jnrD;
1425
1426             /* load j atom coordinates */
1427             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1428                                               x+j_coord_offsetC,x+j_coord_offsetD,
1429                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1430
1431             /* Calculate displacement vector */
1432             dx00             = _mm_sub_ps(ix0,jx0);
1433             dy00             = _mm_sub_ps(iy0,jy0);
1434             dz00             = _mm_sub_ps(iz0,jz0);
1435             dx01             = _mm_sub_ps(ix0,jx1);
1436             dy01             = _mm_sub_ps(iy0,jy1);
1437             dz01             = _mm_sub_ps(iz0,jz1);
1438             dx02             = _mm_sub_ps(ix0,jx2);
1439             dy02             = _mm_sub_ps(iy0,jy2);
1440             dz02             = _mm_sub_ps(iz0,jz2);
1441             dx10             = _mm_sub_ps(ix1,jx0);
1442             dy10             = _mm_sub_ps(iy1,jy0);
1443             dz10             = _mm_sub_ps(iz1,jz0);
1444             dx11             = _mm_sub_ps(ix1,jx1);
1445             dy11             = _mm_sub_ps(iy1,jy1);
1446             dz11             = _mm_sub_ps(iz1,jz1);
1447             dx12             = _mm_sub_ps(ix1,jx2);
1448             dy12             = _mm_sub_ps(iy1,jy2);
1449             dz12             = _mm_sub_ps(iz1,jz2);
1450             dx20             = _mm_sub_ps(ix2,jx0);
1451             dy20             = _mm_sub_ps(iy2,jy0);
1452             dz20             = _mm_sub_ps(iz2,jz0);
1453             dx21             = _mm_sub_ps(ix2,jx1);
1454             dy21             = _mm_sub_ps(iy2,jy1);
1455             dz21             = _mm_sub_ps(iz2,jz1);
1456             dx22             = _mm_sub_ps(ix2,jx2);
1457             dy22             = _mm_sub_ps(iy2,jy2);
1458             dz22             = _mm_sub_ps(iz2,jz2);
1459
1460             /* Calculate squared distance and things based on it */
1461             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1462             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1463             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1464             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1465             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1466             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1467             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1468             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1469             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1470
1471             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1472             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1473             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1474             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1475             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1476             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1477             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1478             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1479             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1480
1481             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1482             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1483             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1484             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1485             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1486             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1487             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1488             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1489             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1490
1491             fjx0             = _mm_setzero_ps();
1492             fjy0             = _mm_setzero_ps();
1493             fjz0             = _mm_setzero_ps();
1494             fjx1             = _mm_setzero_ps();
1495             fjy1             = _mm_setzero_ps();
1496             fjz1             = _mm_setzero_ps();
1497             fjx2             = _mm_setzero_ps();
1498             fjy2             = _mm_setzero_ps();
1499             fjz2             = _mm_setzero_ps();
1500
1501             /**************************
1502              * CALCULATE INTERACTIONS *
1503              **************************/
1504
1505             r00              = _mm_mul_ps(rsq00,rinv00);
1506
1507             /* EWALD ELECTROSTATICS */
1508
1509             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1510             ewrt             = _mm_mul_ps(r00,ewtabscale);
1511             ewitab           = _mm_cvttps_epi32(ewrt);
1512             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1513             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1514                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1515                                          &ewtabF,&ewtabFn);
1516             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1517             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1518
1519             /* LENNARD-JONES DISPERSION/REPULSION */
1520
1521             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1522             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1523
1524             fscal            = _mm_add_ps(felec,fvdw);
1525
1526             /* Calculate temporary vectorial force */
1527             tx               = _mm_mul_ps(fscal,dx00);
1528             ty               = _mm_mul_ps(fscal,dy00);
1529             tz               = _mm_mul_ps(fscal,dz00);
1530
1531             /* Update vectorial force */
1532             fix0             = _mm_add_ps(fix0,tx);
1533             fiy0             = _mm_add_ps(fiy0,ty);
1534             fiz0             = _mm_add_ps(fiz0,tz);
1535
1536             fjx0             = _mm_add_ps(fjx0,tx);
1537             fjy0             = _mm_add_ps(fjy0,ty);
1538             fjz0             = _mm_add_ps(fjz0,tz);
1539             
1540             /**************************
1541              * CALCULATE INTERACTIONS *
1542              **************************/
1543
1544             r01              = _mm_mul_ps(rsq01,rinv01);
1545
1546             /* EWALD ELECTROSTATICS */
1547
1548             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1549             ewrt             = _mm_mul_ps(r01,ewtabscale);
1550             ewitab           = _mm_cvttps_epi32(ewrt);
1551             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1552             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1553                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1554                                          &ewtabF,&ewtabFn);
1555             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1556             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1557
1558             fscal            = felec;
1559
1560             /* Calculate temporary vectorial force */
1561             tx               = _mm_mul_ps(fscal,dx01);
1562             ty               = _mm_mul_ps(fscal,dy01);
1563             tz               = _mm_mul_ps(fscal,dz01);
1564
1565             /* Update vectorial force */
1566             fix0             = _mm_add_ps(fix0,tx);
1567             fiy0             = _mm_add_ps(fiy0,ty);
1568             fiz0             = _mm_add_ps(fiz0,tz);
1569
1570             fjx1             = _mm_add_ps(fjx1,tx);
1571             fjy1             = _mm_add_ps(fjy1,ty);
1572             fjz1             = _mm_add_ps(fjz1,tz);
1573             
1574             /**************************
1575              * CALCULATE INTERACTIONS *
1576              **************************/
1577
1578             r02              = _mm_mul_ps(rsq02,rinv02);
1579
1580             /* EWALD ELECTROSTATICS */
1581
1582             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1583             ewrt             = _mm_mul_ps(r02,ewtabscale);
1584             ewitab           = _mm_cvttps_epi32(ewrt);
1585             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1586             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1587                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1588                                          &ewtabF,&ewtabFn);
1589             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1590             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1591
1592             fscal            = felec;
1593
1594             /* Calculate temporary vectorial force */
1595             tx               = _mm_mul_ps(fscal,dx02);
1596             ty               = _mm_mul_ps(fscal,dy02);
1597             tz               = _mm_mul_ps(fscal,dz02);
1598
1599             /* Update vectorial force */
1600             fix0             = _mm_add_ps(fix0,tx);
1601             fiy0             = _mm_add_ps(fiy0,ty);
1602             fiz0             = _mm_add_ps(fiz0,tz);
1603
1604             fjx2             = _mm_add_ps(fjx2,tx);
1605             fjy2             = _mm_add_ps(fjy2,ty);
1606             fjz2             = _mm_add_ps(fjz2,tz);
1607             
1608             /**************************
1609              * CALCULATE INTERACTIONS *
1610              **************************/
1611
1612             r10              = _mm_mul_ps(rsq10,rinv10);
1613
1614             /* EWALD ELECTROSTATICS */
1615
1616             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1617             ewrt             = _mm_mul_ps(r10,ewtabscale);
1618             ewitab           = _mm_cvttps_epi32(ewrt);
1619             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1620             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1621                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1622                                          &ewtabF,&ewtabFn);
1623             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1624             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1625
1626             fscal            = felec;
1627
1628             /* Calculate temporary vectorial force */
1629             tx               = _mm_mul_ps(fscal,dx10);
1630             ty               = _mm_mul_ps(fscal,dy10);
1631             tz               = _mm_mul_ps(fscal,dz10);
1632
1633             /* Update vectorial force */
1634             fix1             = _mm_add_ps(fix1,tx);
1635             fiy1             = _mm_add_ps(fiy1,ty);
1636             fiz1             = _mm_add_ps(fiz1,tz);
1637
1638             fjx0             = _mm_add_ps(fjx0,tx);
1639             fjy0             = _mm_add_ps(fjy0,ty);
1640             fjz0             = _mm_add_ps(fjz0,tz);
1641             
1642             /**************************
1643              * CALCULATE INTERACTIONS *
1644              **************************/
1645
1646             r11              = _mm_mul_ps(rsq11,rinv11);
1647
1648             /* EWALD ELECTROSTATICS */
1649
1650             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1651             ewrt             = _mm_mul_ps(r11,ewtabscale);
1652             ewitab           = _mm_cvttps_epi32(ewrt);
1653             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1654             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1655                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1656                                          &ewtabF,&ewtabFn);
1657             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1658             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1659
1660             fscal            = felec;
1661
1662             /* Calculate temporary vectorial force */
1663             tx               = _mm_mul_ps(fscal,dx11);
1664             ty               = _mm_mul_ps(fscal,dy11);
1665             tz               = _mm_mul_ps(fscal,dz11);
1666
1667             /* Update vectorial force */
1668             fix1             = _mm_add_ps(fix1,tx);
1669             fiy1             = _mm_add_ps(fiy1,ty);
1670             fiz1             = _mm_add_ps(fiz1,tz);
1671
1672             fjx1             = _mm_add_ps(fjx1,tx);
1673             fjy1             = _mm_add_ps(fjy1,ty);
1674             fjz1             = _mm_add_ps(fjz1,tz);
1675             
1676             /**************************
1677              * CALCULATE INTERACTIONS *
1678              **************************/
1679
1680             r12              = _mm_mul_ps(rsq12,rinv12);
1681
1682             /* EWALD ELECTROSTATICS */
1683
1684             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1685             ewrt             = _mm_mul_ps(r12,ewtabscale);
1686             ewitab           = _mm_cvttps_epi32(ewrt);
1687             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1688             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1689                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1690                                          &ewtabF,&ewtabFn);
1691             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1692             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1693
1694             fscal            = felec;
1695
1696             /* Calculate temporary vectorial force */
1697             tx               = _mm_mul_ps(fscal,dx12);
1698             ty               = _mm_mul_ps(fscal,dy12);
1699             tz               = _mm_mul_ps(fscal,dz12);
1700
1701             /* Update vectorial force */
1702             fix1             = _mm_add_ps(fix1,tx);
1703             fiy1             = _mm_add_ps(fiy1,ty);
1704             fiz1             = _mm_add_ps(fiz1,tz);
1705
1706             fjx2             = _mm_add_ps(fjx2,tx);
1707             fjy2             = _mm_add_ps(fjy2,ty);
1708             fjz2             = _mm_add_ps(fjz2,tz);
1709             
1710             /**************************
1711              * CALCULATE INTERACTIONS *
1712              **************************/
1713
1714             r20              = _mm_mul_ps(rsq20,rinv20);
1715
1716             /* EWALD ELECTROSTATICS */
1717
1718             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1719             ewrt             = _mm_mul_ps(r20,ewtabscale);
1720             ewitab           = _mm_cvttps_epi32(ewrt);
1721             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1722             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1723                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1724                                          &ewtabF,&ewtabFn);
1725             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1726             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1727
1728             fscal            = felec;
1729
1730             /* Calculate temporary vectorial force */
1731             tx               = _mm_mul_ps(fscal,dx20);
1732             ty               = _mm_mul_ps(fscal,dy20);
1733             tz               = _mm_mul_ps(fscal,dz20);
1734
1735             /* Update vectorial force */
1736             fix2             = _mm_add_ps(fix2,tx);
1737             fiy2             = _mm_add_ps(fiy2,ty);
1738             fiz2             = _mm_add_ps(fiz2,tz);
1739
1740             fjx0             = _mm_add_ps(fjx0,tx);
1741             fjy0             = _mm_add_ps(fjy0,ty);
1742             fjz0             = _mm_add_ps(fjz0,tz);
1743             
1744             /**************************
1745              * CALCULATE INTERACTIONS *
1746              **************************/
1747
1748             r21              = _mm_mul_ps(rsq21,rinv21);
1749
1750             /* EWALD ELECTROSTATICS */
1751
1752             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1753             ewrt             = _mm_mul_ps(r21,ewtabscale);
1754             ewitab           = _mm_cvttps_epi32(ewrt);
1755             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1756             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1757                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1758                                          &ewtabF,&ewtabFn);
1759             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1760             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1761
1762             fscal            = felec;
1763
1764             /* Calculate temporary vectorial force */
1765             tx               = _mm_mul_ps(fscal,dx21);
1766             ty               = _mm_mul_ps(fscal,dy21);
1767             tz               = _mm_mul_ps(fscal,dz21);
1768
1769             /* Update vectorial force */
1770             fix2             = _mm_add_ps(fix2,tx);
1771             fiy2             = _mm_add_ps(fiy2,ty);
1772             fiz2             = _mm_add_ps(fiz2,tz);
1773
1774             fjx1             = _mm_add_ps(fjx1,tx);
1775             fjy1             = _mm_add_ps(fjy1,ty);
1776             fjz1             = _mm_add_ps(fjz1,tz);
1777             
1778             /**************************
1779              * CALCULATE INTERACTIONS *
1780              **************************/
1781
1782             r22              = _mm_mul_ps(rsq22,rinv22);
1783
1784             /* EWALD ELECTROSTATICS */
1785
1786             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1787             ewrt             = _mm_mul_ps(r22,ewtabscale);
1788             ewitab           = _mm_cvttps_epi32(ewrt);
1789             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1790             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1791                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1792                                          &ewtabF,&ewtabFn);
1793             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1794             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1795
1796             fscal            = felec;
1797
1798             /* Calculate temporary vectorial force */
1799             tx               = _mm_mul_ps(fscal,dx22);
1800             ty               = _mm_mul_ps(fscal,dy22);
1801             tz               = _mm_mul_ps(fscal,dz22);
1802
1803             /* Update vectorial force */
1804             fix2             = _mm_add_ps(fix2,tx);
1805             fiy2             = _mm_add_ps(fiy2,ty);
1806             fiz2             = _mm_add_ps(fiz2,tz);
1807
1808             fjx2             = _mm_add_ps(fjx2,tx);
1809             fjy2             = _mm_add_ps(fjy2,ty);
1810             fjz2             = _mm_add_ps(fjz2,tz);
1811             
1812             fjptrA             = f+j_coord_offsetA;
1813             fjptrB             = f+j_coord_offsetB;
1814             fjptrC             = f+j_coord_offsetC;
1815             fjptrD             = f+j_coord_offsetD;
1816
1817             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1818                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1819
1820             /* Inner loop uses 331 flops */
1821         }
1822
1823         if(jidx<j_index_end)
1824         {
1825
1826             /* Get j neighbor index, and coordinate index */
1827             jnrlistA         = jjnr[jidx];
1828             jnrlistB         = jjnr[jidx+1];
1829             jnrlistC         = jjnr[jidx+2];
1830             jnrlistD         = jjnr[jidx+3];
1831             /* Sign of each element will be negative for non-real atoms.
1832              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1833              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1834              */
1835             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1836             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1837             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1838             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1839             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1840             j_coord_offsetA  = DIM*jnrA;
1841             j_coord_offsetB  = DIM*jnrB;
1842             j_coord_offsetC  = DIM*jnrC;
1843             j_coord_offsetD  = DIM*jnrD;
1844
1845             /* load j atom coordinates */
1846             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1847                                               x+j_coord_offsetC,x+j_coord_offsetD,
1848                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1849
1850             /* Calculate displacement vector */
1851             dx00             = _mm_sub_ps(ix0,jx0);
1852             dy00             = _mm_sub_ps(iy0,jy0);
1853             dz00             = _mm_sub_ps(iz0,jz0);
1854             dx01             = _mm_sub_ps(ix0,jx1);
1855             dy01             = _mm_sub_ps(iy0,jy1);
1856             dz01             = _mm_sub_ps(iz0,jz1);
1857             dx02             = _mm_sub_ps(ix0,jx2);
1858             dy02             = _mm_sub_ps(iy0,jy2);
1859             dz02             = _mm_sub_ps(iz0,jz2);
1860             dx10             = _mm_sub_ps(ix1,jx0);
1861             dy10             = _mm_sub_ps(iy1,jy0);
1862             dz10             = _mm_sub_ps(iz1,jz0);
1863             dx11             = _mm_sub_ps(ix1,jx1);
1864             dy11             = _mm_sub_ps(iy1,jy1);
1865             dz11             = _mm_sub_ps(iz1,jz1);
1866             dx12             = _mm_sub_ps(ix1,jx2);
1867             dy12             = _mm_sub_ps(iy1,jy2);
1868             dz12             = _mm_sub_ps(iz1,jz2);
1869             dx20             = _mm_sub_ps(ix2,jx0);
1870             dy20             = _mm_sub_ps(iy2,jy0);
1871             dz20             = _mm_sub_ps(iz2,jz0);
1872             dx21             = _mm_sub_ps(ix2,jx1);
1873             dy21             = _mm_sub_ps(iy2,jy1);
1874             dz21             = _mm_sub_ps(iz2,jz1);
1875             dx22             = _mm_sub_ps(ix2,jx2);
1876             dy22             = _mm_sub_ps(iy2,jy2);
1877             dz22             = _mm_sub_ps(iz2,jz2);
1878
1879             /* Calculate squared distance and things based on it */
1880             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1881             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1882             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1883             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1884             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1885             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1886             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1887             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1888             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1889
1890             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1891             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1892             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1893             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1894             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1895             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1896             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1897             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1898             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1899
1900             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1901             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1902             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1903             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1904             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1905             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1906             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1907             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1908             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1909
1910             fjx0             = _mm_setzero_ps();
1911             fjy0             = _mm_setzero_ps();
1912             fjz0             = _mm_setzero_ps();
1913             fjx1             = _mm_setzero_ps();
1914             fjy1             = _mm_setzero_ps();
1915             fjz1             = _mm_setzero_ps();
1916             fjx2             = _mm_setzero_ps();
1917             fjy2             = _mm_setzero_ps();
1918             fjz2             = _mm_setzero_ps();
1919
1920             /**************************
1921              * CALCULATE INTERACTIONS *
1922              **************************/
1923
1924             r00              = _mm_mul_ps(rsq00,rinv00);
1925             r00              = _mm_andnot_ps(dummy_mask,r00);
1926
1927             /* EWALD ELECTROSTATICS */
1928
1929             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1930             ewrt             = _mm_mul_ps(r00,ewtabscale);
1931             ewitab           = _mm_cvttps_epi32(ewrt);
1932             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1933             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1934                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1935                                          &ewtabF,&ewtabFn);
1936             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1937             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1938
1939             /* LENNARD-JONES DISPERSION/REPULSION */
1940
1941             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1942             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1943
1944             fscal            = _mm_add_ps(felec,fvdw);
1945
1946             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1947
1948             /* Calculate temporary vectorial force */
1949             tx               = _mm_mul_ps(fscal,dx00);
1950             ty               = _mm_mul_ps(fscal,dy00);
1951             tz               = _mm_mul_ps(fscal,dz00);
1952
1953             /* Update vectorial force */
1954             fix0             = _mm_add_ps(fix0,tx);
1955             fiy0             = _mm_add_ps(fiy0,ty);
1956             fiz0             = _mm_add_ps(fiz0,tz);
1957
1958             fjx0             = _mm_add_ps(fjx0,tx);
1959             fjy0             = _mm_add_ps(fjy0,ty);
1960             fjz0             = _mm_add_ps(fjz0,tz);
1961             
1962             /**************************
1963              * CALCULATE INTERACTIONS *
1964              **************************/
1965
1966             r01              = _mm_mul_ps(rsq01,rinv01);
1967             r01              = _mm_andnot_ps(dummy_mask,r01);
1968
1969             /* EWALD ELECTROSTATICS */
1970
1971             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1972             ewrt             = _mm_mul_ps(r01,ewtabscale);
1973             ewitab           = _mm_cvttps_epi32(ewrt);
1974             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1975             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1976                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1977                                          &ewtabF,&ewtabFn);
1978             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1979             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1980
1981             fscal            = felec;
1982
1983             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1984
1985             /* Calculate temporary vectorial force */
1986             tx               = _mm_mul_ps(fscal,dx01);
1987             ty               = _mm_mul_ps(fscal,dy01);
1988             tz               = _mm_mul_ps(fscal,dz01);
1989
1990             /* Update vectorial force */
1991             fix0             = _mm_add_ps(fix0,tx);
1992             fiy0             = _mm_add_ps(fiy0,ty);
1993             fiz0             = _mm_add_ps(fiz0,tz);
1994
1995             fjx1             = _mm_add_ps(fjx1,tx);
1996             fjy1             = _mm_add_ps(fjy1,ty);
1997             fjz1             = _mm_add_ps(fjz1,tz);
1998             
1999             /**************************
2000              * CALCULATE INTERACTIONS *
2001              **************************/
2002
2003             r02              = _mm_mul_ps(rsq02,rinv02);
2004             r02              = _mm_andnot_ps(dummy_mask,r02);
2005
2006             /* EWALD ELECTROSTATICS */
2007
2008             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2009             ewrt             = _mm_mul_ps(r02,ewtabscale);
2010             ewitab           = _mm_cvttps_epi32(ewrt);
2011             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2012             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2013                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2014                                          &ewtabF,&ewtabFn);
2015             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2016             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2017
2018             fscal            = felec;
2019
2020             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2021
2022             /* Calculate temporary vectorial force */
2023             tx               = _mm_mul_ps(fscal,dx02);
2024             ty               = _mm_mul_ps(fscal,dy02);
2025             tz               = _mm_mul_ps(fscal,dz02);
2026
2027             /* Update vectorial force */
2028             fix0             = _mm_add_ps(fix0,tx);
2029             fiy0             = _mm_add_ps(fiy0,ty);
2030             fiz0             = _mm_add_ps(fiz0,tz);
2031
2032             fjx2             = _mm_add_ps(fjx2,tx);
2033             fjy2             = _mm_add_ps(fjy2,ty);
2034             fjz2             = _mm_add_ps(fjz2,tz);
2035             
2036             /**************************
2037              * CALCULATE INTERACTIONS *
2038              **************************/
2039
2040             r10              = _mm_mul_ps(rsq10,rinv10);
2041             r10              = _mm_andnot_ps(dummy_mask,r10);
2042
2043             /* EWALD ELECTROSTATICS */
2044
2045             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2046             ewrt             = _mm_mul_ps(r10,ewtabscale);
2047             ewitab           = _mm_cvttps_epi32(ewrt);
2048             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2049             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2050                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2051                                          &ewtabF,&ewtabFn);
2052             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2053             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2054
2055             fscal            = felec;
2056
2057             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2058
2059             /* Calculate temporary vectorial force */
2060             tx               = _mm_mul_ps(fscal,dx10);
2061             ty               = _mm_mul_ps(fscal,dy10);
2062             tz               = _mm_mul_ps(fscal,dz10);
2063
2064             /* Update vectorial force */
2065             fix1             = _mm_add_ps(fix1,tx);
2066             fiy1             = _mm_add_ps(fiy1,ty);
2067             fiz1             = _mm_add_ps(fiz1,tz);
2068
2069             fjx0             = _mm_add_ps(fjx0,tx);
2070             fjy0             = _mm_add_ps(fjy0,ty);
2071             fjz0             = _mm_add_ps(fjz0,tz);
2072             
2073             /**************************
2074              * CALCULATE INTERACTIONS *
2075              **************************/
2076
2077             r11              = _mm_mul_ps(rsq11,rinv11);
2078             r11              = _mm_andnot_ps(dummy_mask,r11);
2079
2080             /* EWALD ELECTROSTATICS */
2081
2082             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2083             ewrt             = _mm_mul_ps(r11,ewtabscale);
2084             ewitab           = _mm_cvttps_epi32(ewrt);
2085             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2086             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2087                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2088                                          &ewtabF,&ewtabFn);
2089             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2090             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2091
2092             fscal            = felec;
2093
2094             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2095
2096             /* Calculate temporary vectorial force */
2097             tx               = _mm_mul_ps(fscal,dx11);
2098             ty               = _mm_mul_ps(fscal,dy11);
2099             tz               = _mm_mul_ps(fscal,dz11);
2100
2101             /* Update vectorial force */
2102             fix1             = _mm_add_ps(fix1,tx);
2103             fiy1             = _mm_add_ps(fiy1,ty);
2104             fiz1             = _mm_add_ps(fiz1,tz);
2105
2106             fjx1             = _mm_add_ps(fjx1,tx);
2107             fjy1             = _mm_add_ps(fjy1,ty);
2108             fjz1             = _mm_add_ps(fjz1,tz);
2109             
2110             /**************************
2111              * CALCULATE INTERACTIONS *
2112              **************************/
2113
2114             r12              = _mm_mul_ps(rsq12,rinv12);
2115             r12              = _mm_andnot_ps(dummy_mask,r12);
2116
2117             /* EWALD ELECTROSTATICS */
2118
2119             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2120             ewrt             = _mm_mul_ps(r12,ewtabscale);
2121             ewitab           = _mm_cvttps_epi32(ewrt);
2122             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2123             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2124                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2125                                          &ewtabF,&ewtabFn);
2126             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2127             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2128
2129             fscal            = felec;
2130
2131             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2132
2133             /* Calculate temporary vectorial force */
2134             tx               = _mm_mul_ps(fscal,dx12);
2135             ty               = _mm_mul_ps(fscal,dy12);
2136             tz               = _mm_mul_ps(fscal,dz12);
2137
2138             /* Update vectorial force */
2139             fix1             = _mm_add_ps(fix1,tx);
2140             fiy1             = _mm_add_ps(fiy1,ty);
2141             fiz1             = _mm_add_ps(fiz1,tz);
2142
2143             fjx2             = _mm_add_ps(fjx2,tx);
2144             fjy2             = _mm_add_ps(fjy2,ty);
2145             fjz2             = _mm_add_ps(fjz2,tz);
2146             
2147             /**************************
2148              * CALCULATE INTERACTIONS *
2149              **************************/
2150
2151             r20              = _mm_mul_ps(rsq20,rinv20);
2152             r20              = _mm_andnot_ps(dummy_mask,r20);
2153
2154             /* EWALD ELECTROSTATICS */
2155
2156             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2157             ewrt             = _mm_mul_ps(r20,ewtabscale);
2158             ewitab           = _mm_cvttps_epi32(ewrt);
2159             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2160             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2161                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2162                                          &ewtabF,&ewtabFn);
2163             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2164             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2165
2166             fscal            = felec;
2167
2168             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2169
2170             /* Calculate temporary vectorial force */
2171             tx               = _mm_mul_ps(fscal,dx20);
2172             ty               = _mm_mul_ps(fscal,dy20);
2173             tz               = _mm_mul_ps(fscal,dz20);
2174
2175             /* Update vectorial force */
2176             fix2             = _mm_add_ps(fix2,tx);
2177             fiy2             = _mm_add_ps(fiy2,ty);
2178             fiz2             = _mm_add_ps(fiz2,tz);
2179
2180             fjx0             = _mm_add_ps(fjx0,tx);
2181             fjy0             = _mm_add_ps(fjy0,ty);
2182             fjz0             = _mm_add_ps(fjz0,tz);
2183             
2184             /**************************
2185              * CALCULATE INTERACTIONS *
2186              **************************/
2187
2188             r21              = _mm_mul_ps(rsq21,rinv21);
2189             r21              = _mm_andnot_ps(dummy_mask,r21);
2190
2191             /* EWALD ELECTROSTATICS */
2192
2193             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2194             ewrt             = _mm_mul_ps(r21,ewtabscale);
2195             ewitab           = _mm_cvttps_epi32(ewrt);
2196             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2197             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2198                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2199                                          &ewtabF,&ewtabFn);
2200             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2201             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2202
2203             fscal            = felec;
2204
2205             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2206
2207             /* Calculate temporary vectorial force */
2208             tx               = _mm_mul_ps(fscal,dx21);
2209             ty               = _mm_mul_ps(fscal,dy21);
2210             tz               = _mm_mul_ps(fscal,dz21);
2211
2212             /* Update vectorial force */
2213             fix2             = _mm_add_ps(fix2,tx);
2214             fiy2             = _mm_add_ps(fiy2,ty);
2215             fiz2             = _mm_add_ps(fiz2,tz);
2216
2217             fjx1             = _mm_add_ps(fjx1,tx);
2218             fjy1             = _mm_add_ps(fjy1,ty);
2219             fjz1             = _mm_add_ps(fjz1,tz);
2220             
2221             /**************************
2222              * CALCULATE INTERACTIONS *
2223              **************************/
2224
2225             r22              = _mm_mul_ps(rsq22,rinv22);
2226             r22              = _mm_andnot_ps(dummy_mask,r22);
2227
2228             /* EWALD ELECTROSTATICS */
2229
2230             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2231             ewrt             = _mm_mul_ps(r22,ewtabscale);
2232             ewitab           = _mm_cvttps_epi32(ewrt);
2233             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2234             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2235                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2236                                          &ewtabF,&ewtabFn);
2237             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2238             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2239
2240             fscal            = felec;
2241
2242             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2243
2244             /* Calculate temporary vectorial force */
2245             tx               = _mm_mul_ps(fscal,dx22);
2246             ty               = _mm_mul_ps(fscal,dy22);
2247             tz               = _mm_mul_ps(fscal,dz22);
2248
2249             /* Update vectorial force */
2250             fix2             = _mm_add_ps(fix2,tx);
2251             fiy2             = _mm_add_ps(fiy2,ty);
2252             fiz2             = _mm_add_ps(fiz2,tz);
2253
2254             fjx2             = _mm_add_ps(fjx2,tx);
2255             fjy2             = _mm_add_ps(fjy2,ty);
2256             fjz2             = _mm_add_ps(fjz2,tz);
2257             
2258             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2259             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2260             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2261             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2262
2263             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2264                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2265
2266             /* Inner loop uses 340 flops */
2267         }
2268
2269         /* End of innermost loop */
2270
2271         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2272                                               f+i_coord_offset,fshift+i_shift_offset);
2273
2274         /* Increment number of inner iterations */
2275         inneriter                  += j_index_end - j_index_start;
2276
2277         /* Outer loop uses 18 flops */
2278     }
2279
2280     /* Increment number of outer iterations */
2281     outeriter        += nri;
2282
2283     /* Update outer/inner flops */
2284
2285     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*340);
2286 }