Rename remaining GMX_ACCELERATION to GMX_CPU_ACCELERATION
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEw_VdwLJ_GeomW3W3_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             shX,shY,shZ,rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
75     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
77     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
88     real             *charge;
89     int              nvdwtype;
90     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
91     int              *vdwtype;
92     real             *vdwparam;
93     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
94     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
95     __m128i          ewitab;
96     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
97     real             *ewtab;
98     __m128           dummy_mask,cutoff_mask;
99     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
100     __m128           one     = _mm_set1_ps(1.0);
101     __m128           two     = _mm_set1_ps(2.0);
102     x                = xx[0];
103     f                = ff[0];
104
105     nri              = nlist->nri;
106     iinr             = nlist->iinr;
107     jindex           = nlist->jindex;
108     jjnr             = nlist->jjnr;
109     shiftidx         = nlist->shift;
110     gid              = nlist->gid;
111     shiftvec         = fr->shift_vec[0];
112     fshift           = fr->fshift[0];
113     facel            = _mm_set1_ps(fr->epsfac);
114     charge           = mdatoms->chargeA;
115     nvdwtype         = fr->ntype;
116     vdwparam         = fr->nbfp;
117     vdwtype          = mdatoms->typeA;
118
119     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
120     ewtab            = fr->ic->tabq_coul_FDV0;
121     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
122     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
123
124     /* Setup water-specific parameters */
125     inr              = nlist->iinr[0];
126     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
127     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
128     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
129     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
130
131     jq0              = _mm_set1_ps(charge[inr+0]);
132     jq1              = _mm_set1_ps(charge[inr+1]);
133     jq2              = _mm_set1_ps(charge[inr+2]);
134     vdwjidx0A        = 2*vdwtype[inr+0];
135     qq00             = _mm_mul_ps(iq0,jq0);
136     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
137     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
138     qq01             = _mm_mul_ps(iq0,jq1);
139     qq02             = _mm_mul_ps(iq0,jq2);
140     qq10             = _mm_mul_ps(iq1,jq0);
141     qq11             = _mm_mul_ps(iq1,jq1);
142     qq12             = _mm_mul_ps(iq1,jq2);
143     qq20             = _mm_mul_ps(iq2,jq0);
144     qq21             = _mm_mul_ps(iq2,jq1);
145     qq22             = _mm_mul_ps(iq2,jq2);
146
147     /* Avoid stupid compiler warnings */
148     jnrA = jnrB = jnrC = jnrD = 0;
149     j_coord_offsetA = 0;
150     j_coord_offsetB = 0;
151     j_coord_offsetC = 0;
152     j_coord_offsetD = 0;
153
154     outeriter        = 0;
155     inneriter        = 0;
156
157     /* Start outer loop over neighborlists */
158     for(iidx=0; iidx<nri; iidx++)
159     {
160         /* Load shift vector for this list */
161         i_shift_offset   = DIM*shiftidx[iidx];
162         shX              = shiftvec[i_shift_offset+XX];
163         shY              = shiftvec[i_shift_offset+YY];
164         shZ              = shiftvec[i_shift_offset+ZZ];
165
166         /* Load limits for loop over neighbors */
167         j_index_start    = jindex[iidx];
168         j_index_end      = jindex[iidx+1];
169
170         /* Get outer coordinate index */
171         inr              = iinr[iidx];
172         i_coord_offset   = DIM*inr;
173
174         /* Load i particle coords and add shift vector */
175         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
176         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
177         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
178         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
179         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
180         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
181         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
182         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
183         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
184
185         fix0             = _mm_setzero_ps();
186         fiy0             = _mm_setzero_ps();
187         fiz0             = _mm_setzero_ps();
188         fix1             = _mm_setzero_ps();
189         fiy1             = _mm_setzero_ps();
190         fiz1             = _mm_setzero_ps();
191         fix2             = _mm_setzero_ps();
192         fiy2             = _mm_setzero_ps();
193         fiz2             = _mm_setzero_ps();
194
195         /* Reset potential sums */
196         velecsum         = _mm_setzero_ps();
197         vvdwsum          = _mm_setzero_ps();
198
199         /* Start inner kernel loop */
200         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
201         {
202
203             /* Get j neighbor index, and coordinate index */
204             jnrA             = jjnr[jidx];
205             jnrB             = jjnr[jidx+1];
206             jnrC             = jjnr[jidx+2];
207             jnrD             = jjnr[jidx+3];
208
209             j_coord_offsetA  = DIM*jnrA;
210             j_coord_offsetB  = DIM*jnrB;
211             j_coord_offsetC  = DIM*jnrC;
212             j_coord_offsetD  = DIM*jnrD;
213
214             /* load j atom coordinates */
215             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
216                                               x+j_coord_offsetC,x+j_coord_offsetD,
217                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
218
219             /* Calculate displacement vector */
220             dx00             = _mm_sub_ps(ix0,jx0);
221             dy00             = _mm_sub_ps(iy0,jy0);
222             dz00             = _mm_sub_ps(iz0,jz0);
223             dx01             = _mm_sub_ps(ix0,jx1);
224             dy01             = _mm_sub_ps(iy0,jy1);
225             dz01             = _mm_sub_ps(iz0,jz1);
226             dx02             = _mm_sub_ps(ix0,jx2);
227             dy02             = _mm_sub_ps(iy0,jy2);
228             dz02             = _mm_sub_ps(iz0,jz2);
229             dx10             = _mm_sub_ps(ix1,jx0);
230             dy10             = _mm_sub_ps(iy1,jy0);
231             dz10             = _mm_sub_ps(iz1,jz0);
232             dx11             = _mm_sub_ps(ix1,jx1);
233             dy11             = _mm_sub_ps(iy1,jy1);
234             dz11             = _mm_sub_ps(iz1,jz1);
235             dx12             = _mm_sub_ps(ix1,jx2);
236             dy12             = _mm_sub_ps(iy1,jy2);
237             dz12             = _mm_sub_ps(iz1,jz2);
238             dx20             = _mm_sub_ps(ix2,jx0);
239             dy20             = _mm_sub_ps(iy2,jy0);
240             dz20             = _mm_sub_ps(iz2,jz0);
241             dx21             = _mm_sub_ps(ix2,jx1);
242             dy21             = _mm_sub_ps(iy2,jy1);
243             dz21             = _mm_sub_ps(iz2,jz1);
244             dx22             = _mm_sub_ps(ix2,jx2);
245             dy22             = _mm_sub_ps(iy2,jy2);
246             dz22             = _mm_sub_ps(iz2,jz2);
247
248             /* Calculate squared distance and things based on it */
249             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
250             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
251             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
252             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
253             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
254             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
255             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
256             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
257             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
258
259             rinv00           = gmx_mm_invsqrt_ps(rsq00);
260             rinv01           = gmx_mm_invsqrt_ps(rsq01);
261             rinv02           = gmx_mm_invsqrt_ps(rsq02);
262             rinv10           = gmx_mm_invsqrt_ps(rsq10);
263             rinv11           = gmx_mm_invsqrt_ps(rsq11);
264             rinv12           = gmx_mm_invsqrt_ps(rsq12);
265             rinv20           = gmx_mm_invsqrt_ps(rsq20);
266             rinv21           = gmx_mm_invsqrt_ps(rsq21);
267             rinv22           = gmx_mm_invsqrt_ps(rsq22);
268
269             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
270             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
271             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
272             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
273             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
274             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
275             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
276             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
277             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
278
279             fjx0             = _mm_setzero_ps();
280             fjy0             = _mm_setzero_ps();
281             fjz0             = _mm_setzero_ps();
282             fjx1             = _mm_setzero_ps();
283             fjy1             = _mm_setzero_ps();
284             fjz1             = _mm_setzero_ps();
285             fjx2             = _mm_setzero_ps();
286             fjy2             = _mm_setzero_ps();
287             fjz2             = _mm_setzero_ps();
288
289             /**************************
290              * CALCULATE INTERACTIONS *
291              **************************/
292
293             r00              = _mm_mul_ps(rsq00,rinv00);
294
295             /* EWALD ELECTROSTATICS */
296
297             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
298             ewrt             = _mm_mul_ps(r00,ewtabscale);
299             ewitab           = _mm_cvttps_epi32(ewrt);
300             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
301             ewitab           = _mm_slli_epi32(ewitab,2);
302             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
303             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
304             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
305             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
306             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
307             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
308             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
309             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
310             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
311
312             /* LENNARD-JONES DISPERSION/REPULSION */
313
314             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
315             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
316             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
317             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
318             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
319
320             /* Update potential sum for this i atom from the interaction with this j atom. */
321             velecsum         = _mm_add_ps(velecsum,velec);
322             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
323
324             fscal            = _mm_add_ps(felec,fvdw);
325
326             /* Calculate temporary vectorial force */
327             tx               = _mm_mul_ps(fscal,dx00);
328             ty               = _mm_mul_ps(fscal,dy00);
329             tz               = _mm_mul_ps(fscal,dz00);
330
331             /* Update vectorial force */
332             fix0             = _mm_add_ps(fix0,tx);
333             fiy0             = _mm_add_ps(fiy0,ty);
334             fiz0             = _mm_add_ps(fiz0,tz);
335
336             fjx0             = _mm_add_ps(fjx0,tx);
337             fjy0             = _mm_add_ps(fjy0,ty);
338             fjz0             = _mm_add_ps(fjz0,tz);
339
340             /**************************
341              * CALCULATE INTERACTIONS *
342              **************************/
343
344             r01              = _mm_mul_ps(rsq01,rinv01);
345
346             /* EWALD ELECTROSTATICS */
347
348             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
349             ewrt             = _mm_mul_ps(r01,ewtabscale);
350             ewitab           = _mm_cvttps_epi32(ewrt);
351             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
352             ewitab           = _mm_slli_epi32(ewitab,2);
353             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
354             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
355             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
356             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
357             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
358             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
359             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
360             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
361             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
362
363             /* Update potential sum for this i atom from the interaction with this j atom. */
364             velecsum         = _mm_add_ps(velecsum,velec);
365
366             fscal            = felec;
367
368             /* Calculate temporary vectorial force */
369             tx               = _mm_mul_ps(fscal,dx01);
370             ty               = _mm_mul_ps(fscal,dy01);
371             tz               = _mm_mul_ps(fscal,dz01);
372
373             /* Update vectorial force */
374             fix0             = _mm_add_ps(fix0,tx);
375             fiy0             = _mm_add_ps(fiy0,ty);
376             fiz0             = _mm_add_ps(fiz0,tz);
377
378             fjx1             = _mm_add_ps(fjx1,tx);
379             fjy1             = _mm_add_ps(fjy1,ty);
380             fjz1             = _mm_add_ps(fjz1,tz);
381
382             /**************************
383              * CALCULATE INTERACTIONS *
384              **************************/
385
386             r02              = _mm_mul_ps(rsq02,rinv02);
387
388             /* EWALD ELECTROSTATICS */
389
390             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
391             ewrt             = _mm_mul_ps(r02,ewtabscale);
392             ewitab           = _mm_cvttps_epi32(ewrt);
393             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
394             ewitab           = _mm_slli_epi32(ewitab,2);
395             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
396             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
397             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
398             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
399             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
400             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
401             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
402             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
403             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
404
405             /* Update potential sum for this i atom from the interaction with this j atom. */
406             velecsum         = _mm_add_ps(velecsum,velec);
407
408             fscal            = felec;
409
410             /* Calculate temporary vectorial force */
411             tx               = _mm_mul_ps(fscal,dx02);
412             ty               = _mm_mul_ps(fscal,dy02);
413             tz               = _mm_mul_ps(fscal,dz02);
414
415             /* Update vectorial force */
416             fix0             = _mm_add_ps(fix0,tx);
417             fiy0             = _mm_add_ps(fiy0,ty);
418             fiz0             = _mm_add_ps(fiz0,tz);
419
420             fjx2             = _mm_add_ps(fjx2,tx);
421             fjy2             = _mm_add_ps(fjy2,ty);
422             fjz2             = _mm_add_ps(fjz2,tz);
423
424             /**************************
425              * CALCULATE INTERACTIONS *
426              **************************/
427
428             r10              = _mm_mul_ps(rsq10,rinv10);
429
430             /* EWALD ELECTROSTATICS */
431
432             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
433             ewrt             = _mm_mul_ps(r10,ewtabscale);
434             ewitab           = _mm_cvttps_epi32(ewrt);
435             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
436             ewitab           = _mm_slli_epi32(ewitab,2);
437             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
438             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
439             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
440             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
441             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
442             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
443             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
444             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
445             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
446
447             /* Update potential sum for this i atom from the interaction with this j atom. */
448             velecsum         = _mm_add_ps(velecsum,velec);
449
450             fscal            = felec;
451
452             /* Calculate temporary vectorial force */
453             tx               = _mm_mul_ps(fscal,dx10);
454             ty               = _mm_mul_ps(fscal,dy10);
455             tz               = _mm_mul_ps(fscal,dz10);
456
457             /* Update vectorial force */
458             fix1             = _mm_add_ps(fix1,tx);
459             fiy1             = _mm_add_ps(fiy1,ty);
460             fiz1             = _mm_add_ps(fiz1,tz);
461
462             fjx0             = _mm_add_ps(fjx0,tx);
463             fjy0             = _mm_add_ps(fjy0,ty);
464             fjz0             = _mm_add_ps(fjz0,tz);
465
466             /**************************
467              * CALCULATE INTERACTIONS *
468              **************************/
469
470             r11              = _mm_mul_ps(rsq11,rinv11);
471
472             /* EWALD ELECTROSTATICS */
473
474             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
475             ewrt             = _mm_mul_ps(r11,ewtabscale);
476             ewitab           = _mm_cvttps_epi32(ewrt);
477             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
478             ewitab           = _mm_slli_epi32(ewitab,2);
479             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
480             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
481             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
482             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
483             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
484             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
485             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
486             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
487             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
488
489             /* Update potential sum for this i atom from the interaction with this j atom. */
490             velecsum         = _mm_add_ps(velecsum,velec);
491
492             fscal            = felec;
493
494             /* Calculate temporary vectorial force */
495             tx               = _mm_mul_ps(fscal,dx11);
496             ty               = _mm_mul_ps(fscal,dy11);
497             tz               = _mm_mul_ps(fscal,dz11);
498
499             /* Update vectorial force */
500             fix1             = _mm_add_ps(fix1,tx);
501             fiy1             = _mm_add_ps(fiy1,ty);
502             fiz1             = _mm_add_ps(fiz1,tz);
503
504             fjx1             = _mm_add_ps(fjx1,tx);
505             fjy1             = _mm_add_ps(fjy1,ty);
506             fjz1             = _mm_add_ps(fjz1,tz);
507
508             /**************************
509              * CALCULATE INTERACTIONS *
510              **************************/
511
512             r12              = _mm_mul_ps(rsq12,rinv12);
513
514             /* EWALD ELECTROSTATICS */
515
516             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
517             ewrt             = _mm_mul_ps(r12,ewtabscale);
518             ewitab           = _mm_cvttps_epi32(ewrt);
519             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
520             ewitab           = _mm_slli_epi32(ewitab,2);
521             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
522             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
523             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
524             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
525             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
526             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
527             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
528             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
529             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
530
531             /* Update potential sum for this i atom from the interaction with this j atom. */
532             velecsum         = _mm_add_ps(velecsum,velec);
533
534             fscal            = felec;
535
536             /* Calculate temporary vectorial force */
537             tx               = _mm_mul_ps(fscal,dx12);
538             ty               = _mm_mul_ps(fscal,dy12);
539             tz               = _mm_mul_ps(fscal,dz12);
540
541             /* Update vectorial force */
542             fix1             = _mm_add_ps(fix1,tx);
543             fiy1             = _mm_add_ps(fiy1,ty);
544             fiz1             = _mm_add_ps(fiz1,tz);
545
546             fjx2             = _mm_add_ps(fjx2,tx);
547             fjy2             = _mm_add_ps(fjy2,ty);
548             fjz2             = _mm_add_ps(fjz2,tz);
549
550             /**************************
551              * CALCULATE INTERACTIONS *
552              **************************/
553
554             r20              = _mm_mul_ps(rsq20,rinv20);
555
556             /* EWALD ELECTROSTATICS */
557
558             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
559             ewrt             = _mm_mul_ps(r20,ewtabscale);
560             ewitab           = _mm_cvttps_epi32(ewrt);
561             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
562             ewitab           = _mm_slli_epi32(ewitab,2);
563             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
564             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
565             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
566             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
567             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
568             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
569             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
570             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
571             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
572
573             /* Update potential sum for this i atom from the interaction with this j atom. */
574             velecsum         = _mm_add_ps(velecsum,velec);
575
576             fscal            = felec;
577
578             /* Calculate temporary vectorial force */
579             tx               = _mm_mul_ps(fscal,dx20);
580             ty               = _mm_mul_ps(fscal,dy20);
581             tz               = _mm_mul_ps(fscal,dz20);
582
583             /* Update vectorial force */
584             fix2             = _mm_add_ps(fix2,tx);
585             fiy2             = _mm_add_ps(fiy2,ty);
586             fiz2             = _mm_add_ps(fiz2,tz);
587
588             fjx0             = _mm_add_ps(fjx0,tx);
589             fjy0             = _mm_add_ps(fjy0,ty);
590             fjz0             = _mm_add_ps(fjz0,tz);
591
592             /**************************
593              * CALCULATE INTERACTIONS *
594              **************************/
595
596             r21              = _mm_mul_ps(rsq21,rinv21);
597
598             /* EWALD ELECTROSTATICS */
599
600             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
601             ewrt             = _mm_mul_ps(r21,ewtabscale);
602             ewitab           = _mm_cvttps_epi32(ewrt);
603             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
604             ewitab           = _mm_slli_epi32(ewitab,2);
605             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
606             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
607             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
608             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
609             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
610             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
611             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
612             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
613             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
614
615             /* Update potential sum for this i atom from the interaction with this j atom. */
616             velecsum         = _mm_add_ps(velecsum,velec);
617
618             fscal            = felec;
619
620             /* Calculate temporary vectorial force */
621             tx               = _mm_mul_ps(fscal,dx21);
622             ty               = _mm_mul_ps(fscal,dy21);
623             tz               = _mm_mul_ps(fscal,dz21);
624
625             /* Update vectorial force */
626             fix2             = _mm_add_ps(fix2,tx);
627             fiy2             = _mm_add_ps(fiy2,ty);
628             fiz2             = _mm_add_ps(fiz2,tz);
629
630             fjx1             = _mm_add_ps(fjx1,tx);
631             fjy1             = _mm_add_ps(fjy1,ty);
632             fjz1             = _mm_add_ps(fjz1,tz);
633
634             /**************************
635              * CALCULATE INTERACTIONS *
636              **************************/
637
638             r22              = _mm_mul_ps(rsq22,rinv22);
639
640             /* EWALD ELECTROSTATICS */
641
642             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
643             ewrt             = _mm_mul_ps(r22,ewtabscale);
644             ewitab           = _mm_cvttps_epi32(ewrt);
645             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
646             ewitab           = _mm_slli_epi32(ewitab,2);
647             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
648             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
649             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
650             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
651             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
652             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
653             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
654             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
655             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
656
657             /* Update potential sum for this i atom from the interaction with this j atom. */
658             velecsum         = _mm_add_ps(velecsum,velec);
659
660             fscal            = felec;
661
662             /* Calculate temporary vectorial force */
663             tx               = _mm_mul_ps(fscal,dx22);
664             ty               = _mm_mul_ps(fscal,dy22);
665             tz               = _mm_mul_ps(fscal,dz22);
666
667             /* Update vectorial force */
668             fix2             = _mm_add_ps(fix2,tx);
669             fiy2             = _mm_add_ps(fiy2,ty);
670             fiz2             = _mm_add_ps(fiz2,tz);
671
672             fjx2             = _mm_add_ps(fjx2,tx);
673             fjy2             = _mm_add_ps(fjy2,ty);
674             fjz2             = _mm_add_ps(fjz2,tz);
675
676             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
677                                                    f+j_coord_offsetC,f+j_coord_offsetD,
678                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
679
680             /* Inner loop uses 381 flops */
681         }
682
683         if(jidx<j_index_end)
684         {
685
686             /* Get j neighbor index, and coordinate index */
687             jnrA             = jjnr[jidx];
688             jnrB             = jjnr[jidx+1];
689             jnrC             = jjnr[jidx+2];
690             jnrD             = jjnr[jidx+3];
691
692             /* Sign of each element will be negative for non-real atoms.
693              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
694              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
695              */
696             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
697             jnrA       = (jnrA>=0) ? jnrA : 0;
698             jnrB       = (jnrB>=0) ? jnrB : 0;
699             jnrC       = (jnrC>=0) ? jnrC : 0;
700             jnrD       = (jnrD>=0) ? jnrD : 0;
701
702             j_coord_offsetA  = DIM*jnrA;
703             j_coord_offsetB  = DIM*jnrB;
704             j_coord_offsetC  = DIM*jnrC;
705             j_coord_offsetD  = DIM*jnrD;
706
707             /* load j atom coordinates */
708             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
709                                               x+j_coord_offsetC,x+j_coord_offsetD,
710                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
711
712             /* Calculate displacement vector */
713             dx00             = _mm_sub_ps(ix0,jx0);
714             dy00             = _mm_sub_ps(iy0,jy0);
715             dz00             = _mm_sub_ps(iz0,jz0);
716             dx01             = _mm_sub_ps(ix0,jx1);
717             dy01             = _mm_sub_ps(iy0,jy1);
718             dz01             = _mm_sub_ps(iz0,jz1);
719             dx02             = _mm_sub_ps(ix0,jx2);
720             dy02             = _mm_sub_ps(iy0,jy2);
721             dz02             = _mm_sub_ps(iz0,jz2);
722             dx10             = _mm_sub_ps(ix1,jx0);
723             dy10             = _mm_sub_ps(iy1,jy0);
724             dz10             = _mm_sub_ps(iz1,jz0);
725             dx11             = _mm_sub_ps(ix1,jx1);
726             dy11             = _mm_sub_ps(iy1,jy1);
727             dz11             = _mm_sub_ps(iz1,jz1);
728             dx12             = _mm_sub_ps(ix1,jx2);
729             dy12             = _mm_sub_ps(iy1,jy2);
730             dz12             = _mm_sub_ps(iz1,jz2);
731             dx20             = _mm_sub_ps(ix2,jx0);
732             dy20             = _mm_sub_ps(iy2,jy0);
733             dz20             = _mm_sub_ps(iz2,jz0);
734             dx21             = _mm_sub_ps(ix2,jx1);
735             dy21             = _mm_sub_ps(iy2,jy1);
736             dz21             = _mm_sub_ps(iz2,jz1);
737             dx22             = _mm_sub_ps(ix2,jx2);
738             dy22             = _mm_sub_ps(iy2,jy2);
739             dz22             = _mm_sub_ps(iz2,jz2);
740
741             /* Calculate squared distance and things based on it */
742             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
743             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
744             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
745             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
746             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
747             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
748             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
749             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
750             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
751
752             rinv00           = gmx_mm_invsqrt_ps(rsq00);
753             rinv01           = gmx_mm_invsqrt_ps(rsq01);
754             rinv02           = gmx_mm_invsqrt_ps(rsq02);
755             rinv10           = gmx_mm_invsqrt_ps(rsq10);
756             rinv11           = gmx_mm_invsqrt_ps(rsq11);
757             rinv12           = gmx_mm_invsqrt_ps(rsq12);
758             rinv20           = gmx_mm_invsqrt_ps(rsq20);
759             rinv21           = gmx_mm_invsqrt_ps(rsq21);
760             rinv22           = gmx_mm_invsqrt_ps(rsq22);
761
762             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
763             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
764             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
765             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
766             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
767             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
768             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
769             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
770             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
771
772             fjx0             = _mm_setzero_ps();
773             fjy0             = _mm_setzero_ps();
774             fjz0             = _mm_setzero_ps();
775             fjx1             = _mm_setzero_ps();
776             fjy1             = _mm_setzero_ps();
777             fjz1             = _mm_setzero_ps();
778             fjx2             = _mm_setzero_ps();
779             fjy2             = _mm_setzero_ps();
780             fjz2             = _mm_setzero_ps();
781
782             /**************************
783              * CALCULATE INTERACTIONS *
784              **************************/
785
786             r00              = _mm_mul_ps(rsq00,rinv00);
787             r00              = _mm_andnot_ps(dummy_mask,r00);
788
789             /* EWALD ELECTROSTATICS */
790
791             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
792             ewrt             = _mm_mul_ps(r00,ewtabscale);
793             ewitab           = _mm_cvttps_epi32(ewrt);
794             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
795             ewitab           = _mm_slli_epi32(ewitab,2);
796             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
797             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
798             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
799             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
800             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
801             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
802             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
803             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
804             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
805
806             /* LENNARD-JONES DISPERSION/REPULSION */
807
808             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
809             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
810             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
811             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
812             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
813
814             /* Update potential sum for this i atom from the interaction with this j atom. */
815             velec            = _mm_andnot_ps(dummy_mask,velec);
816             velecsum         = _mm_add_ps(velecsum,velec);
817             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
818             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
819
820             fscal            = _mm_add_ps(felec,fvdw);
821
822             fscal            = _mm_andnot_ps(dummy_mask,fscal);
823
824             /* Calculate temporary vectorial force */
825             tx               = _mm_mul_ps(fscal,dx00);
826             ty               = _mm_mul_ps(fscal,dy00);
827             tz               = _mm_mul_ps(fscal,dz00);
828
829             /* Update vectorial force */
830             fix0             = _mm_add_ps(fix0,tx);
831             fiy0             = _mm_add_ps(fiy0,ty);
832             fiz0             = _mm_add_ps(fiz0,tz);
833
834             fjx0             = _mm_add_ps(fjx0,tx);
835             fjy0             = _mm_add_ps(fjy0,ty);
836             fjz0             = _mm_add_ps(fjz0,tz);
837
838             /**************************
839              * CALCULATE INTERACTIONS *
840              **************************/
841
842             r01              = _mm_mul_ps(rsq01,rinv01);
843             r01              = _mm_andnot_ps(dummy_mask,r01);
844
845             /* EWALD ELECTROSTATICS */
846
847             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
848             ewrt             = _mm_mul_ps(r01,ewtabscale);
849             ewitab           = _mm_cvttps_epi32(ewrt);
850             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
851             ewitab           = _mm_slli_epi32(ewitab,2);
852             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
853             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
854             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
855             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
856             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
857             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
858             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
859             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
860             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
861
862             /* Update potential sum for this i atom from the interaction with this j atom. */
863             velec            = _mm_andnot_ps(dummy_mask,velec);
864             velecsum         = _mm_add_ps(velecsum,velec);
865
866             fscal            = felec;
867
868             fscal            = _mm_andnot_ps(dummy_mask,fscal);
869
870             /* Calculate temporary vectorial force */
871             tx               = _mm_mul_ps(fscal,dx01);
872             ty               = _mm_mul_ps(fscal,dy01);
873             tz               = _mm_mul_ps(fscal,dz01);
874
875             /* Update vectorial force */
876             fix0             = _mm_add_ps(fix0,tx);
877             fiy0             = _mm_add_ps(fiy0,ty);
878             fiz0             = _mm_add_ps(fiz0,tz);
879
880             fjx1             = _mm_add_ps(fjx1,tx);
881             fjy1             = _mm_add_ps(fjy1,ty);
882             fjz1             = _mm_add_ps(fjz1,tz);
883
884             /**************************
885              * CALCULATE INTERACTIONS *
886              **************************/
887
888             r02              = _mm_mul_ps(rsq02,rinv02);
889             r02              = _mm_andnot_ps(dummy_mask,r02);
890
891             /* EWALD ELECTROSTATICS */
892
893             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
894             ewrt             = _mm_mul_ps(r02,ewtabscale);
895             ewitab           = _mm_cvttps_epi32(ewrt);
896             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
897             ewitab           = _mm_slli_epi32(ewitab,2);
898             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
899             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
900             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
901             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
902             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
903             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
904             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
905             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
906             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
907
908             /* Update potential sum for this i atom from the interaction with this j atom. */
909             velec            = _mm_andnot_ps(dummy_mask,velec);
910             velecsum         = _mm_add_ps(velecsum,velec);
911
912             fscal            = felec;
913
914             fscal            = _mm_andnot_ps(dummy_mask,fscal);
915
916             /* Calculate temporary vectorial force */
917             tx               = _mm_mul_ps(fscal,dx02);
918             ty               = _mm_mul_ps(fscal,dy02);
919             tz               = _mm_mul_ps(fscal,dz02);
920
921             /* Update vectorial force */
922             fix0             = _mm_add_ps(fix0,tx);
923             fiy0             = _mm_add_ps(fiy0,ty);
924             fiz0             = _mm_add_ps(fiz0,tz);
925
926             fjx2             = _mm_add_ps(fjx2,tx);
927             fjy2             = _mm_add_ps(fjy2,ty);
928             fjz2             = _mm_add_ps(fjz2,tz);
929
930             /**************************
931              * CALCULATE INTERACTIONS *
932              **************************/
933
934             r10              = _mm_mul_ps(rsq10,rinv10);
935             r10              = _mm_andnot_ps(dummy_mask,r10);
936
937             /* EWALD ELECTROSTATICS */
938
939             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
940             ewrt             = _mm_mul_ps(r10,ewtabscale);
941             ewitab           = _mm_cvttps_epi32(ewrt);
942             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
943             ewitab           = _mm_slli_epi32(ewitab,2);
944             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
945             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
946             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
947             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
948             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
949             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
950             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
951             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
952             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
953
954             /* Update potential sum for this i atom from the interaction with this j atom. */
955             velec            = _mm_andnot_ps(dummy_mask,velec);
956             velecsum         = _mm_add_ps(velecsum,velec);
957
958             fscal            = felec;
959
960             fscal            = _mm_andnot_ps(dummy_mask,fscal);
961
962             /* Calculate temporary vectorial force */
963             tx               = _mm_mul_ps(fscal,dx10);
964             ty               = _mm_mul_ps(fscal,dy10);
965             tz               = _mm_mul_ps(fscal,dz10);
966
967             /* Update vectorial force */
968             fix1             = _mm_add_ps(fix1,tx);
969             fiy1             = _mm_add_ps(fiy1,ty);
970             fiz1             = _mm_add_ps(fiz1,tz);
971
972             fjx0             = _mm_add_ps(fjx0,tx);
973             fjy0             = _mm_add_ps(fjy0,ty);
974             fjz0             = _mm_add_ps(fjz0,tz);
975
976             /**************************
977              * CALCULATE INTERACTIONS *
978              **************************/
979
980             r11              = _mm_mul_ps(rsq11,rinv11);
981             r11              = _mm_andnot_ps(dummy_mask,r11);
982
983             /* EWALD ELECTROSTATICS */
984
985             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
986             ewrt             = _mm_mul_ps(r11,ewtabscale);
987             ewitab           = _mm_cvttps_epi32(ewrt);
988             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
989             ewitab           = _mm_slli_epi32(ewitab,2);
990             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
991             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
992             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
993             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
994             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
995             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
996             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
997             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
998             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
999
1000             /* Update potential sum for this i atom from the interaction with this j atom. */
1001             velec            = _mm_andnot_ps(dummy_mask,velec);
1002             velecsum         = _mm_add_ps(velecsum,velec);
1003
1004             fscal            = felec;
1005
1006             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1007
1008             /* Calculate temporary vectorial force */
1009             tx               = _mm_mul_ps(fscal,dx11);
1010             ty               = _mm_mul_ps(fscal,dy11);
1011             tz               = _mm_mul_ps(fscal,dz11);
1012
1013             /* Update vectorial force */
1014             fix1             = _mm_add_ps(fix1,tx);
1015             fiy1             = _mm_add_ps(fiy1,ty);
1016             fiz1             = _mm_add_ps(fiz1,tz);
1017
1018             fjx1             = _mm_add_ps(fjx1,tx);
1019             fjy1             = _mm_add_ps(fjy1,ty);
1020             fjz1             = _mm_add_ps(fjz1,tz);
1021
1022             /**************************
1023              * CALCULATE INTERACTIONS *
1024              **************************/
1025
1026             r12              = _mm_mul_ps(rsq12,rinv12);
1027             r12              = _mm_andnot_ps(dummy_mask,r12);
1028
1029             /* EWALD ELECTROSTATICS */
1030
1031             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1032             ewrt             = _mm_mul_ps(r12,ewtabscale);
1033             ewitab           = _mm_cvttps_epi32(ewrt);
1034             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1035             ewitab           = _mm_slli_epi32(ewitab,2);
1036             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1037             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1038             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1039             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1040             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1041             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1042             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1043             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1044             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1045
1046             /* Update potential sum for this i atom from the interaction with this j atom. */
1047             velec            = _mm_andnot_ps(dummy_mask,velec);
1048             velecsum         = _mm_add_ps(velecsum,velec);
1049
1050             fscal            = felec;
1051
1052             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1053
1054             /* Calculate temporary vectorial force */
1055             tx               = _mm_mul_ps(fscal,dx12);
1056             ty               = _mm_mul_ps(fscal,dy12);
1057             tz               = _mm_mul_ps(fscal,dz12);
1058
1059             /* Update vectorial force */
1060             fix1             = _mm_add_ps(fix1,tx);
1061             fiy1             = _mm_add_ps(fiy1,ty);
1062             fiz1             = _mm_add_ps(fiz1,tz);
1063
1064             fjx2             = _mm_add_ps(fjx2,tx);
1065             fjy2             = _mm_add_ps(fjy2,ty);
1066             fjz2             = _mm_add_ps(fjz2,tz);
1067
1068             /**************************
1069              * CALCULATE INTERACTIONS *
1070              **************************/
1071
1072             r20              = _mm_mul_ps(rsq20,rinv20);
1073             r20              = _mm_andnot_ps(dummy_mask,r20);
1074
1075             /* EWALD ELECTROSTATICS */
1076
1077             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1078             ewrt             = _mm_mul_ps(r20,ewtabscale);
1079             ewitab           = _mm_cvttps_epi32(ewrt);
1080             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1081             ewitab           = _mm_slli_epi32(ewitab,2);
1082             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1083             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1084             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1085             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1086             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1087             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1088             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1089             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1090             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1091
1092             /* Update potential sum for this i atom from the interaction with this j atom. */
1093             velec            = _mm_andnot_ps(dummy_mask,velec);
1094             velecsum         = _mm_add_ps(velecsum,velec);
1095
1096             fscal            = felec;
1097
1098             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1099
1100             /* Calculate temporary vectorial force */
1101             tx               = _mm_mul_ps(fscal,dx20);
1102             ty               = _mm_mul_ps(fscal,dy20);
1103             tz               = _mm_mul_ps(fscal,dz20);
1104
1105             /* Update vectorial force */
1106             fix2             = _mm_add_ps(fix2,tx);
1107             fiy2             = _mm_add_ps(fiy2,ty);
1108             fiz2             = _mm_add_ps(fiz2,tz);
1109
1110             fjx0             = _mm_add_ps(fjx0,tx);
1111             fjy0             = _mm_add_ps(fjy0,ty);
1112             fjz0             = _mm_add_ps(fjz0,tz);
1113
1114             /**************************
1115              * CALCULATE INTERACTIONS *
1116              **************************/
1117
1118             r21              = _mm_mul_ps(rsq21,rinv21);
1119             r21              = _mm_andnot_ps(dummy_mask,r21);
1120
1121             /* EWALD ELECTROSTATICS */
1122
1123             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1124             ewrt             = _mm_mul_ps(r21,ewtabscale);
1125             ewitab           = _mm_cvttps_epi32(ewrt);
1126             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1127             ewitab           = _mm_slli_epi32(ewitab,2);
1128             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1129             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1130             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1131             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1132             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1133             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1134             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1135             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1136             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1137
1138             /* Update potential sum for this i atom from the interaction with this j atom. */
1139             velec            = _mm_andnot_ps(dummy_mask,velec);
1140             velecsum         = _mm_add_ps(velecsum,velec);
1141
1142             fscal            = felec;
1143
1144             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1145
1146             /* Calculate temporary vectorial force */
1147             tx               = _mm_mul_ps(fscal,dx21);
1148             ty               = _mm_mul_ps(fscal,dy21);
1149             tz               = _mm_mul_ps(fscal,dz21);
1150
1151             /* Update vectorial force */
1152             fix2             = _mm_add_ps(fix2,tx);
1153             fiy2             = _mm_add_ps(fiy2,ty);
1154             fiz2             = _mm_add_ps(fiz2,tz);
1155
1156             fjx1             = _mm_add_ps(fjx1,tx);
1157             fjy1             = _mm_add_ps(fjy1,ty);
1158             fjz1             = _mm_add_ps(fjz1,tz);
1159
1160             /**************************
1161              * CALCULATE INTERACTIONS *
1162              **************************/
1163
1164             r22              = _mm_mul_ps(rsq22,rinv22);
1165             r22              = _mm_andnot_ps(dummy_mask,r22);
1166
1167             /* EWALD ELECTROSTATICS */
1168
1169             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1170             ewrt             = _mm_mul_ps(r22,ewtabscale);
1171             ewitab           = _mm_cvttps_epi32(ewrt);
1172             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1173             ewitab           = _mm_slli_epi32(ewitab,2);
1174             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1175             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1176             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1177             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1178             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1179             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1180             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1181             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1182             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1183
1184             /* Update potential sum for this i atom from the interaction with this j atom. */
1185             velec            = _mm_andnot_ps(dummy_mask,velec);
1186             velecsum         = _mm_add_ps(velecsum,velec);
1187
1188             fscal            = felec;
1189
1190             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1191
1192             /* Calculate temporary vectorial force */
1193             tx               = _mm_mul_ps(fscal,dx22);
1194             ty               = _mm_mul_ps(fscal,dy22);
1195             tz               = _mm_mul_ps(fscal,dz22);
1196
1197             /* Update vectorial force */
1198             fix2             = _mm_add_ps(fix2,tx);
1199             fiy2             = _mm_add_ps(fiy2,ty);
1200             fiz2             = _mm_add_ps(fiz2,tz);
1201
1202             fjx2             = _mm_add_ps(fjx2,tx);
1203             fjy2             = _mm_add_ps(fjy2,ty);
1204             fjz2             = _mm_add_ps(fjz2,tz);
1205
1206             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1207                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1208                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1209
1210             /* Inner loop uses 390 flops */
1211         }
1212
1213         /* End of innermost loop */
1214
1215         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1216                                               f+i_coord_offset,fshift+i_shift_offset);
1217
1218         ggid                        = gid[iidx];
1219         /* Update potential energies */
1220         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1221         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1222
1223         /* Increment number of inner iterations */
1224         inneriter                  += j_index_end - j_index_start;
1225
1226         /* Outer loop uses 29 flops */
1227     }
1228
1229     /* Increment number of outer iterations */
1230     outeriter        += nri;
1231
1232     /* Update outer/inner flops */
1233
1234     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*390);
1235 }
1236 /*
1237  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single
1238  * Electrostatics interaction: Ewald
1239  * VdW interaction:            LennardJones
1240  * Geometry:                   Water3-Water3
1241  * Calculate force/pot:        Force
1242  */
1243 void
1244 nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single
1245                     (t_nblist * gmx_restrict                nlist,
1246                      rvec * gmx_restrict                    xx,
1247                      rvec * gmx_restrict                    ff,
1248                      t_forcerec * gmx_restrict              fr,
1249                      t_mdatoms * gmx_restrict               mdatoms,
1250                      nb_kernel_data_t * gmx_restrict        kernel_data,
1251                      t_nrnb * gmx_restrict                  nrnb)
1252 {
1253     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1254      * just 0 for non-waters.
1255      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1256      * jnr indices corresponding to data put in the four positions in the SIMD register.
1257      */
1258     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1259     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1260     int              jnrA,jnrB,jnrC,jnrD;
1261     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1262     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1263     real             shX,shY,shZ,rcutoff_scalar;
1264     real             *shiftvec,*fshift,*x,*f;
1265     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1266     int              vdwioffset0;
1267     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1268     int              vdwioffset1;
1269     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1270     int              vdwioffset2;
1271     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1272     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1273     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1274     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1275     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1276     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1277     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1278     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1279     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1280     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1281     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1282     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1283     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1284     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1285     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1286     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1287     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1288     real             *charge;
1289     int              nvdwtype;
1290     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1291     int              *vdwtype;
1292     real             *vdwparam;
1293     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1294     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1295     __m128i          ewitab;
1296     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1297     real             *ewtab;
1298     __m128           dummy_mask,cutoff_mask;
1299     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1300     __m128           one     = _mm_set1_ps(1.0);
1301     __m128           two     = _mm_set1_ps(2.0);
1302     x                = xx[0];
1303     f                = ff[0];
1304
1305     nri              = nlist->nri;
1306     iinr             = nlist->iinr;
1307     jindex           = nlist->jindex;
1308     jjnr             = nlist->jjnr;
1309     shiftidx         = nlist->shift;
1310     gid              = nlist->gid;
1311     shiftvec         = fr->shift_vec[0];
1312     fshift           = fr->fshift[0];
1313     facel            = _mm_set1_ps(fr->epsfac);
1314     charge           = mdatoms->chargeA;
1315     nvdwtype         = fr->ntype;
1316     vdwparam         = fr->nbfp;
1317     vdwtype          = mdatoms->typeA;
1318
1319     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1320     ewtab            = fr->ic->tabq_coul_F;
1321     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1322     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1323
1324     /* Setup water-specific parameters */
1325     inr              = nlist->iinr[0];
1326     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1327     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1328     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1329     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1330
1331     jq0              = _mm_set1_ps(charge[inr+0]);
1332     jq1              = _mm_set1_ps(charge[inr+1]);
1333     jq2              = _mm_set1_ps(charge[inr+2]);
1334     vdwjidx0A        = 2*vdwtype[inr+0];
1335     qq00             = _mm_mul_ps(iq0,jq0);
1336     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1337     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1338     qq01             = _mm_mul_ps(iq0,jq1);
1339     qq02             = _mm_mul_ps(iq0,jq2);
1340     qq10             = _mm_mul_ps(iq1,jq0);
1341     qq11             = _mm_mul_ps(iq1,jq1);
1342     qq12             = _mm_mul_ps(iq1,jq2);
1343     qq20             = _mm_mul_ps(iq2,jq0);
1344     qq21             = _mm_mul_ps(iq2,jq1);
1345     qq22             = _mm_mul_ps(iq2,jq2);
1346
1347     /* Avoid stupid compiler warnings */
1348     jnrA = jnrB = jnrC = jnrD = 0;
1349     j_coord_offsetA = 0;
1350     j_coord_offsetB = 0;
1351     j_coord_offsetC = 0;
1352     j_coord_offsetD = 0;
1353
1354     outeriter        = 0;
1355     inneriter        = 0;
1356
1357     /* Start outer loop over neighborlists */
1358     for(iidx=0; iidx<nri; iidx++)
1359     {
1360         /* Load shift vector for this list */
1361         i_shift_offset   = DIM*shiftidx[iidx];
1362         shX              = shiftvec[i_shift_offset+XX];
1363         shY              = shiftvec[i_shift_offset+YY];
1364         shZ              = shiftvec[i_shift_offset+ZZ];
1365
1366         /* Load limits for loop over neighbors */
1367         j_index_start    = jindex[iidx];
1368         j_index_end      = jindex[iidx+1];
1369
1370         /* Get outer coordinate index */
1371         inr              = iinr[iidx];
1372         i_coord_offset   = DIM*inr;
1373
1374         /* Load i particle coords and add shift vector */
1375         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1376         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1377         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1378         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1379         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1380         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1381         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1382         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1383         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1384
1385         fix0             = _mm_setzero_ps();
1386         fiy0             = _mm_setzero_ps();
1387         fiz0             = _mm_setzero_ps();
1388         fix1             = _mm_setzero_ps();
1389         fiy1             = _mm_setzero_ps();
1390         fiz1             = _mm_setzero_ps();
1391         fix2             = _mm_setzero_ps();
1392         fiy2             = _mm_setzero_ps();
1393         fiz2             = _mm_setzero_ps();
1394
1395         /* Start inner kernel loop */
1396         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1397         {
1398
1399             /* Get j neighbor index, and coordinate index */
1400             jnrA             = jjnr[jidx];
1401             jnrB             = jjnr[jidx+1];
1402             jnrC             = jjnr[jidx+2];
1403             jnrD             = jjnr[jidx+3];
1404
1405             j_coord_offsetA  = DIM*jnrA;
1406             j_coord_offsetB  = DIM*jnrB;
1407             j_coord_offsetC  = DIM*jnrC;
1408             j_coord_offsetD  = DIM*jnrD;
1409
1410             /* load j atom coordinates */
1411             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1412                                               x+j_coord_offsetC,x+j_coord_offsetD,
1413                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1414
1415             /* Calculate displacement vector */
1416             dx00             = _mm_sub_ps(ix0,jx0);
1417             dy00             = _mm_sub_ps(iy0,jy0);
1418             dz00             = _mm_sub_ps(iz0,jz0);
1419             dx01             = _mm_sub_ps(ix0,jx1);
1420             dy01             = _mm_sub_ps(iy0,jy1);
1421             dz01             = _mm_sub_ps(iz0,jz1);
1422             dx02             = _mm_sub_ps(ix0,jx2);
1423             dy02             = _mm_sub_ps(iy0,jy2);
1424             dz02             = _mm_sub_ps(iz0,jz2);
1425             dx10             = _mm_sub_ps(ix1,jx0);
1426             dy10             = _mm_sub_ps(iy1,jy0);
1427             dz10             = _mm_sub_ps(iz1,jz0);
1428             dx11             = _mm_sub_ps(ix1,jx1);
1429             dy11             = _mm_sub_ps(iy1,jy1);
1430             dz11             = _mm_sub_ps(iz1,jz1);
1431             dx12             = _mm_sub_ps(ix1,jx2);
1432             dy12             = _mm_sub_ps(iy1,jy2);
1433             dz12             = _mm_sub_ps(iz1,jz2);
1434             dx20             = _mm_sub_ps(ix2,jx0);
1435             dy20             = _mm_sub_ps(iy2,jy0);
1436             dz20             = _mm_sub_ps(iz2,jz0);
1437             dx21             = _mm_sub_ps(ix2,jx1);
1438             dy21             = _mm_sub_ps(iy2,jy1);
1439             dz21             = _mm_sub_ps(iz2,jz1);
1440             dx22             = _mm_sub_ps(ix2,jx2);
1441             dy22             = _mm_sub_ps(iy2,jy2);
1442             dz22             = _mm_sub_ps(iz2,jz2);
1443
1444             /* Calculate squared distance and things based on it */
1445             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1446             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1447             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1448             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1449             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1450             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1451             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1452             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1453             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1454
1455             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1456             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1457             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1458             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1459             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1460             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1461             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1462             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1463             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1464
1465             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1466             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1467             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1468             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1469             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1470             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1471             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1472             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1473             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1474
1475             fjx0             = _mm_setzero_ps();
1476             fjy0             = _mm_setzero_ps();
1477             fjz0             = _mm_setzero_ps();
1478             fjx1             = _mm_setzero_ps();
1479             fjy1             = _mm_setzero_ps();
1480             fjz1             = _mm_setzero_ps();
1481             fjx2             = _mm_setzero_ps();
1482             fjy2             = _mm_setzero_ps();
1483             fjz2             = _mm_setzero_ps();
1484
1485             /**************************
1486              * CALCULATE INTERACTIONS *
1487              **************************/
1488
1489             r00              = _mm_mul_ps(rsq00,rinv00);
1490
1491             /* EWALD ELECTROSTATICS */
1492
1493             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1494             ewrt             = _mm_mul_ps(r00,ewtabscale);
1495             ewitab           = _mm_cvttps_epi32(ewrt);
1496             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1497             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1498                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1499                                          &ewtabF,&ewtabFn);
1500             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1501             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1502
1503             /* LENNARD-JONES DISPERSION/REPULSION */
1504
1505             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1506             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1507
1508             fscal            = _mm_add_ps(felec,fvdw);
1509
1510             /* Calculate temporary vectorial force */
1511             tx               = _mm_mul_ps(fscal,dx00);
1512             ty               = _mm_mul_ps(fscal,dy00);
1513             tz               = _mm_mul_ps(fscal,dz00);
1514
1515             /* Update vectorial force */
1516             fix0             = _mm_add_ps(fix0,tx);
1517             fiy0             = _mm_add_ps(fiy0,ty);
1518             fiz0             = _mm_add_ps(fiz0,tz);
1519
1520             fjx0             = _mm_add_ps(fjx0,tx);
1521             fjy0             = _mm_add_ps(fjy0,ty);
1522             fjz0             = _mm_add_ps(fjz0,tz);
1523
1524             /**************************
1525              * CALCULATE INTERACTIONS *
1526              **************************/
1527
1528             r01              = _mm_mul_ps(rsq01,rinv01);
1529
1530             /* EWALD ELECTROSTATICS */
1531
1532             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1533             ewrt             = _mm_mul_ps(r01,ewtabscale);
1534             ewitab           = _mm_cvttps_epi32(ewrt);
1535             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1536             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1537                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1538                                          &ewtabF,&ewtabFn);
1539             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1540             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1541
1542             fscal            = felec;
1543
1544             /* Calculate temporary vectorial force */
1545             tx               = _mm_mul_ps(fscal,dx01);
1546             ty               = _mm_mul_ps(fscal,dy01);
1547             tz               = _mm_mul_ps(fscal,dz01);
1548
1549             /* Update vectorial force */
1550             fix0             = _mm_add_ps(fix0,tx);
1551             fiy0             = _mm_add_ps(fiy0,ty);
1552             fiz0             = _mm_add_ps(fiz0,tz);
1553
1554             fjx1             = _mm_add_ps(fjx1,tx);
1555             fjy1             = _mm_add_ps(fjy1,ty);
1556             fjz1             = _mm_add_ps(fjz1,tz);
1557
1558             /**************************
1559              * CALCULATE INTERACTIONS *
1560              **************************/
1561
1562             r02              = _mm_mul_ps(rsq02,rinv02);
1563
1564             /* EWALD ELECTROSTATICS */
1565
1566             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1567             ewrt             = _mm_mul_ps(r02,ewtabscale);
1568             ewitab           = _mm_cvttps_epi32(ewrt);
1569             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1570             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1571                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1572                                          &ewtabF,&ewtabFn);
1573             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1574             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1575
1576             fscal            = felec;
1577
1578             /* Calculate temporary vectorial force */
1579             tx               = _mm_mul_ps(fscal,dx02);
1580             ty               = _mm_mul_ps(fscal,dy02);
1581             tz               = _mm_mul_ps(fscal,dz02);
1582
1583             /* Update vectorial force */
1584             fix0             = _mm_add_ps(fix0,tx);
1585             fiy0             = _mm_add_ps(fiy0,ty);
1586             fiz0             = _mm_add_ps(fiz0,tz);
1587
1588             fjx2             = _mm_add_ps(fjx2,tx);
1589             fjy2             = _mm_add_ps(fjy2,ty);
1590             fjz2             = _mm_add_ps(fjz2,tz);
1591
1592             /**************************
1593              * CALCULATE INTERACTIONS *
1594              **************************/
1595
1596             r10              = _mm_mul_ps(rsq10,rinv10);
1597
1598             /* EWALD ELECTROSTATICS */
1599
1600             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1601             ewrt             = _mm_mul_ps(r10,ewtabscale);
1602             ewitab           = _mm_cvttps_epi32(ewrt);
1603             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1604             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1605                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1606                                          &ewtabF,&ewtabFn);
1607             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1608             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1609
1610             fscal            = felec;
1611
1612             /* Calculate temporary vectorial force */
1613             tx               = _mm_mul_ps(fscal,dx10);
1614             ty               = _mm_mul_ps(fscal,dy10);
1615             tz               = _mm_mul_ps(fscal,dz10);
1616
1617             /* Update vectorial force */
1618             fix1             = _mm_add_ps(fix1,tx);
1619             fiy1             = _mm_add_ps(fiy1,ty);
1620             fiz1             = _mm_add_ps(fiz1,tz);
1621
1622             fjx0             = _mm_add_ps(fjx0,tx);
1623             fjy0             = _mm_add_ps(fjy0,ty);
1624             fjz0             = _mm_add_ps(fjz0,tz);
1625
1626             /**************************
1627              * CALCULATE INTERACTIONS *
1628              **************************/
1629
1630             r11              = _mm_mul_ps(rsq11,rinv11);
1631
1632             /* EWALD ELECTROSTATICS */
1633
1634             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1635             ewrt             = _mm_mul_ps(r11,ewtabscale);
1636             ewitab           = _mm_cvttps_epi32(ewrt);
1637             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1638             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1639                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1640                                          &ewtabF,&ewtabFn);
1641             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1642             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1643
1644             fscal            = felec;
1645
1646             /* Calculate temporary vectorial force */
1647             tx               = _mm_mul_ps(fscal,dx11);
1648             ty               = _mm_mul_ps(fscal,dy11);
1649             tz               = _mm_mul_ps(fscal,dz11);
1650
1651             /* Update vectorial force */
1652             fix1             = _mm_add_ps(fix1,tx);
1653             fiy1             = _mm_add_ps(fiy1,ty);
1654             fiz1             = _mm_add_ps(fiz1,tz);
1655
1656             fjx1             = _mm_add_ps(fjx1,tx);
1657             fjy1             = _mm_add_ps(fjy1,ty);
1658             fjz1             = _mm_add_ps(fjz1,tz);
1659
1660             /**************************
1661              * CALCULATE INTERACTIONS *
1662              **************************/
1663
1664             r12              = _mm_mul_ps(rsq12,rinv12);
1665
1666             /* EWALD ELECTROSTATICS */
1667
1668             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1669             ewrt             = _mm_mul_ps(r12,ewtabscale);
1670             ewitab           = _mm_cvttps_epi32(ewrt);
1671             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1672             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1673                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1674                                          &ewtabF,&ewtabFn);
1675             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1676             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1677
1678             fscal            = felec;
1679
1680             /* Calculate temporary vectorial force */
1681             tx               = _mm_mul_ps(fscal,dx12);
1682             ty               = _mm_mul_ps(fscal,dy12);
1683             tz               = _mm_mul_ps(fscal,dz12);
1684
1685             /* Update vectorial force */
1686             fix1             = _mm_add_ps(fix1,tx);
1687             fiy1             = _mm_add_ps(fiy1,ty);
1688             fiz1             = _mm_add_ps(fiz1,tz);
1689
1690             fjx2             = _mm_add_ps(fjx2,tx);
1691             fjy2             = _mm_add_ps(fjy2,ty);
1692             fjz2             = _mm_add_ps(fjz2,tz);
1693
1694             /**************************
1695              * CALCULATE INTERACTIONS *
1696              **************************/
1697
1698             r20              = _mm_mul_ps(rsq20,rinv20);
1699
1700             /* EWALD ELECTROSTATICS */
1701
1702             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1703             ewrt             = _mm_mul_ps(r20,ewtabscale);
1704             ewitab           = _mm_cvttps_epi32(ewrt);
1705             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1706             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1707                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1708                                          &ewtabF,&ewtabFn);
1709             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1710             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1711
1712             fscal            = felec;
1713
1714             /* Calculate temporary vectorial force */
1715             tx               = _mm_mul_ps(fscal,dx20);
1716             ty               = _mm_mul_ps(fscal,dy20);
1717             tz               = _mm_mul_ps(fscal,dz20);
1718
1719             /* Update vectorial force */
1720             fix2             = _mm_add_ps(fix2,tx);
1721             fiy2             = _mm_add_ps(fiy2,ty);
1722             fiz2             = _mm_add_ps(fiz2,tz);
1723
1724             fjx0             = _mm_add_ps(fjx0,tx);
1725             fjy0             = _mm_add_ps(fjy0,ty);
1726             fjz0             = _mm_add_ps(fjz0,tz);
1727
1728             /**************************
1729              * CALCULATE INTERACTIONS *
1730              **************************/
1731
1732             r21              = _mm_mul_ps(rsq21,rinv21);
1733
1734             /* EWALD ELECTROSTATICS */
1735
1736             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1737             ewrt             = _mm_mul_ps(r21,ewtabscale);
1738             ewitab           = _mm_cvttps_epi32(ewrt);
1739             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1740             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1741                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1742                                          &ewtabF,&ewtabFn);
1743             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1744             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1745
1746             fscal            = felec;
1747
1748             /* Calculate temporary vectorial force */
1749             tx               = _mm_mul_ps(fscal,dx21);
1750             ty               = _mm_mul_ps(fscal,dy21);
1751             tz               = _mm_mul_ps(fscal,dz21);
1752
1753             /* Update vectorial force */
1754             fix2             = _mm_add_ps(fix2,tx);
1755             fiy2             = _mm_add_ps(fiy2,ty);
1756             fiz2             = _mm_add_ps(fiz2,tz);
1757
1758             fjx1             = _mm_add_ps(fjx1,tx);
1759             fjy1             = _mm_add_ps(fjy1,ty);
1760             fjz1             = _mm_add_ps(fjz1,tz);
1761
1762             /**************************
1763              * CALCULATE INTERACTIONS *
1764              **************************/
1765
1766             r22              = _mm_mul_ps(rsq22,rinv22);
1767
1768             /* EWALD ELECTROSTATICS */
1769
1770             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1771             ewrt             = _mm_mul_ps(r22,ewtabscale);
1772             ewitab           = _mm_cvttps_epi32(ewrt);
1773             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1774             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1775                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1776                                          &ewtabF,&ewtabFn);
1777             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1778             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1779
1780             fscal            = felec;
1781
1782             /* Calculate temporary vectorial force */
1783             tx               = _mm_mul_ps(fscal,dx22);
1784             ty               = _mm_mul_ps(fscal,dy22);
1785             tz               = _mm_mul_ps(fscal,dz22);
1786
1787             /* Update vectorial force */
1788             fix2             = _mm_add_ps(fix2,tx);
1789             fiy2             = _mm_add_ps(fiy2,ty);
1790             fiz2             = _mm_add_ps(fiz2,tz);
1791
1792             fjx2             = _mm_add_ps(fjx2,tx);
1793             fjy2             = _mm_add_ps(fjy2,ty);
1794             fjz2             = _mm_add_ps(fjz2,tz);
1795
1796             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1797                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1798                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1799
1800             /* Inner loop uses 331 flops */
1801         }
1802
1803         if(jidx<j_index_end)
1804         {
1805
1806             /* Get j neighbor index, and coordinate index */
1807             jnrA             = jjnr[jidx];
1808             jnrB             = jjnr[jidx+1];
1809             jnrC             = jjnr[jidx+2];
1810             jnrD             = jjnr[jidx+3];
1811
1812             /* Sign of each element will be negative for non-real atoms.
1813              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1814              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1815              */
1816             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1817             jnrA       = (jnrA>=0) ? jnrA : 0;
1818             jnrB       = (jnrB>=0) ? jnrB : 0;
1819             jnrC       = (jnrC>=0) ? jnrC : 0;
1820             jnrD       = (jnrD>=0) ? jnrD : 0;
1821
1822             j_coord_offsetA  = DIM*jnrA;
1823             j_coord_offsetB  = DIM*jnrB;
1824             j_coord_offsetC  = DIM*jnrC;
1825             j_coord_offsetD  = DIM*jnrD;
1826
1827             /* load j atom coordinates */
1828             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1829                                               x+j_coord_offsetC,x+j_coord_offsetD,
1830                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1831
1832             /* Calculate displacement vector */
1833             dx00             = _mm_sub_ps(ix0,jx0);
1834             dy00             = _mm_sub_ps(iy0,jy0);
1835             dz00             = _mm_sub_ps(iz0,jz0);
1836             dx01             = _mm_sub_ps(ix0,jx1);
1837             dy01             = _mm_sub_ps(iy0,jy1);
1838             dz01             = _mm_sub_ps(iz0,jz1);
1839             dx02             = _mm_sub_ps(ix0,jx2);
1840             dy02             = _mm_sub_ps(iy0,jy2);
1841             dz02             = _mm_sub_ps(iz0,jz2);
1842             dx10             = _mm_sub_ps(ix1,jx0);
1843             dy10             = _mm_sub_ps(iy1,jy0);
1844             dz10             = _mm_sub_ps(iz1,jz0);
1845             dx11             = _mm_sub_ps(ix1,jx1);
1846             dy11             = _mm_sub_ps(iy1,jy1);
1847             dz11             = _mm_sub_ps(iz1,jz1);
1848             dx12             = _mm_sub_ps(ix1,jx2);
1849             dy12             = _mm_sub_ps(iy1,jy2);
1850             dz12             = _mm_sub_ps(iz1,jz2);
1851             dx20             = _mm_sub_ps(ix2,jx0);
1852             dy20             = _mm_sub_ps(iy2,jy0);
1853             dz20             = _mm_sub_ps(iz2,jz0);
1854             dx21             = _mm_sub_ps(ix2,jx1);
1855             dy21             = _mm_sub_ps(iy2,jy1);
1856             dz21             = _mm_sub_ps(iz2,jz1);
1857             dx22             = _mm_sub_ps(ix2,jx2);
1858             dy22             = _mm_sub_ps(iy2,jy2);
1859             dz22             = _mm_sub_ps(iz2,jz2);
1860
1861             /* Calculate squared distance and things based on it */
1862             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1863             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1864             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1865             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1866             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1867             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1868             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1869             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1870             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1871
1872             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1873             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1874             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1875             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1876             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1877             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1878             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1879             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1880             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1881
1882             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1883             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1884             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1885             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1886             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1887             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1888             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1889             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1890             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1891
1892             fjx0             = _mm_setzero_ps();
1893             fjy0             = _mm_setzero_ps();
1894             fjz0             = _mm_setzero_ps();
1895             fjx1             = _mm_setzero_ps();
1896             fjy1             = _mm_setzero_ps();
1897             fjz1             = _mm_setzero_ps();
1898             fjx2             = _mm_setzero_ps();
1899             fjy2             = _mm_setzero_ps();
1900             fjz2             = _mm_setzero_ps();
1901
1902             /**************************
1903              * CALCULATE INTERACTIONS *
1904              **************************/
1905
1906             r00              = _mm_mul_ps(rsq00,rinv00);
1907             r00              = _mm_andnot_ps(dummy_mask,r00);
1908
1909             /* EWALD ELECTROSTATICS */
1910
1911             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1912             ewrt             = _mm_mul_ps(r00,ewtabscale);
1913             ewitab           = _mm_cvttps_epi32(ewrt);
1914             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1915             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1916                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1917                                          &ewtabF,&ewtabFn);
1918             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1919             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1920
1921             /* LENNARD-JONES DISPERSION/REPULSION */
1922
1923             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1924             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1925
1926             fscal            = _mm_add_ps(felec,fvdw);
1927
1928             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1929
1930             /* Calculate temporary vectorial force */
1931             tx               = _mm_mul_ps(fscal,dx00);
1932             ty               = _mm_mul_ps(fscal,dy00);
1933             tz               = _mm_mul_ps(fscal,dz00);
1934
1935             /* Update vectorial force */
1936             fix0             = _mm_add_ps(fix0,tx);
1937             fiy0             = _mm_add_ps(fiy0,ty);
1938             fiz0             = _mm_add_ps(fiz0,tz);
1939
1940             fjx0             = _mm_add_ps(fjx0,tx);
1941             fjy0             = _mm_add_ps(fjy0,ty);
1942             fjz0             = _mm_add_ps(fjz0,tz);
1943
1944             /**************************
1945              * CALCULATE INTERACTIONS *
1946              **************************/
1947
1948             r01              = _mm_mul_ps(rsq01,rinv01);
1949             r01              = _mm_andnot_ps(dummy_mask,r01);
1950
1951             /* EWALD ELECTROSTATICS */
1952
1953             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1954             ewrt             = _mm_mul_ps(r01,ewtabscale);
1955             ewitab           = _mm_cvttps_epi32(ewrt);
1956             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1957             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1958                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1959                                          &ewtabF,&ewtabFn);
1960             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1961             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1962
1963             fscal            = felec;
1964
1965             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1966
1967             /* Calculate temporary vectorial force */
1968             tx               = _mm_mul_ps(fscal,dx01);
1969             ty               = _mm_mul_ps(fscal,dy01);
1970             tz               = _mm_mul_ps(fscal,dz01);
1971
1972             /* Update vectorial force */
1973             fix0             = _mm_add_ps(fix0,tx);
1974             fiy0             = _mm_add_ps(fiy0,ty);
1975             fiz0             = _mm_add_ps(fiz0,tz);
1976
1977             fjx1             = _mm_add_ps(fjx1,tx);
1978             fjy1             = _mm_add_ps(fjy1,ty);
1979             fjz1             = _mm_add_ps(fjz1,tz);
1980
1981             /**************************
1982              * CALCULATE INTERACTIONS *
1983              **************************/
1984
1985             r02              = _mm_mul_ps(rsq02,rinv02);
1986             r02              = _mm_andnot_ps(dummy_mask,r02);
1987
1988             /* EWALD ELECTROSTATICS */
1989
1990             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1991             ewrt             = _mm_mul_ps(r02,ewtabscale);
1992             ewitab           = _mm_cvttps_epi32(ewrt);
1993             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1994             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1995                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1996                                          &ewtabF,&ewtabFn);
1997             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1998             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1999
2000             fscal            = felec;
2001
2002             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2003
2004             /* Calculate temporary vectorial force */
2005             tx               = _mm_mul_ps(fscal,dx02);
2006             ty               = _mm_mul_ps(fscal,dy02);
2007             tz               = _mm_mul_ps(fscal,dz02);
2008
2009             /* Update vectorial force */
2010             fix0             = _mm_add_ps(fix0,tx);
2011             fiy0             = _mm_add_ps(fiy0,ty);
2012             fiz0             = _mm_add_ps(fiz0,tz);
2013
2014             fjx2             = _mm_add_ps(fjx2,tx);
2015             fjy2             = _mm_add_ps(fjy2,ty);
2016             fjz2             = _mm_add_ps(fjz2,tz);
2017
2018             /**************************
2019              * CALCULATE INTERACTIONS *
2020              **************************/
2021
2022             r10              = _mm_mul_ps(rsq10,rinv10);
2023             r10              = _mm_andnot_ps(dummy_mask,r10);
2024
2025             /* EWALD ELECTROSTATICS */
2026
2027             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2028             ewrt             = _mm_mul_ps(r10,ewtabscale);
2029             ewitab           = _mm_cvttps_epi32(ewrt);
2030             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2031             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2032                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2033                                          &ewtabF,&ewtabFn);
2034             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2035             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2036
2037             fscal            = felec;
2038
2039             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2040
2041             /* Calculate temporary vectorial force */
2042             tx               = _mm_mul_ps(fscal,dx10);
2043             ty               = _mm_mul_ps(fscal,dy10);
2044             tz               = _mm_mul_ps(fscal,dz10);
2045
2046             /* Update vectorial force */
2047             fix1             = _mm_add_ps(fix1,tx);
2048             fiy1             = _mm_add_ps(fiy1,ty);
2049             fiz1             = _mm_add_ps(fiz1,tz);
2050
2051             fjx0             = _mm_add_ps(fjx0,tx);
2052             fjy0             = _mm_add_ps(fjy0,ty);
2053             fjz0             = _mm_add_ps(fjz0,tz);
2054
2055             /**************************
2056              * CALCULATE INTERACTIONS *
2057              **************************/
2058
2059             r11              = _mm_mul_ps(rsq11,rinv11);
2060             r11              = _mm_andnot_ps(dummy_mask,r11);
2061
2062             /* EWALD ELECTROSTATICS */
2063
2064             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2065             ewrt             = _mm_mul_ps(r11,ewtabscale);
2066             ewitab           = _mm_cvttps_epi32(ewrt);
2067             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2068             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2069                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2070                                          &ewtabF,&ewtabFn);
2071             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2072             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2073
2074             fscal            = felec;
2075
2076             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2077
2078             /* Calculate temporary vectorial force */
2079             tx               = _mm_mul_ps(fscal,dx11);
2080             ty               = _mm_mul_ps(fscal,dy11);
2081             tz               = _mm_mul_ps(fscal,dz11);
2082
2083             /* Update vectorial force */
2084             fix1             = _mm_add_ps(fix1,tx);
2085             fiy1             = _mm_add_ps(fiy1,ty);
2086             fiz1             = _mm_add_ps(fiz1,tz);
2087
2088             fjx1             = _mm_add_ps(fjx1,tx);
2089             fjy1             = _mm_add_ps(fjy1,ty);
2090             fjz1             = _mm_add_ps(fjz1,tz);
2091
2092             /**************************
2093              * CALCULATE INTERACTIONS *
2094              **************************/
2095
2096             r12              = _mm_mul_ps(rsq12,rinv12);
2097             r12              = _mm_andnot_ps(dummy_mask,r12);
2098
2099             /* EWALD ELECTROSTATICS */
2100
2101             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2102             ewrt             = _mm_mul_ps(r12,ewtabscale);
2103             ewitab           = _mm_cvttps_epi32(ewrt);
2104             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2105             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2106                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2107                                          &ewtabF,&ewtabFn);
2108             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2109             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2110
2111             fscal            = felec;
2112
2113             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2114
2115             /* Calculate temporary vectorial force */
2116             tx               = _mm_mul_ps(fscal,dx12);
2117             ty               = _mm_mul_ps(fscal,dy12);
2118             tz               = _mm_mul_ps(fscal,dz12);
2119
2120             /* Update vectorial force */
2121             fix1             = _mm_add_ps(fix1,tx);
2122             fiy1             = _mm_add_ps(fiy1,ty);
2123             fiz1             = _mm_add_ps(fiz1,tz);
2124
2125             fjx2             = _mm_add_ps(fjx2,tx);
2126             fjy2             = _mm_add_ps(fjy2,ty);
2127             fjz2             = _mm_add_ps(fjz2,tz);
2128
2129             /**************************
2130              * CALCULATE INTERACTIONS *
2131              **************************/
2132
2133             r20              = _mm_mul_ps(rsq20,rinv20);
2134             r20              = _mm_andnot_ps(dummy_mask,r20);
2135
2136             /* EWALD ELECTROSTATICS */
2137
2138             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2139             ewrt             = _mm_mul_ps(r20,ewtabscale);
2140             ewitab           = _mm_cvttps_epi32(ewrt);
2141             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2142             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2143                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2144                                          &ewtabF,&ewtabFn);
2145             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2146             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2147
2148             fscal            = felec;
2149
2150             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2151
2152             /* Calculate temporary vectorial force */
2153             tx               = _mm_mul_ps(fscal,dx20);
2154             ty               = _mm_mul_ps(fscal,dy20);
2155             tz               = _mm_mul_ps(fscal,dz20);
2156
2157             /* Update vectorial force */
2158             fix2             = _mm_add_ps(fix2,tx);
2159             fiy2             = _mm_add_ps(fiy2,ty);
2160             fiz2             = _mm_add_ps(fiz2,tz);
2161
2162             fjx0             = _mm_add_ps(fjx0,tx);
2163             fjy0             = _mm_add_ps(fjy0,ty);
2164             fjz0             = _mm_add_ps(fjz0,tz);
2165
2166             /**************************
2167              * CALCULATE INTERACTIONS *
2168              **************************/
2169
2170             r21              = _mm_mul_ps(rsq21,rinv21);
2171             r21              = _mm_andnot_ps(dummy_mask,r21);
2172
2173             /* EWALD ELECTROSTATICS */
2174
2175             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2176             ewrt             = _mm_mul_ps(r21,ewtabscale);
2177             ewitab           = _mm_cvttps_epi32(ewrt);
2178             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2179             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2180                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2181                                          &ewtabF,&ewtabFn);
2182             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2183             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2184
2185             fscal            = felec;
2186
2187             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2188
2189             /* Calculate temporary vectorial force */
2190             tx               = _mm_mul_ps(fscal,dx21);
2191             ty               = _mm_mul_ps(fscal,dy21);
2192             tz               = _mm_mul_ps(fscal,dz21);
2193
2194             /* Update vectorial force */
2195             fix2             = _mm_add_ps(fix2,tx);
2196             fiy2             = _mm_add_ps(fiy2,ty);
2197             fiz2             = _mm_add_ps(fiz2,tz);
2198
2199             fjx1             = _mm_add_ps(fjx1,tx);
2200             fjy1             = _mm_add_ps(fjy1,ty);
2201             fjz1             = _mm_add_ps(fjz1,tz);
2202
2203             /**************************
2204              * CALCULATE INTERACTIONS *
2205              **************************/
2206
2207             r22              = _mm_mul_ps(rsq22,rinv22);
2208             r22              = _mm_andnot_ps(dummy_mask,r22);
2209
2210             /* EWALD ELECTROSTATICS */
2211
2212             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2213             ewrt             = _mm_mul_ps(r22,ewtabscale);
2214             ewitab           = _mm_cvttps_epi32(ewrt);
2215             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2216             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2217                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2218                                          &ewtabF,&ewtabFn);
2219             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2220             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2221
2222             fscal            = felec;
2223
2224             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2225
2226             /* Calculate temporary vectorial force */
2227             tx               = _mm_mul_ps(fscal,dx22);
2228             ty               = _mm_mul_ps(fscal,dy22);
2229             tz               = _mm_mul_ps(fscal,dz22);
2230
2231             /* Update vectorial force */
2232             fix2             = _mm_add_ps(fix2,tx);
2233             fiy2             = _mm_add_ps(fiy2,ty);
2234             fiz2             = _mm_add_ps(fiz2,tz);
2235
2236             fjx2             = _mm_add_ps(fjx2,tx);
2237             fjy2             = _mm_add_ps(fjy2,ty);
2238             fjz2             = _mm_add_ps(fjz2,tz);
2239
2240             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2241                                                    f+j_coord_offsetC,f+j_coord_offsetD,
2242                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2243
2244             /* Inner loop uses 340 flops */
2245         }
2246
2247         /* End of innermost loop */
2248
2249         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2250                                               f+i_coord_offset,fshift+i_shift_offset);
2251
2252         /* Increment number of inner iterations */
2253         inneriter                  += j_index_end - j_index_start;
2254
2255         /* Outer loop uses 27 flops */
2256     }
2257
2258     /* Increment number of outer iterations */
2259     outeriter        += nri;
2260
2261     /* Update outer/inner flops */
2262
2263     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*340);
2264 }