a337ae2f77828081323f6ed54cd03b8af307ab7a
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecEw_VdwLJ_GeomW3W3_sse4_1_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse4_1_single.h"
34 #include "kernelutil_x86_sse4_1_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse4_1_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse4_1_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
91     real             *charge;
92     int              nvdwtype;
93     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
94     int              *vdwtype;
95     real             *vdwparam;
96     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
97     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
98     __m128i          ewitab;
99     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
100     real             *ewtab;
101     __m128           dummy_mask,cutoff_mask;
102     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
103     __m128           one     = _mm_set1_ps(1.0);
104     __m128           two     = _mm_set1_ps(2.0);
105     x                = xx[0];
106     f                = ff[0];
107
108     nri              = nlist->nri;
109     iinr             = nlist->iinr;
110     jindex           = nlist->jindex;
111     jjnr             = nlist->jjnr;
112     shiftidx         = nlist->shift;
113     gid              = nlist->gid;
114     shiftvec         = fr->shift_vec[0];
115     fshift           = fr->fshift[0];
116     facel            = _mm_set1_ps(fr->epsfac);
117     charge           = mdatoms->chargeA;
118     nvdwtype         = fr->ntype;
119     vdwparam         = fr->nbfp;
120     vdwtype          = mdatoms->typeA;
121
122     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
123     ewtab            = fr->ic->tabq_coul_FDV0;
124     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
125     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
126
127     /* Setup water-specific parameters */
128     inr              = nlist->iinr[0];
129     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
130     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
131     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
132     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
133
134     jq0              = _mm_set1_ps(charge[inr+0]);
135     jq1              = _mm_set1_ps(charge[inr+1]);
136     jq2              = _mm_set1_ps(charge[inr+2]);
137     vdwjidx0A        = 2*vdwtype[inr+0];
138     qq00             = _mm_mul_ps(iq0,jq0);
139     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
140     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
141     qq01             = _mm_mul_ps(iq0,jq1);
142     qq02             = _mm_mul_ps(iq0,jq2);
143     qq10             = _mm_mul_ps(iq1,jq0);
144     qq11             = _mm_mul_ps(iq1,jq1);
145     qq12             = _mm_mul_ps(iq1,jq2);
146     qq20             = _mm_mul_ps(iq2,jq0);
147     qq21             = _mm_mul_ps(iq2,jq1);
148     qq22             = _mm_mul_ps(iq2,jq2);
149
150     /* Avoid stupid compiler warnings */
151     jnrA = jnrB = jnrC = jnrD = 0;
152     j_coord_offsetA = 0;
153     j_coord_offsetB = 0;
154     j_coord_offsetC = 0;
155     j_coord_offsetD = 0;
156
157     outeriter        = 0;
158     inneriter        = 0;
159
160     for(iidx=0;iidx<4*DIM;iidx++)
161     {
162         scratch[iidx] = 0.0;
163     }
164
165     /* Start outer loop over neighborlists */
166     for(iidx=0; iidx<nri; iidx++)
167     {
168         /* Load shift vector for this list */
169         i_shift_offset   = DIM*shiftidx[iidx];
170
171         /* Load limits for loop over neighbors */
172         j_index_start    = jindex[iidx];
173         j_index_end      = jindex[iidx+1];
174
175         /* Get outer coordinate index */
176         inr              = iinr[iidx];
177         i_coord_offset   = DIM*inr;
178
179         /* Load i particle coords and add shift vector */
180         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
181                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
182
183         fix0             = _mm_setzero_ps();
184         fiy0             = _mm_setzero_ps();
185         fiz0             = _mm_setzero_ps();
186         fix1             = _mm_setzero_ps();
187         fiy1             = _mm_setzero_ps();
188         fiz1             = _mm_setzero_ps();
189         fix2             = _mm_setzero_ps();
190         fiy2             = _mm_setzero_ps();
191         fiz2             = _mm_setzero_ps();
192
193         /* Reset potential sums */
194         velecsum         = _mm_setzero_ps();
195         vvdwsum          = _mm_setzero_ps();
196
197         /* Start inner kernel loop */
198         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
199         {
200
201             /* Get j neighbor index, and coordinate index */
202             jnrA             = jjnr[jidx];
203             jnrB             = jjnr[jidx+1];
204             jnrC             = jjnr[jidx+2];
205             jnrD             = jjnr[jidx+3];
206             j_coord_offsetA  = DIM*jnrA;
207             j_coord_offsetB  = DIM*jnrB;
208             j_coord_offsetC  = DIM*jnrC;
209             j_coord_offsetD  = DIM*jnrD;
210
211             /* load j atom coordinates */
212             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
213                                               x+j_coord_offsetC,x+j_coord_offsetD,
214                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
215
216             /* Calculate displacement vector */
217             dx00             = _mm_sub_ps(ix0,jx0);
218             dy00             = _mm_sub_ps(iy0,jy0);
219             dz00             = _mm_sub_ps(iz0,jz0);
220             dx01             = _mm_sub_ps(ix0,jx1);
221             dy01             = _mm_sub_ps(iy0,jy1);
222             dz01             = _mm_sub_ps(iz0,jz1);
223             dx02             = _mm_sub_ps(ix0,jx2);
224             dy02             = _mm_sub_ps(iy0,jy2);
225             dz02             = _mm_sub_ps(iz0,jz2);
226             dx10             = _mm_sub_ps(ix1,jx0);
227             dy10             = _mm_sub_ps(iy1,jy0);
228             dz10             = _mm_sub_ps(iz1,jz0);
229             dx11             = _mm_sub_ps(ix1,jx1);
230             dy11             = _mm_sub_ps(iy1,jy1);
231             dz11             = _mm_sub_ps(iz1,jz1);
232             dx12             = _mm_sub_ps(ix1,jx2);
233             dy12             = _mm_sub_ps(iy1,jy2);
234             dz12             = _mm_sub_ps(iz1,jz2);
235             dx20             = _mm_sub_ps(ix2,jx0);
236             dy20             = _mm_sub_ps(iy2,jy0);
237             dz20             = _mm_sub_ps(iz2,jz0);
238             dx21             = _mm_sub_ps(ix2,jx1);
239             dy21             = _mm_sub_ps(iy2,jy1);
240             dz21             = _mm_sub_ps(iz2,jz1);
241             dx22             = _mm_sub_ps(ix2,jx2);
242             dy22             = _mm_sub_ps(iy2,jy2);
243             dz22             = _mm_sub_ps(iz2,jz2);
244
245             /* Calculate squared distance and things based on it */
246             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
247             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
248             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
249             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
250             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
251             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
252             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
253             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
254             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
255
256             rinv00           = gmx_mm_invsqrt_ps(rsq00);
257             rinv01           = gmx_mm_invsqrt_ps(rsq01);
258             rinv02           = gmx_mm_invsqrt_ps(rsq02);
259             rinv10           = gmx_mm_invsqrt_ps(rsq10);
260             rinv11           = gmx_mm_invsqrt_ps(rsq11);
261             rinv12           = gmx_mm_invsqrt_ps(rsq12);
262             rinv20           = gmx_mm_invsqrt_ps(rsq20);
263             rinv21           = gmx_mm_invsqrt_ps(rsq21);
264             rinv22           = gmx_mm_invsqrt_ps(rsq22);
265
266             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
267             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
268             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
269             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
270             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
271             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
272             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
273             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
274             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
275
276             fjx0             = _mm_setzero_ps();
277             fjy0             = _mm_setzero_ps();
278             fjz0             = _mm_setzero_ps();
279             fjx1             = _mm_setzero_ps();
280             fjy1             = _mm_setzero_ps();
281             fjz1             = _mm_setzero_ps();
282             fjx2             = _mm_setzero_ps();
283             fjy2             = _mm_setzero_ps();
284             fjz2             = _mm_setzero_ps();
285
286             /**************************
287              * CALCULATE INTERACTIONS *
288              **************************/
289
290             r00              = _mm_mul_ps(rsq00,rinv00);
291
292             /* EWALD ELECTROSTATICS */
293
294             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
295             ewrt             = _mm_mul_ps(r00,ewtabscale);
296             ewitab           = _mm_cvttps_epi32(ewrt);
297             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
298             ewitab           = _mm_slli_epi32(ewitab,2);
299             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
300             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
301             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
302             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
303             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
304             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
305             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
306             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
307             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
308
309             /* LENNARD-JONES DISPERSION/REPULSION */
310
311             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
312             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
313             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
314             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
315             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
316
317             /* Update potential sum for this i atom from the interaction with this j atom. */
318             velecsum         = _mm_add_ps(velecsum,velec);
319             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
320
321             fscal            = _mm_add_ps(felec,fvdw);
322
323             /* Calculate temporary vectorial force */
324             tx               = _mm_mul_ps(fscal,dx00);
325             ty               = _mm_mul_ps(fscal,dy00);
326             tz               = _mm_mul_ps(fscal,dz00);
327
328             /* Update vectorial force */
329             fix0             = _mm_add_ps(fix0,tx);
330             fiy0             = _mm_add_ps(fiy0,ty);
331             fiz0             = _mm_add_ps(fiz0,tz);
332
333             fjx0             = _mm_add_ps(fjx0,tx);
334             fjy0             = _mm_add_ps(fjy0,ty);
335             fjz0             = _mm_add_ps(fjz0,tz);
336
337             /**************************
338              * CALCULATE INTERACTIONS *
339              **************************/
340
341             r01              = _mm_mul_ps(rsq01,rinv01);
342
343             /* EWALD ELECTROSTATICS */
344
345             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
346             ewrt             = _mm_mul_ps(r01,ewtabscale);
347             ewitab           = _mm_cvttps_epi32(ewrt);
348             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
349             ewitab           = _mm_slli_epi32(ewitab,2);
350             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
351             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
352             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
353             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
354             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
355             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
356             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
357             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
358             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
359
360             /* Update potential sum for this i atom from the interaction with this j atom. */
361             velecsum         = _mm_add_ps(velecsum,velec);
362
363             fscal            = felec;
364
365             /* Calculate temporary vectorial force */
366             tx               = _mm_mul_ps(fscal,dx01);
367             ty               = _mm_mul_ps(fscal,dy01);
368             tz               = _mm_mul_ps(fscal,dz01);
369
370             /* Update vectorial force */
371             fix0             = _mm_add_ps(fix0,tx);
372             fiy0             = _mm_add_ps(fiy0,ty);
373             fiz0             = _mm_add_ps(fiz0,tz);
374
375             fjx1             = _mm_add_ps(fjx1,tx);
376             fjy1             = _mm_add_ps(fjy1,ty);
377             fjz1             = _mm_add_ps(fjz1,tz);
378
379             /**************************
380              * CALCULATE INTERACTIONS *
381              **************************/
382
383             r02              = _mm_mul_ps(rsq02,rinv02);
384
385             /* EWALD ELECTROSTATICS */
386
387             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
388             ewrt             = _mm_mul_ps(r02,ewtabscale);
389             ewitab           = _mm_cvttps_epi32(ewrt);
390             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
391             ewitab           = _mm_slli_epi32(ewitab,2);
392             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
393             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
394             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
395             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
396             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
397             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
398             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
399             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
400             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
401
402             /* Update potential sum for this i atom from the interaction with this j atom. */
403             velecsum         = _mm_add_ps(velecsum,velec);
404
405             fscal            = felec;
406
407             /* Calculate temporary vectorial force */
408             tx               = _mm_mul_ps(fscal,dx02);
409             ty               = _mm_mul_ps(fscal,dy02);
410             tz               = _mm_mul_ps(fscal,dz02);
411
412             /* Update vectorial force */
413             fix0             = _mm_add_ps(fix0,tx);
414             fiy0             = _mm_add_ps(fiy0,ty);
415             fiz0             = _mm_add_ps(fiz0,tz);
416
417             fjx2             = _mm_add_ps(fjx2,tx);
418             fjy2             = _mm_add_ps(fjy2,ty);
419             fjz2             = _mm_add_ps(fjz2,tz);
420
421             /**************************
422              * CALCULATE INTERACTIONS *
423              **************************/
424
425             r10              = _mm_mul_ps(rsq10,rinv10);
426
427             /* EWALD ELECTROSTATICS */
428
429             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
430             ewrt             = _mm_mul_ps(r10,ewtabscale);
431             ewitab           = _mm_cvttps_epi32(ewrt);
432             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
433             ewitab           = _mm_slli_epi32(ewitab,2);
434             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
435             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
436             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
437             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
438             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
439             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
440             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
441             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
442             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
443
444             /* Update potential sum for this i atom from the interaction with this j atom. */
445             velecsum         = _mm_add_ps(velecsum,velec);
446
447             fscal            = felec;
448
449             /* Calculate temporary vectorial force */
450             tx               = _mm_mul_ps(fscal,dx10);
451             ty               = _mm_mul_ps(fscal,dy10);
452             tz               = _mm_mul_ps(fscal,dz10);
453
454             /* Update vectorial force */
455             fix1             = _mm_add_ps(fix1,tx);
456             fiy1             = _mm_add_ps(fiy1,ty);
457             fiz1             = _mm_add_ps(fiz1,tz);
458
459             fjx0             = _mm_add_ps(fjx0,tx);
460             fjy0             = _mm_add_ps(fjy0,ty);
461             fjz0             = _mm_add_ps(fjz0,tz);
462
463             /**************************
464              * CALCULATE INTERACTIONS *
465              **************************/
466
467             r11              = _mm_mul_ps(rsq11,rinv11);
468
469             /* EWALD ELECTROSTATICS */
470
471             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
472             ewrt             = _mm_mul_ps(r11,ewtabscale);
473             ewitab           = _mm_cvttps_epi32(ewrt);
474             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
475             ewitab           = _mm_slli_epi32(ewitab,2);
476             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
477             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
478             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
479             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
480             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
481             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
482             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
483             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
484             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
485
486             /* Update potential sum for this i atom from the interaction with this j atom. */
487             velecsum         = _mm_add_ps(velecsum,velec);
488
489             fscal            = felec;
490
491             /* Calculate temporary vectorial force */
492             tx               = _mm_mul_ps(fscal,dx11);
493             ty               = _mm_mul_ps(fscal,dy11);
494             tz               = _mm_mul_ps(fscal,dz11);
495
496             /* Update vectorial force */
497             fix1             = _mm_add_ps(fix1,tx);
498             fiy1             = _mm_add_ps(fiy1,ty);
499             fiz1             = _mm_add_ps(fiz1,tz);
500
501             fjx1             = _mm_add_ps(fjx1,tx);
502             fjy1             = _mm_add_ps(fjy1,ty);
503             fjz1             = _mm_add_ps(fjz1,tz);
504
505             /**************************
506              * CALCULATE INTERACTIONS *
507              **************************/
508
509             r12              = _mm_mul_ps(rsq12,rinv12);
510
511             /* EWALD ELECTROSTATICS */
512
513             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
514             ewrt             = _mm_mul_ps(r12,ewtabscale);
515             ewitab           = _mm_cvttps_epi32(ewrt);
516             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
517             ewitab           = _mm_slli_epi32(ewitab,2);
518             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
519             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
520             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
521             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
522             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
523             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
524             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
525             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
526             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
527
528             /* Update potential sum for this i atom from the interaction with this j atom. */
529             velecsum         = _mm_add_ps(velecsum,velec);
530
531             fscal            = felec;
532
533             /* Calculate temporary vectorial force */
534             tx               = _mm_mul_ps(fscal,dx12);
535             ty               = _mm_mul_ps(fscal,dy12);
536             tz               = _mm_mul_ps(fscal,dz12);
537
538             /* Update vectorial force */
539             fix1             = _mm_add_ps(fix1,tx);
540             fiy1             = _mm_add_ps(fiy1,ty);
541             fiz1             = _mm_add_ps(fiz1,tz);
542
543             fjx2             = _mm_add_ps(fjx2,tx);
544             fjy2             = _mm_add_ps(fjy2,ty);
545             fjz2             = _mm_add_ps(fjz2,tz);
546
547             /**************************
548              * CALCULATE INTERACTIONS *
549              **************************/
550
551             r20              = _mm_mul_ps(rsq20,rinv20);
552
553             /* EWALD ELECTROSTATICS */
554
555             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
556             ewrt             = _mm_mul_ps(r20,ewtabscale);
557             ewitab           = _mm_cvttps_epi32(ewrt);
558             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
559             ewitab           = _mm_slli_epi32(ewitab,2);
560             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
561             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
562             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
563             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
564             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
565             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
566             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
567             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
568             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
569
570             /* Update potential sum for this i atom from the interaction with this j atom. */
571             velecsum         = _mm_add_ps(velecsum,velec);
572
573             fscal            = felec;
574
575             /* Calculate temporary vectorial force */
576             tx               = _mm_mul_ps(fscal,dx20);
577             ty               = _mm_mul_ps(fscal,dy20);
578             tz               = _mm_mul_ps(fscal,dz20);
579
580             /* Update vectorial force */
581             fix2             = _mm_add_ps(fix2,tx);
582             fiy2             = _mm_add_ps(fiy2,ty);
583             fiz2             = _mm_add_ps(fiz2,tz);
584
585             fjx0             = _mm_add_ps(fjx0,tx);
586             fjy0             = _mm_add_ps(fjy0,ty);
587             fjz0             = _mm_add_ps(fjz0,tz);
588
589             /**************************
590              * CALCULATE INTERACTIONS *
591              **************************/
592
593             r21              = _mm_mul_ps(rsq21,rinv21);
594
595             /* EWALD ELECTROSTATICS */
596
597             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
598             ewrt             = _mm_mul_ps(r21,ewtabscale);
599             ewitab           = _mm_cvttps_epi32(ewrt);
600             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
601             ewitab           = _mm_slli_epi32(ewitab,2);
602             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
603             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
604             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
605             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
606             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
607             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
608             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
609             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
610             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
611
612             /* Update potential sum for this i atom from the interaction with this j atom. */
613             velecsum         = _mm_add_ps(velecsum,velec);
614
615             fscal            = felec;
616
617             /* Calculate temporary vectorial force */
618             tx               = _mm_mul_ps(fscal,dx21);
619             ty               = _mm_mul_ps(fscal,dy21);
620             tz               = _mm_mul_ps(fscal,dz21);
621
622             /* Update vectorial force */
623             fix2             = _mm_add_ps(fix2,tx);
624             fiy2             = _mm_add_ps(fiy2,ty);
625             fiz2             = _mm_add_ps(fiz2,tz);
626
627             fjx1             = _mm_add_ps(fjx1,tx);
628             fjy1             = _mm_add_ps(fjy1,ty);
629             fjz1             = _mm_add_ps(fjz1,tz);
630
631             /**************************
632              * CALCULATE INTERACTIONS *
633              **************************/
634
635             r22              = _mm_mul_ps(rsq22,rinv22);
636
637             /* EWALD ELECTROSTATICS */
638
639             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
640             ewrt             = _mm_mul_ps(r22,ewtabscale);
641             ewitab           = _mm_cvttps_epi32(ewrt);
642             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
643             ewitab           = _mm_slli_epi32(ewitab,2);
644             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
645             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
646             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
647             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
648             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
649             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
650             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
651             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
652             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
653
654             /* Update potential sum for this i atom from the interaction with this j atom. */
655             velecsum         = _mm_add_ps(velecsum,velec);
656
657             fscal            = felec;
658
659             /* Calculate temporary vectorial force */
660             tx               = _mm_mul_ps(fscal,dx22);
661             ty               = _mm_mul_ps(fscal,dy22);
662             tz               = _mm_mul_ps(fscal,dz22);
663
664             /* Update vectorial force */
665             fix2             = _mm_add_ps(fix2,tx);
666             fiy2             = _mm_add_ps(fiy2,ty);
667             fiz2             = _mm_add_ps(fiz2,tz);
668
669             fjx2             = _mm_add_ps(fjx2,tx);
670             fjy2             = _mm_add_ps(fjy2,ty);
671             fjz2             = _mm_add_ps(fjz2,tz);
672
673             fjptrA             = f+j_coord_offsetA;
674             fjptrB             = f+j_coord_offsetB;
675             fjptrC             = f+j_coord_offsetC;
676             fjptrD             = f+j_coord_offsetD;
677
678             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
679                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
680
681             /* Inner loop uses 381 flops */
682         }
683
684         if(jidx<j_index_end)
685         {
686
687             /* Get j neighbor index, and coordinate index */
688             jnrlistA         = jjnr[jidx];
689             jnrlistB         = jjnr[jidx+1];
690             jnrlistC         = jjnr[jidx+2];
691             jnrlistD         = jjnr[jidx+3];
692             /* Sign of each element will be negative for non-real atoms.
693              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
694              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
695              */
696             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
697             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
698             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
699             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
700             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
701             j_coord_offsetA  = DIM*jnrA;
702             j_coord_offsetB  = DIM*jnrB;
703             j_coord_offsetC  = DIM*jnrC;
704             j_coord_offsetD  = DIM*jnrD;
705
706             /* load j atom coordinates */
707             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
708                                               x+j_coord_offsetC,x+j_coord_offsetD,
709                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
710
711             /* Calculate displacement vector */
712             dx00             = _mm_sub_ps(ix0,jx0);
713             dy00             = _mm_sub_ps(iy0,jy0);
714             dz00             = _mm_sub_ps(iz0,jz0);
715             dx01             = _mm_sub_ps(ix0,jx1);
716             dy01             = _mm_sub_ps(iy0,jy1);
717             dz01             = _mm_sub_ps(iz0,jz1);
718             dx02             = _mm_sub_ps(ix0,jx2);
719             dy02             = _mm_sub_ps(iy0,jy2);
720             dz02             = _mm_sub_ps(iz0,jz2);
721             dx10             = _mm_sub_ps(ix1,jx0);
722             dy10             = _mm_sub_ps(iy1,jy0);
723             dz10             = _mm_sub_ps(iz1,jz0);
724             dx11             = _mm_sub_ps(ix1,jx1);
725             dy11             = _mm_sub_ps(iy1,jy1);
726             dz11             = _mm_sub_ps(iz1,jz1);
727             dx12             = _mm_sub_ps(ix1,jx2);
728             dy12             = _mm_sub_ps(iy1,jy2);
729             dz12             = _mm_sub_ps(iz1,jz2);
730             dx20             = _mm_sub_ps(ix2,jx0);
731             dy20             = _mm_sub_ps(iy2,jy0);
732             dz20             = _mm_sub_ps(iz2,jz0);
733             dx21             = _mm_sub_ps(ix2,jx1);
734             dy21             = _mm_sub_ps(iy2,jy1);
735             dz21             = _mm_sub_ps(iz2,jz1);
736             dx22             = _mm_sub_ps(ix2,jx2);
737             dy22             = _mm_sub_ps(iy2,jy2);
738             dz22             = _mm_sub_ps(iz2,jz2);
739
740             /* Calculate squared distance and things based on it */
741             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
742             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
743             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
744             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
745             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
746             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
747             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
748             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
749             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
750
751             rinv00           = gmx_mm_invsqrt_ps(rsq00);
752             rinv01           = gmx_mm_invsqrt_ps(rsq01);
753             rinv02           = gmx_mm_invsqrt_ps(rsq02);
754             rinv10           = gmx_mm_invsqrt_ps(rsq10);
755             rinv11           = gmx_mm_invsqrt_ps(rsq11);
756             rinv12           = gmx_mm_invsqrt_ps(rsq12);
757             rinv20           = gmx_mm_invsqrt_ps(rsq20);
758             rinv21           = gmx_mm_invsqrt_ps(rsq21);
759             rinv22           = gmx_mm_invsqrt_ps(rsq22);
760
761             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
762             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
763             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
764             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
765             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
766             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
767             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
768             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
769             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
770
771             fjx0             = _mm_setzero_ps();
772             fjy0             = _mm_setzero_ps();
773             fjz0             = _mm_setzero_ps();
774             fjx1             = _mm_setzero_ps();
775             fjy1             = _mm_setzero_ps();
776             fjz1             = _mm_setzero_ps();
777             fjx2             = _mm_setzero_ps();
778             fjy2             = _mm_setzero_ps();
779             fjz2             = _mm_setzero_ps();
780
781             /**************************
782              * CALCULATE INTERACTIONS *
783              **************************/
784
785             r00              = _mm_mul_ps(rsq00,rinv00);
786             r00              = _mm_andnot_ps(dummy_mask,r00);
787
788             /* EWALD ELECTROSTATICS */
789
790             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
791             ewrt             = _mm_mul_ps(r00,ewtabscale);
792             ewitab           = _mm_cvttps_epi32(ewrt);
793             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
794             ewitab           = _mm_slli_epi32(ewitab,2);
795             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
796             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
797             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
798             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
799             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
800             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
801             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
802             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
803             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
804
805             /* LENNARD-JONES DISPERSION/REPULSION */
806
807             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
808             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
809             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
810             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
811             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
812
813             /* Update potential sum for this i atom from the interaction with this j atom. */
814             velec            = _mm_andnot_ps(dummy_mask,velec);
815             velecsum         = _mm_add_ps(velecsum,velec);
816             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
817             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
818
819             fscal            = _mm_add_ps(felec,fvdw);
820
821             fscal            = _mm_andnot_ps(dummy_mask,fscal);
822
823             /* Calculate temporary vectorial force */
824             tx               = _mm_mul_ps(fscal,dx00);
825             ty               = _mm_mul_ps(fscal,dy00);
826             tz               = _mm_mul_ps(fscal,dz00);
827
828             /* Update vectorial force */
829             fix0             = _mm_add_ps(fix0,tx);
830             fiy0             = _mm_add_ps(fiy0,ty);
831             fiz0             = _mm_add_ps(fiz0,tz);
832
833             fjx0             = _mm_add_ps(fjx0,tx);
834             fjy0             = _mm_add_ps(fjy0,ty);
835             fjz0             = _mm_add_ps(fjz0,tz);
836
837             /**************************
838              * CALCULATE INTERACTIONS *
839              **************************/
840
841             r01              = _mm_mul_ps(rsq01,rinv01);
842             r01              = _mm_andnot_ps(dummy_mask,r01);
843
844             /* EWALD ELECTROSTATICS */
845
846             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
847             ewrt             = _mm_mul_ps(r01,ewtabscale);
848             ewitab           = _mm_cvttps_epi32(ewrt);
849             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
850             ewitab           = _mm_slli_epi32(ewitab,2);
851             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
852             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
853             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
854             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
855             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
856             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
857             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
858             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
859             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
860
861             /* Update potential sum for this i atom from the interaction with this j atom. */
862             velec            = _mm_andnot_ps(dummy_mask,velec);
863             velecsum         = _mm_add_ps(velecsum,velec);
864
865             fscal            = felec;
866
867             fscal            = _mm_andnot_ps(dummy_mask,fscal);
868
869             /* Calculate temporary vectorial force */
870             tx               = _mm_mul_ps(fscal,dx01);
871             ty               = _mm_mul_ps(fscal,dy01);
872             tz               = _mm_mul_ps(fscal,dz01);
873
874             /* Update vectorial force */
875             fix0             = _mm_add_ps(fix0,tx);
876             fiy0             = _mm_add_ps(fiy0,ty);
877             fiz0             = _mm_add_ps(fiz0,tz);
878
879             fjx1             = _mm_add_ps(fjx1,tx);
880             fjy1             = _mm_add_ps(fjy1,ty);
881             fjz1             = _mm_add_ps(fjz1,tz);
882
883             /**************************
884              * CALCULATE INTERACTIONS *
885              **************************/
886
887             r02              = _mm_mul_ps(rsq02,rinv02);
888             r02              = _mm_andnot_ps(dummy_mask,r02);
889
890             /* EWALD ELECTROSTATICS */
891
892             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
893             ewrt             = _mm_mul_ps(r02,ewtabscale);
894             ewitab           = _mm_cvttps_epi32(ewrt);
895             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
896             ewitab           = _mm_slli_epi32(ewitab,2);
897             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
898             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
899             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
900             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
901             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
902             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
903             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
904             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
905             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
906
907             /* Update potential sum for this i atom from the interaction with this j atom. */
908             velec            = _mm_andnot_ps(dummy_mask,velec);
909             velecsum         = _mm_add_ps(velecsum,velec);
910
911             fscal            = felec;
912
913             fscal            = _mm_andnot_ps(dummy_mask,fscal);
914
915             /* Calculate temporary vectorial force */
916             tx               = _mm_mul_ps(fscal,dx02);
917             ty               = _mm_mul_ps(fscal,dy02);
918             tz               = _mm_mul_ps(fscal,dz02);
919
920             /* Update vectorial force */
921             fix0             = _mm_add_ps(fix0,tx);
922             fiy0             = _mm_add_ps(fiy0,ty);
923             fiz0             = _mm_add_ps(fiz0,tz);
924
925             fjx2             = _mm_add_ps(fjx2,tx);
926             fjy2             = _mm_add_ps(fjy2,ty);
927             fjz2             = _mm_add_ps(fjz2,tz);
928
929             /**************************
930              * CALCULATE INTERACTIONS *
931              **************************/
932
933             r10              = _mm_mul_ps(rsq10,rinv10);
934             r10              = _mm_andnot_ps(dummy_mask,r10);
935
936             /* EWALD ELECTROSTATICS */
937
938             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
939             ewrt             = _mm_mul_ps(r10,ewtabscale);
940             ewitab           = _mm_cvttps_epi32(ewrt);
941             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
942             ewitab           = _mm_slli_epi32(ewitab,2);
943             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
944             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
945             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
946             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
947             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
948             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
949             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
950             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
951             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
952
953             /* Update potential sum for this i atom from the interaction with this j atom. */
954             velec            = _mm_andnot_ps(dummy_mask,velec);
955             velecsum         = _mm_add_ps(velecsum,velec);
956
957             fscal            = felec;
958
959             fscal            = _mm_andnot_ps(dummy_mask,fscal);
960
961             /* Calculate temporary vectorial force */
962             tx               = _mm_mul_ps(fscal,dx10);
963             ty               = _mm_mul_ps(fscal,dy10);
964             tz               = _mm_mul_ps(fscal,dz10);
965
966             /* Update vectorial force */
967             fix1             = _mm_add_ps(fix1,tx);
968             fiy1             = _mm_add_ps(fiy1,ty);
969             fiz1             = _mm_add_ps(fiz1,tz);
970
971             fjx0             = _mm_add_ps(fjx0,tx);
972             fjy0             = _mm_add_ps(fjy0,ty);
973             fjz0             = _mm_add_ps(fjz0,tz);
974
975             /**************************
976              * CALCULATE INTERACTIONS *
977              **************************/
978
979             r11              = _mm_mul_ps(rsq11,rinv11);
980             r11              = _mm_andnot_ps(dummy_mask,r11);
981
982             /* EWALD ELECTROSTATICS */
983
984             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
985             ewrt             = _mm_mul_ps(r11,ewtabscale);
986             ewitab           = _mm_cvttps_epi32(ewrt);
987             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
988             ewitab           = _mm_slli_epi32(ewitab,2);
989             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
990             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
991             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
992             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
993             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
994             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
995             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
996             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
997             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
998
999             /* Update potential sum for this i atom from the interaction with this j atom. */
1000             velec            = _mm_andnot_ps(dummy_mask,velec);
1001             velecsum         = _mm_add_ps(velecsum,velec);
1002
1003             fscal            = felec;
1004
1005             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1006
1007             /* Calculate temporary vectorial force */
1008             tx               = _mm_mul_ps(fscal,dx11);
1009             ty               = _mm_mul_ps(fscal,dy11);
1010             tz               = _mm_mul_ps(fscal,dz11);
1011
1012             /* Update vectorial force */
1013             fix1             = _mm_add_ps(fix1,tx);
1014             fiy1             = _mm_add_ps(fiy1,ty);
1015             fiz1             = _mm_add_ps(fiz1,tz);
1016
1017             fjx1             = _mm_add_ps(fjx1,tx);
1018             fjy1             = _mm_add_ps(fjy1,ty);
1019             fjz1             = _mm_add_ps(fjz1,tz);
1020
1021             /**************************
1022              * CALCULATE INTERACTIONS *
1023              **************************/
1024
1025             r12              = _mm_mul_ps(rsq12,rinv12);
1026             r12              = _mm_andnot_ps(dummy_mask,r12);
1027
1028             /* EWALD ELECTROSTATICS */
1029
1030             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1031             ewrt             = _mm_mul_ps(r12,ewtabscale);
1032             ewitab           = _mm_cvttps_epi32(ewrt);
1033             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1034             ewitab           = _mm_slli_epi32(ewitab,2);
1035             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1036             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1037             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1038             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1039             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1040             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1041             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1042             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1043             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1044
1045             /* Update potential sum for this i atom from the interaction with this j atom. */
1046             velec            = _mm_andnot_ps(dummy_mask,velec);
1047             velecsum         = _mm_add_ps(velecsum,velec);
1048
1049             fscal            = felec;
1050
1051             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1052
1053             /* Calculate temporary vectorial force */
1054             tx               = _mm_mul_ps(fscal,dx12);
1055             ty               = _mm_mul_ps(fscal,dy12);
1056             tz               = _mm_mul_ps(fscal,dz12);
1057
1058             /* Update vectorial force */
1059             fix1             = _mm_add_ps(fix1,tx);
1060             fiy1             = _mm_add_ps(fiy1,ty);
1061             fiz1             = _mm_add_ps(fiz1,tz);
1062
1063             fjx2             = _mm_add_ps(fjx2,tx);
1064             fjy2             = _mm_add_ps(fjy2,ty);
1065             fjz2             = _mm_add_ps(fjz2,tz);
1066
1067             /**************************
1068              * CALCULATE INTERACTIONS *
1069              **************************/
1070
1071             r20              = _mm_mul_ps(rsq20,rinv20);
1072             r20              = _mm_andnot_ps(dummy_mask,r20);
1073
1074             /* EWALD ELECTROSTATICS */
1075
1076             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1077             ewrt             = _mm_mul_ps(r20,ewtabscale);
1078             ewitab           = _mm_cvttps_epi32(ewrt);
1079             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1080             ewitab           = _mm_slli_epi32(ewitab,2);
1081             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1082             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1083             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1084             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1085             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1086             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1087             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1088             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1089             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1090
1091             /* Update potential sum for this i atom from the interaction with this j atom. */
1092             velec            = _mm_andnot_ps(dummy_mask,velec);
1093             velecsum         = _mm_add_ps(velecsum,velec);
1094
1095             fscal            = felec;
1096
1097             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1098
1099             /* Calculate temporary vectorial force */
1100             tx               = _mm_mul_ps(fscal,dx20);
1101             ty               = _mm_mul_ps(fscal,dy20);
1102             tz               = _mm_mul_ps(fscal,dz20);
1103
1104             /* Update vectorial force */
1105             fix2             = _mm_add_ps(fix2,tx);
1106             fiy2             = _mm_add_ps(fiy2,ty);
1107             fiz2             = _mm_add_ps(fiz2,tz);
1108
1109             fjx0             = _mm_add_ps(fjx0,tx);
1110             fjy0             = _mm_add_ps(fjy0,ty);
1111             fjz0             = _mm_add_ps(fjz0,tz);
1112
1113             /**************************
1114              * CALCULATE INTERACTIONS *
1115              **************************/
1116
1117             r21              = _mm_mul_ps(rsq21,rinv21);
1118             r21              = _mm_andnot_ps(dummy_mask,r21);
1119
1120             /* EWALD ELECTROSTATICS */
1121
1122             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1123             ewrt             = _mm_mul_ps(r21,ewtabscale);
1124             ewitab           = _mm_cvttps_epi32(ewrt);
1125             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1126             ewitab           = _mm_slli_epi32(ewitab,2);
1127             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1128             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1129             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1130             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1131             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1132             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1133             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1134             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1135             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1136
1137             /* Update potential sum for this i atom from the interaction with this j atom. */
1138             velec            = _mm_andnot_ps(dummy_mask,velec);
1139             velecsum         = _mm_add_ps(velecsum,velec);
1140
1141             fscal            = felec;
1142
1143             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1144
1145             /* Calculate temporary vectorial force */
1146             tx               = _mm_mul_ps(fscal,dx21);
1147             ty               = _mm_mul_ps(fscal,dy21);
1148             tz               = _mm_mul_ps(fscal,dz21);
1149
1150             /* Update vectorial force */
1151             fix2             = _mm_add_ps(fix2,tx);
1152             fiy2             = _mm_add_ps(fiy2,ty);
1153             fiz2             = _mm_add_ps(fiz2,tz);
1154
1155             fjx1             = _mm_add_ps(fjx1,tx);
1156             fjy1             = _mm_add_ps(fjy1,ty);
1157             fjz1             = _mm_add_ps(fjz1,tz);
1158
1159             /**************************
1160              * CALCULATE INTERACTIONS *
1161              **************************/
1162
1163             r22              = _mm_mul_ps(rsq22,rinv22);
1164             r22              = _mm_andnot_ps(dummy_mask,r22);
1165
1166             /* EWALD ELECTROSTATICS */
1167
1168             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1169             ewrt             = _mm_mul_ps(r22,ewtabscale);
1170             ewitab           = _mm_cvttps_epi32(ewrt);
1171             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1172             ewitab           = _mm_slli_epi32(ewitab,2);
1173             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1174             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1175             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1176             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1177             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1178             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1179             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1180             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1181             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1182
1183             /* Update potential sum for this i atom from the interaction with this j atom. */
1184             velec            = _mm_andnot_ps(dummy_mask,velec);
1185             velecsum         = _mm_add_ps(velecsum,velec);
1186
1187             fscal            = felec;
1188
1189             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1190
1191             /* Calculate temporary vectorial force */
1192             tx               = _mm_mul_ps(fscal,dx22);
1193             ty               = _mm_mul_ps(fscal,dy22);
1194             tz               = _mm_mul_ps(fscal,dz22);
1195
1196             /* Update vectorial force */
1197             fix2             = _mm_add_ps(fix2,tx);
1198             fiy2             = _mm_add_ps(fiy2,ty);
1199             fiz2             = _mm_add_ps(fiz2,tz);
1200
1201             fjx2             = _mm_add_ps(fjx2,tx);
1202             fjy2             = _mm_add_ps(fjy2,ty);
1203             fjz2             = _mm_add_ps(fjz2,tz);
1204
1205             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1206             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1207             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1208             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1209
1210             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1211                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1212
1213             /* Inner loop uses 390 flops */
1214         }
1215
1216         /* End of innermost loop */
1217
1218         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1219                                               f+i_coord_offset,fshift+i_shift_offset);
1220
1221         ggid                        = gid[iidx];
1222         /* Update potential energies */
1223         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1224         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1225
1226         /* Increment number of inner iterations */
1227         inneriter                  += j_index_end - j_index_start;
1228
1229         /* Outer loop uses 20 flops */
1230     }
1231
1232     /* Increment number of outer iterations */
1233     outeriter        += nri;
1234
1235     /* Update outer/inner flops */
1236
1237     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*390);
1238 }
1239 /*
1240  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse4_1_single
1241  * Electrostatics interaction: Ewald
1242  * VdW interaction:            LennardJones
1243  * Geometry:                   Water3-Water3
1244  * Calculate force/pot:        Force
1245  */
1246 void
1247 nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse4_1_single
1248                     (t_nblist * gmx_restrict                nlist,
1249                      rvec * gmx_restrict                    xx,
1250                      rvec * gmx_restrict                    ff,
1251                      t_forcerec * gmx_restrict              fr,
1252                      t_mdatoms * gmx_restrict               mdatoms,
1253                      nb_kernel_data_t * gmx_restrict        kernel_data,
1254                      t_nrnb * gmx_restrict                  nrnb)
1255 {
1256     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1257      * just 0 for non-waters.
1258      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1259      * jnr indices corresponding to data put in the four positions in the SIMD register.
1260      */
1261     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1262     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1263     int              jnrA,jnrB,jnrC,jnrD;
1264     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1265     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1266     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1267     real             rcutoff_scalar;
1268     real             *shiftvec,*fshift,*x,*f;
1269     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1270     real             scratch[4*DIM];
1271     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1272     int              vdwioffset0;
1273     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1274     int              vdwioffset1;
1275     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1276     int              vdwioffset2;
1277     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1278     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1279     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1280     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1281     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1282     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1283     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1284     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1285     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1286     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1287     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1288     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1289     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1290     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1291     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1292     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1293     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1294     real             *charge;
1295     int              nvdwtype;
1296     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1297     int              *vdwtype;
1298     real             *vdwparam;
1299     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1300     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1301     __m128i          ewitab;
1302     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1303     real             *ewtab;
1304     __m128           dummy_mask,cutoff_mask;
1305     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1306     __m128           one     = _mm_set1_ps(1.0);
1307     __m128           two     = _mm_set1_ps(2.0);
1308     x                = xx[0];
1309     f                = ff[0];
1310
1311     nri              = nlist->nri;
1312     iinr             = nlist->iinr;
1313     jindex           = nlist->jindex;
1314     jjnr             = nlist->jjnr;
1315     shiftidx         = nlist->shift;
1316     gid              = nlist->gid;
1317     shiftvec         = fr->shift_vec[0];
1318     fshift           = fr->fshift[0];
1319     facel            = _mm_set1_ps(fr->epsfac);
1320     charge           = mdatoms->chargeA;
1321     nvdwtype         = fr->ntype;
1322     vdwparam         = fr->nbfp;
1323     vdwtype          = mdatoms->typeA;
1324
1325     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1326     ewtab            = fr->ic->tabq_coul_F;
1327     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1328     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1329
1330     /* Setup water-specific parameters */
1331     inr              = nlist->iinr[0];
1332     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1333     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1334     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1335     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1336
1337     jq0              = _mm_set1_ps(charge[inr+0]);
1338     jq1              = _mm_set1_ps(charge[inr+1]);
1339     jq2              = _mm_set1_ps(charge[inr+2]);
1340     vdwjidx0A        = 2*vdwtype[inr+0];
1341     qq00             = _mm_mul_ps(iq0,jq0);
1342     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1343     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1344     qq01             = _mm_mul_ps(iq0,jq1);
1345     qq02             = _mm_mul_ps(iq0,jq2);
1346     qq10             = _mm_mul_ps(iq1,jq0);
1347     qq11             = _mm_mul_ps(iq1,jq1);
1348     qq12             = _mm_mul_ps(iq1,jq2);
1349     qq20             = _mm_mul_ps(iq2,jq0);
1350     qq21             = _mm_mul_ps(iq2,jq1);
1351     qq22             = _mm_mul_ps(iq2,jq2);
1352
1353     /* Avoid stupid compiler warnings */
1354     jnrA = jnrB = jnrC = jnrD = 0;
1355     j_coord_offsetA = 0;
1356     j_coord_offsetB = 0;
1357     j_coord_offsetC = 0;
1358     j_coord_offsetD = 0;
1359
1360     outeriter        = 0;
1361     inneriter        = 0;
1362
1363     for(iidx=0;iidx<4*DIM;iidx++)
1364     {
1365         scratch[iidx] = 0.0;
1366     }
1367
1368     /* Start outer loop over neighborlists */
1369     for(iidx=0; iidx<nri; iidx++)
1370     {
1371         /* Load shift vector for this list */
1372         i_shift_offset   = DIM*shiftidx[iidx];
1373
1374         /* Load limits for loop over neighbors */
1375         j_index_start    = jindex[iidx];
1376         j_index_end      = jindex[iidx+1];
1377
1378         /* Get outer coordinate index */
1379         inr              = iinr[iidx];
1380         i_coord_offset   = DIM*inr;
1381
1382         /* Load i particle coords and add shift vector */
1383         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1384                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1385
1386         fix0             = _mm_setzero_ps();
1387         fiy0             = _mm_setzero_ps();
1388         fiz0             = _mm_setzero_ps();
1389         fix1             = _mm_setzero_ps();
1390         fiy1             = _mm_setzero_ps();
1391         fiz1             = _mm_setzero_ps();
1392         fix2             = _mm_setzero_ps();
1393         fiy2             = _mm_setzero_ps();
1394         fiz2             = _mm_setzero_ps();
1395
1396         /* Start inner kernel loop */
1397         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1398         {
1399
1400             /* Get j neighbor index, and coordinate index */
1401             jnrA             = jjnr[jidx];
1402             jnrB             = jjnr[jidx+1];
1403             jnrC             = jjnr[jidx+2];
1404             jnrD             = jjnr[jidx+3];
1405             j_coord_offsetA  = DIM*jnrA;
1406             j_coord_offsetB  = DIM*jnrB;
1407             j_coord_offsetC  = DIM*jnrC;
1408             j_coord_offsetD  = DIM*jnrD;
1409
1410             /* load j atom coordinates */
1411             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1412                                               x+j_coord_offsetC,x+j_coord_offsetD,
1413                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1414
1415             /* Calculate displacement vector */
1416             dx00             = _mm_sub_ps(ix0,jx0);
1417             dy00             = _mm_sub_ps(iy0,jy0);
1418             dz00             = _mm_sub_ps(iz0,jz0);
1419             dx01             = _mm_sub_ps(ix0,jx1);
1420             dy01             = _mm_sub_ps(iy0,jy1);
1421             dz01             = _mm_sub_ps(iz0,jz1);
1422             dx02             = _mm_sub_ps(ix0,jx2);
1423             dy02             = _mm_sub_ps(iy0,jy2);
1424             dz02             = _mm_sub_ps(iz0,jz2);
1425             dx10             = _mm_sub_ps(ix1,jx0);
1426             dy10             = _mm_sub_ps(iy1,jy0);
1427             dz10             = _mm_sub_ps(iz1,jz0);
1428             dx11             = _mm_sub_ps(ix1,jx1);
1429             dy11             = _mm_sub_ps(iy1,jy1);
1430             dz11             = _mm_sub_ps(iz1,jz1);
1431             dx12             = _mm_sub_ps(ix1,jx2);
1432             dy12             = _mm_sub_ps(iy1,jy2);
1433             dz12             = _mm_sub_ps(iz1,jz2);
1434             dx20             = _mm_sub_ps(ix2,jx0);
1435             dy20             = _mm_sub_ps(iy2,jy0);
1436             dz20             = _mm_sub_ps(iz2,jz0);
1437             dx21             = _mm_sub_ps(ix2,jx1);
1438             dy21             = _mm_sub_ps(iy2,jy1);
1439             dz21             = _mm_sub_ps(iz2,jz1);
1440             dx22             = _mm_sub_ps(ix2,jx2);
1441             dy22             = _mm_sub_ps(iy2,jy2);
1442             dz22             = _mm_sub_ps(iz2,jz2);
1443
1444             /* Calculate squared distance and things based on it */
1445             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1446             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1447             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1448             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1449             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1450             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1451             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1452             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1453             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1454
1455             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1456             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1457             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1458             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1459             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1460             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1461             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1462             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1463             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1464
1465             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1466             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1467             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1468             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1469             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1470             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1471             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1472             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1473             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1474
1475             fjx0             = _mm_setzero_ps();
1476             fjy0             = _mm_setzero_ps();
1477             fjz0             = _mm_setzero_ps();
1478             fjx1             = _mm_setzero_ps();
1479             fjy1             = _mm_setzero_ps();
1480             fjz1             = _mm_setzero_ps();
1481             fjx2             = _mm_setzero_ps();
1482             fjy2             = _mm_setzero_ps();
1483             fjz2             = _mm_setzero_ps();
1484
1485             /**************************
1486              * CALCULATE INTERACTIONS *
1487              **************************/
1488
1489             r00              = _mm_mul_ps(rsq00,rinv00);
1490
1491             /* EWALD ELECTROSTATICS */
1492
1493             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1494             ewrt             = _mm_mul_ps(r00,ewtabscale);
1495             ewitab           = _mm_cvttps_epi32(ewrt);
1496             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1497             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1498                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1499                                          &ewtabF,&ewtabFn);
1500             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1501             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1502
1503             /* LENNARD-JONES DISPERSION/REPULSION */
1504
1505             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1506             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1507
1508             fscal            = _mm_add_ps(felec,fvdw);
1509
1510             /* Calculate temporary vectorial force */
1511             tx               = _mm_mul_ps(fscal,dx00);
1512             ty               = _mm_mul_ps(fscal,dy00);
1513             tz               = _mm_mul_ps(fscal,dz00);
1514
1515             /* Update vectorial force */
1516             fix0             = _mm_add_ps(fix0,tx);
1517             fiy0             = _mm_add_ps(fiy0,ty);
1518             fiz0             = _mm_add_ps(fiz0,tz);
1519
1520             fjx0             = _mm_add_ps(fjx0,tx);
1521             fjy0             = _mm_add_ps(fjy0,ty);
1522             fjz0             = _mm_add_ps(fjz0,tz);
1523
1524             /**************************
1525              * CALCULATE INTERACTIONS *
1526              **************************/
1527
1528             r01              = _mm_mul_ps(rsq01,rinv01);
1529
1530             /* EWALD ELECTROSTATICS */
1531
1532             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1533             ewrt             = _mm_mul_ps(r01,ewtabscale);
1534             ewitab           = _mm_cvttps_epi32(ewrt);
1535             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1536             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1537                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1538                                          &ewtabF,&ewtabFn);
1539             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1540             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1541
1542             fscal            = felec;
1543
1544             /* Calculate temporary vectorial force */
1545             tx               = _mm_mul_ps(fscal,dx01);
1546             ty               = _mm_mul_ps(fscal,dy01);
1547             tz               = _mm_mul_ps(fscal,dz01);
1548
1549             /* Update vectorial force */
1550             fix0             = _mm_add_ps(fix0,tx);
1551             fiy0             = _mm_add_ps(fiy0,ty);
1552             fiz0             = _mm_add_ps(fiz0,tz);
1553
1554             fjx1             = _mm_add_ps(fjx1,tx);
1555             fjy1             = _mm_add_ps(fjy1,ty);
1556             fjz1             = _mm_add_ps(fjz1,tz);
1557
1558             /**************************
1559              * CALCULATE INTERACTIONS *
1560              **************************/
1561
1562             r02              = _mm_mul_ps(rsq02,rinv02);
1563
1564             /* EWALD ELECTROSTATICS */
1565
1566             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1567             ewrt             = _mm_mul_ps(r02,ewtabscale);
1568             ewitab           = _mm_cvttps_epi32(ewrt);
1569             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1570             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1571                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1572                                          &ewtabF,&ewtabFn);
1573             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1574             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1575
1576             fscal            = felec;
1577
1578             /* Calculate temporary vectorial force */
1579             tx               = _mm_mul_ps(fscal,dx02);
1580             ty               = _mm_mul_ps(fscal,dy02);
1581             tz               = _mm_mul_ps(fscal,dz02);
1582
1583             /* Update vectorial force */
1584             fix0             = _mm_add_ps(fix0,tx);
1585             fiy0             = _mm_add_ps(fiy0,ty);
1586             fiz0             = _mm_add_ps(fiz0,tz);
1587
1588             fjx2             = _mm_add_ps(fjx2,tx);
1589             fjy2             = _mm_add_ps(fjy2,ty);
1590             fjz2             = _mm_add_ps(fjz2,tz);
1591
1592             /**************************
1593              * CALCULATE INTERACTIONS *
1594              **************************/
1595
1596             r10              = _mm_mul_ps(rsq10,rinv10);
1597
1598             /* EWALD ELECTROSTATICS */
1599
1600             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1601             ewrt             = _mm_mul_ps(r10,ewtabscale);
1602             ewitab           = _mm_cvttps_epi32(ewrt);
1603             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1604             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1605                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1606                                          &ewtabF,&ewtabFn);
1607             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1608             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1609
1610             fscal            = felec;
1611
1612             /* Calculate temporary vectorial force */
1613             tx               = _mm_mul_ps(fscal,dx10);
1614             ty               = _mm_mul_ps(fscal,dy10);
1615             tz               = _mm_mul_ps(fscal,dz10);
1616
1617             /* Update vectorial force */
1618             fix1             = _mm_add_ps(fix1,tx);
1619             fiy1             = _mm_add_ps(fiy1,ty);
1620             fiz1             = _mm_add_ps(fiz1,tz);
1621
1622             fjx0             = _mm_add_ps(fjx0,tx);
1623             fjy0             = _mm_add_ps(fjy0,ty);
1624             fjz0             = _mm_add_ps(fjz0,tz);
1625
1626             /**************************
1627              * CALCULATE INTERACTIONS *
1628              **************************/
1629
1630             r11              = _mm_mul_ps(rsq11,rinv11);
1631
1632             /* EWALD ELECTROSTATICS */
1633
1634             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1635             ewrt             = _mm_mul_ps(r11,ewtabscale);
1636             ewitab           = _mm_cvttps_epi32(ewrt);
1637             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1638             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1639                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1640                                          &ewtabF,&ewtabFn);
1641             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1642             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1643
1644             fscal            = felec;
1645
1646             /* Calculate temporary vectorial force */
1647             tx               = _mm_mul_ps(fscal,dx11);
1648             ty               = _mm_mul_ps(fscal,dy11);
1649             tz               = _mm_mul_ps(fscal,dz11);
1650
1651             /* Update vectorial force */
1652             fix1             = _mm_add_ps(fix1,tx);
1653             fiy1             = _mm_add_ps(fiy1,ty);
1654             fiz1             = _mm_add_ps(fiz1,tz);
1655
1656             fjx1             = _mm_add_ps(fjx1,tx);
1657             fjy1             = _mm_add_ps(fjy1,ty);
1658             fjz1             = _mm_add_ps(fjz1,tz);
1659
1660             /**************************
1661              * CALCULATE INTERACTIONS *
1662              **************************/
1663
1664             r12              = _mm_mul_ps(rsq12,rinv12);
1665
1666             /* EWALD ELECTROSTATICS */
1667
1668             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1669             ewrt             = _mm_mul_ps(r12,ewtabscale);
1670             ewitab           = _mm_cvttps_epi32(ewrt);
1671             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1672             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1673                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1674                                          &ewtabF,&ewtabFn);
1675             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1676             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1677
1678             fscal            = felec;
1679
1680             /* Calculate temporary vectorial force */
1681             tx               = _mm_mul_ps(fscal,dx12);
1682             ty               = _mm_mul_ps(fscal,dy12);
1683             tz               = _mm_mul_ps(fscal,dz12);
1684
1685             /* Update vectorial force */
1686             fix1             = _mm_add_ps(fix1,tx);
1687             fiy1             = _mm_add_ps(fiy1,ty);
1688             fiz1             = _mm_add_ps(fiz1,tz);
1689
1690             fjx2             = _mm_add_ps(fjx2,tx);
1691             fjy2             = _mm_add_ps(fjy2,ty);
1692             fjz2             = _mm_add_ps(fjz2,tz);
1693
1694             /**************************
1695              * CALCULATE INTERACTIONS *
1696              **************************/
1697
1698             r20              = _mm_mul_ps(rsq20,rinv20);
1699
1700             /* EWALD ELECTROSTATICS */
1701
1702             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1703             ewrt             = _mm_mul_ps(r20,ewtabscale);
1704             ewitab           = _mm_cvttps_epi32(ewrt);
1705             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1706             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1707                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1708                                          &ewtabF,&ewtabFn);
1709             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1710             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1711
1712             fscal            = felec;
1713
1714             /* Calculate temporary vectorial force */
1715             tx               = _mm_mul_ps(fscal,dx20);
1716             ty               = _mm_mul_ps(fscal,dy20);
1717             tz               = _mm_mul_ps(fscal,dz20);
1718
1719             /* Update vectorial force */
1720             fix2             = _mm_add_ps(fix2,tx);
1721             fiy2             = _mm_add_ps(fiy2,ty);
1722             fiz2             = _mm_add_ps(fiz2,tz);
1723
1724             fjx0             = _mm_add_ps(fjx0,tx);
1725             fjy0             = _mm_add_ps(fjy0,ty);
1726             fjz0             = _mm_add_ps(fjz0,tz);
1727
1728             /**************************
1729              * CALCULATE INTERACTIONS *
1730              **************************/
1731
1732             r21              = _mm_mul_ps(rsq21,rinv21);
1733
1734             /* EWALD ELECTROSTATICS */
1735
1736             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1737             ewrt             = _mm_mul_ps(r21,ewtabscale);
1738             ewitab           = _mm_cvttps_epi32(ewrt);
1739             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1740             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1741                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1742                                          &ewtabF,&ewtabFn);
1743             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1744             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1745
1746             fscal            = felec;
1747
1748             /* Calculate temporary vectorial force */
1749             tx               = _mm_mul_ps(fscal,dx21);
1750             ty               = _mm_mul_ps(fscal,dy21);
1751             tz               = _mm_mul_ps(fscal,dz21);
1752
1753             /* Update vectorial force */
1754             fix2             = _mm_add_ps(fix2,tx);
1755             fiy2             = _mm_add_ps(fiy2,ty);
1756             fiz2             = _mm_add_ps(fiz2,tz);
1757
1758             fjx1             = _mm_add_ps(fjx1,tx);
1759             fjy1             = _mm_add_ps(fjy1,ty);
1760             fjz1             = _mm_add_ps(fjz1,tz);
1761
1762             /**************************
1763              * CALCULATE INTERACTIONS *
1764              **************************/
1765
1766             r22              = _mm_mul_ps(rsq22,rinv22);
1767
1768             /* EWALD ELECTROSTATICS */
1769
1770             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1771             ewrt             = _mm_mul_ps(r22,ewtabscale);
1772             ewitab           = _mm_cvttps_epi32(ewrt);
1773             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1774             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1775                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1776                                          &ewtabF,&ewtabFn);
1777             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1778             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1779
1780             fscal            = felec;
1781
1782             /* Calculate temporary vectorial force */
1783             tx               = _mm_mul_ps(fscal,dx22);
1784             ty               = _mm_mul_ps(fscal,dy22);
1785             tz               = _mm_mul_ps(fscal,dz22);
1786
1787             /* Update vectorial force */
1788             fix2             = _mm_add_ps(fix2,tx);
1789             fiy2             = _mm_add_ps(fiy2,ty);
1790             fiz2             = _mm_add_ps(fiz2,tz);
1791
1792             fjx2             = _mm_add_ps(fjx2,tx);
1793             fjy2             = _mm_add_ps(fjy2,ty);
1794             fjz2             = _mm_add_ps(fjz2,tz);
1795
1796             fjptrA             = f+j_coord_offsetA;
1797             fjptrB             = f+j_coord_offsetB;
1798             fjptrC             = f+j_coord_offsetC;
1799             fjptrD             = f+j_coord_offsetD;
1800
1801             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1802                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1803
1804             /* Inner loop uses 331 flops */
1805         }
1806
1807         if(jidx<j_index_end)
1808         {
1809
1810             /* Get j neighbor index, and coordinate index */
1811             jnrlistA         = jjnr[jidx];
1812             jnrlistB         = jjnr[jidx+1];
1813             jnrlistC         = jjnr[jidx+2];
1814             jnrlistD         = jjnr[jidx+3];
1815             /* Sign of each element will be negative for non-real atoms.
1816              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1817              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1818              */
1819             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1820             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1821             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1822             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1823             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1824             j_coord_offsetA  = DIM*jnrA;
1825             j_coord_offsetB  = DIM*jnrB;
1826             j_coord_offsetC  = DIM*jnrC;
1827             j_coord_offsetD  = DIM*jnrD;
1828
1829             /* load j atom coordinates */
1830             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1831                                               x+j_coord_offsetC,x+j_coord_offsetD,
1832                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1833
1834             /* Calculate displacement vector */
1835             dx00             = _mm_sub_ps(ix0,jx0);
1836             dy00             = _mm_sub_ps(iy0,jy0);
1837             dz00             = _mm_sub_ps(iz0,jz0);
1838             dx01             = _mm_sub_ps(ix0,jx1);
1839             dy01             = _mm_sub_ps(iy0,jy1);
1840             dz01             = _mm_sub_ps(iz0,jz1);
1841             dx02             = _mm_sub_ps(ix0,jx2);
1842             dy02             = _mm_sub_ps(iy0,jy2);
1843             dz02             = _mm_sub_ps(iz0,jz2);
1844             dx10             = _mm_sub_ps(ix1,jx0);
1845             dy10             = _mm_sub_ps(iy1,jy0);
1846             dz10             = _mm_sub_ps(iz1,jz0);
1847             dx11             = _mm_sub_ps(ix1,jx1);
1848             dy11             = _mm_sub_ps(iy1,jy1);
1849             dz11             = _mm_sub_ps(iz1,jz1);
1850             dx12             = _mm_sub_ps(ix1,jx2);
1851             dy12             = _mm_sub_ps(iy1,jy2);
1852             dz12             = _mm_sub_ps(iz1,jz2);
1853             dx20             = _mm_sub_ps(ix2,jx0);
1854             dy20             = _mm_sub_ps(iy2,jy0);
1855             dz20             = _mm_sub_ps(iz2,jz0);
1856             dx21             = _mm_sub_ps(ix2,jx1);
1857             dy21             = _mm_sub_ps(iy2,jy1);
1858             dz21             = _mm_sub_ps(iz2,jz1);
1859             dx22             = _mm_sub_ps(ix2,jx2);
1860             dy22             = _mm_sub_ps(iy2,jy2);
1861             dz22             = _mm_sub_ps(iz2,jz2);
1862
1863             /* Calculate squared distance and things based on it */
1864             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1865             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1866             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1867             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1868             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1869             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1870             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1871             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1872             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1873
1874             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1875             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1876             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1877             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1878             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1879             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1880             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1881             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1882             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1883
1884             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1885             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1886             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1887             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1888             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1889             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1890             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1891             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1892             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1893
1894             fjx0             = _mm_setzero_ps();
1895             fjy0             = _mm_setzero_ps();
1896             fjz0             = _mm_setzero_ps();
1897             fjx1             = _mm_setzero_ps();
1898             fjy1             = _mm_setzero_ps();
1899             fjz1             = _mm_setzero_ps();
1900             fjx2             = _mm_setzero_ps();
1901             fjy2             = _mm_setzero_ps();
1902             fjz2             = _mm_setzero_ps();
1903
1904             /**************************
1905              * CALCULATE INTERACTIONS *
1906              **************************/
1907
1908             r00              = _mm_mul_ps(rsq00,rinv00);
1909             r00              = _mm_andnot_ps(dummy_mask,r00);
1910
1911             /* EWALD ELECTROSTATICS */
1912
1913             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1914             ewrt             = _mm_mul_ps(r00,ewtabscale);
1915             ewitab           = _mm_cvttps_epi32(ewrt);
1916             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1917             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1918                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1919                                          &ewtabF,&ewtabFn);
1920             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1921             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1922
1923             /* LENNARD-JONES DISPERSION/REPULSION */
1924
1925             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1926             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1927
1928             fscal            = _mm_add_ps(felec,fvdw);
1929
1930             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1931
1932             /* Calculate temporary vectorial force */
1933             tx               = _mm_mul_ps(fscal,dx00);
1934             ty               = _mm_mul_ps(fscal,dy00);
1935             tz               = _mm_mul_ps(fscal,dz00);
1936
1937             /* Update vectorial force */
1938             fix0             = _mm_add_ps(fix0,tx);
1939             fiy0             = _mm_add_ps(fiy0,ty);
1940             fiz0             = _mm_add_ps(fiz0,tz);
1941
1942             fjx0             = _mm_add_ps(fjx0,tx);
1943             fjy0             = _mm_add_ps(fjy0,ty);
1944             fjz0             = _mm_add_ps(fjz0,tz);
1945
1946             /**************************
1947              * CALCULATE INTERACTIONS *
1948              **************************/
1949
1950             r01              = _mm_mul_ps(rsq01,rinv01);
1951             r01              = _mm_andnot_ps(dummy_mask,r01);
1952
1953             /* EWALD ELECTROSTATICS */
1954
1955             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1956             ewrt             = _mm_mul_ps(r01,ewtabscale);
1957             ewitab           = _mm_cvttps_epi32(ewrt);
1958             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1959             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1960                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1961                                          &ewtabF,&ewtabFn);
1962             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1963             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1964
1965             fscal            = felec;
1966
1967             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1968
1969             /* Calculate temporary vectorial force */
1970             tx               = _mm_mul_ps(fscal,dx01);
1971             ty               = _mm_mul_ps(fscal,dy01);
1972             tz               = _mm_mul_ps(fscal,dz01);
1973
1974             /* Update vectorial force */
1975             fix0             = _mm_add_ps(fix0,tx);
1976             fiy0             = _mm_add_ps(fiy0,ty);
1977             fiz0             = _mm_add_ps(fiz0,tz);
1978
1979             fjx1             = _mm_add_ps(fjx1,tx);
1980             fjy1             = _mm_add_ps(fjy1,ty);
1981             fjz1             = _mm_add_ps(fjz1,tz);
1982
1983             /**************************
1984              * CALCULATE INTERACTIONS *
1985              **************************/
1986
1987             r02              = _mm_mul_ps(rsq02,rinv02);
1988             r02              = _mm_andnot_ps(dummy_mask,r02);
1989
1990             /* EWALD ELECTROSTATICS */
1991
1992             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1993             ewrt             = _mm_mul_ps(r02,ewtabscale);
1994             ewitab           = _mm_cvttps_epi32(ewrt);
1995             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1996             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1997                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1998                                          &ewtabF,&ewtabFn);
1999             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2000             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2001
2002             fscal            = felec;
2003
2004             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2005
2006             /* Calculate temporary vectorial force */
2007             tx               = _mm_mul_ps(fscal,dx02);
2008             ty               = _mm_mul_ps(fscal,dy02);
2009             tz               = _mm_mul_ps(fscal,dz02);
2010
2011             /* Update vectorial force */
2012             fix0             = _mm_add_ps(fix0,tx);
2013             fiy0             = _mm_add_ps(fiy0,ty);
2014             fiz0             = _mm_add_ps(fiz0,tz);
2015
2016             fjx2             = _mm_add_ps(fjx2,tx);
2017             fjy2             = _mm_add_ps(fjy2,ty);
2018             fjz2             = _mm_add_ps(fjz2,tz);
2019
2020             /**************************
2021              * CALCULATE INTERACTIONS *
2022              **************************/
2023
2024             r10              = _mm_mul_ps(rsq10,rinv10);
2025             r10              = _mm_andnot_ps(dummy_mask,r10);
2026
2027             /* EWALD ELECTROSTATICS */
2028
2029             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2030             ewrt             = _mm_mul_ps(r10,ewtabscale);
2031             ewitab           = _mm_cvttps_epi32(ewrt);
2032             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2033             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2034                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2035                                          &ewtabF,&ewtabFn);
2036             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2037             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2038
2039             fscal            = felec;
2040
2041             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2042
2043             /* Calculate temporary vectorial force */
2044             tx               = _mm_mul_ps(fscal,dx10);
2045             ty               = _mm_mul_ps(fscal,dy10);
2046             tz               = _mm_mul_ps(fscal,dz10);
2047
2048             /* Update vectorial force */
2049             fix1             = _mm_add_ps(fix1,tx);
2050             fiy1             = _mm_add_ps(fiy1,ty);
2051             fiz1             = _mm_add_ps(fiz1,tz);
2052
2053             fjx0             = _mm_add_ps(fjx0,tx);
2054             fjy0             = _mm_add_ps(fjy0,ty);
2055             fjz0             = _mm_add_ps(fjz0,tz);
2056
2057             /**************************
2058              * CALCULATE INTERACTIONS *
2059              **************************/
2060
2061             r11              = _mm_mul_ps(rsq11,rinv11);
2062             r11              = _mm_andnot_ps(dummy_mask,r11);
2063
2064             /* EWALD ELECTROSTATICS */
2065
2066             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2067             ewrt             = _mm_mul_ps(r11,ewtabscale);
2068             ewitab           = _mm_cvttps_epi32(ewrt);
2069             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2070             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2071                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2072                                          &ewtabF,&ewtabFn);
2073             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2074             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2075
2076             fscal            = felec;
2077
2078             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2079
2080             /* Calculate temporary vectorial force */
2081             tx               = _mm_mul_ps(fscal,dx11);
2082             ty               = _mm_mul_ps(fscal,dy11);
2083             tz               = _mm_mul_ps(fscal,dz11);
2084
2085             /* Update vectorial force */
2086             fix1             = _mm_add_ps(fix1,tx);
2087             fiy1             = _mm_add_ps(fiy1,ty);
2088             fiz1             = _mm_add_ps(fiz1,tz);
2089
2090             fjx1             = _mm_add_ps(fjx1,tx);
2091             fjy1             = _mm_add_ps(fjy1,ty);
2092             fjz1             = _mm_add_ps(fjz1,tz);
2093
2094             /**************************
2095              * CALCULATE INTERACTIONS *
2096              **************************/
2097
2098             r12              = _mm_mul_ps(rsq12,rinv12);
2099             r12              = _mm_andnot_ps(dummy_mask,r12);
2100
2101             /* EWALD ELECTROSTATICS */
2102
2103             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2104             ewrt             = _mm_mul_ps(r12,ewtabscale);
2105             ewitab           = _mm_cvttps_epi32(ewrt);
2106             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2107             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2108                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2109                                          &ewtabF,&ewtabFn);
2110             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2111             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2112
2113             fscal            = felec;
2114
2115             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2116
2117             /* Calculate temporary vectorial force */
2118             tx               = _mm_mul_ps(fscal,dx12);
2119             ty               = _mm_mul_ps(fscal,dy12);
2120             tz               = _mm_mul_ps(fscal,dz12);
2121
2122             /* Update vectorial force */
2123             fix1             = _mm_add_ps(fix1,tx);
2124             fiy1             = _mm_add_ps(fiy1,ty);
2125             fiz1             = _mm_add_ps(fiz1,tz);
2126
2127             fjx2             = _mm_add_ps(fjx2,tx);
2128             fjy2             = _mm_add_ps(fjy2,ty);
2129             fjz2             = _mm_add_ps(fjz2,tz);
2130
2131             /**************************
2132              * CALCULATE INTERACTIONS *
2133              **************************/
2134
2135             r20              = _mm_mul_ps(rsq20,rinv20);
2136             r20              = _mm_andnot_ps(dummy_mask,r20);
2137
2138             /* EWALD ELECTROSTATICS */
2139
2140             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2141             ewrt             = _mm_mul_ps(r20,ewtabscale);
2142             ewitab           = _mm_cvttps_epi32(ewrt);
2143             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2144             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2145                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2146                                          &ewtabF,&ewtabFn);
2147             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2148             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2149
2150             fscal            = felec;
2151
2152             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2153
2154             /* Calculate temporary vectorial force */
2155             tx               = _mm_mul_ps(fscal,dx20);
2156             ty               = _mm_mul_ps(fscal,dy20);
2157             tz               = _mm_mul_ps(fscal,dz20);
2158
2159             /* Update vectorial force */
2160             fix2             = _mm_add_ps(fix2,tx);
2161             fiy2             = _mm_add_ps(fiy2,ty);
2162             fiz2             = _mm_add_ps(fiz2,tz);
2163
2164             fjx0             = _mm_add_ps(fjx0,tx);
2165             fjy0             = _mm_add_ps(fjy0,ty);
2166             fjz0             = _mm_add_ps(fjz0,tz);
2167
2168             /**************************
2169              * CALCULATE INTERACTIONS *
2170              **************************/
2171
2172             r21              = _mm_mul_ps(rsq21,rinv21);
2173             r21              = _mm_andnot_ps(dummy_mask,r21);
2174
2175             /* EWALD ELECTROSTATICS */
2176
2177             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2178             ewrt             = _mm_mul_ps(r21,ewtabscale);
2179             ewitab           = _mm_cvttps_epi32(ewrt);
2180             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2181             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2182                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2183                                          &ewtabF,&ewtabFn);
2184             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2185             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2186
2187             fscal            = felec;
2188
2189             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2190
2191             /* Calculate temporary vectorial force */
2192             tx               = _mm_mul_ps(fscal,dx21);
2193             ty               = _mm_mul_ps(fscal,dy21);
2194             tz               = _mm_mul_ps(fscal,dz21);
2195
2196             /* Update vectorial force */
2197             fix2             = _mm_add_ps(fix2,tx);
2198             fiy2             = _mm_add_ps(fiy2,ty);
2199             fiz2             = _mm_add_ps(fiz2,tz);
2200
2201             fjx1             = _mm_add_ps(fjx1,tx);
2202             fjy1             = _mm_add_ps(fjy1,ty);
2203             fjz1             = _mm_add_ps(fjz1,tz);
2204
2205             /**************************
2206              * CALCULATE INTERACTIONS *
2207              **************************/
2208
2209             r22              = _mm_mul_ps(rsq22,rinv22);
2210             r22              = _mm_andnot_ps(dummy_mask,r22);
2211
2212             /* EWALD ELECTROSTATICS */
2213
2214             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2215             ewrt             = _mm_mul_ps(r22,ewtabscale);
2216             ewitab           = _mm_cvttps_epi32(ewrt);
2217             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2218             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2219                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2220                                          &ewtabF,&ewtabFn);
2221             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2222             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2223
2224             fscal            = felec;
2225
2226             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2227
2228             /* Calculate temporary vectorial force */
2229             tx               = _mm_mul_ps(fscal,dx22);
2230             ty               = _mm_mul_ps(fscal,dy22);
2231             tz               = _mm_mul_ps(fscal,dz22);
2232
2233             /* Update vectorial force */
2234             fix2             = _mm_add_ps(fix2,tx);
2235             fiy2             = _mm_add_ps(fiy2,ty);
2236             fiz2             = _mm_add_ps(fiz2,tz);
2237
2238             fjx2             = _mm_add_ps(fjx2,tx);
2239             fjy2             = _mm_add_ps(fjy2,ty);
2240             fjz2             = _mm_add_ps(fjz2,tz);
2241
2242             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2243             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2244             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2245             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2246
2247             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2248                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2249
2250             /* Inner loop uses 340 flops */
2251         }
2252
2253         /* End of innermost loop */
2254
2255         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2256                                               f+i_coord_offset,fshift+i_shift_offset);
2257
2258         /* Increment number of inner iterations */
2259         inneriter                  += j_index_end - j_index_start;
2260
2261         /* Outer loop uses 18 flops */
2262     }
2263
2264     /* Increment number of outer iterations */
2265     outeriter        += nri;
2266
2267     /* Update outer/inner flops */
2268
2269     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*340);
2270 }