Rename remaining GMX_ACCELERATION to GMX_CPU_ACCELERATION
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             shX,shY,shZ,rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
75     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
77     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
88     real             *charge;
89     int              nvdwtype;
90     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
91     int              *vdwtype;
92     real             *vdwparam;
93     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
94     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
95     __m128i          ewitab;
96     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
97     real             *ewtab;
98     __m128           dummy_mask,cutoff_mask;
99     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
100     __m128           one     = _mm_set1_ps(1.0);
101     __m128           two     = _mm_set1_ps(2.0);
102     x                = xx[0];
103     f                = ff[0];
104
105     nri              = nlist->nri;
106     iinr             = nlist->iinr;
107     jindex           = nlist->jindex;
108     jjnr             = nlist->jjnr;
109     shiftidx         = nlist->shift;
110     gid              = nlist->gid;
111     shiftvec         = fr->shift_vec[0];
112     fshift           = fr->fshift[0];
113     facel            = _mm_set1_ps(fr->epsfac);
114     charge           = mdatoms->chargeA;
115     nvdwtype         = fr->ntype;
116     vdwparam         = fr->nbfp;
117     vdwtype          = mdatoms->typeA;
118
119     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
120     ewtab            = fr->ic->tabq_coul_FDV0;
121     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
122     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
123
124     /* Setup water-specific parameters */
125     inr              = nlist->iinr[0];
126     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
127     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
128     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
129     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
130
131     jq0              = _mm_set1_ps(charge[inr+0]);
132     jq1              = _mm_set1_ps(charge[inr+1]);
133     jq2              = _mm_set1_ps(charge[inr+2]);
134     vdwjidx0A        = 2*vdwtype[inr+0];
135     qq00             = _mm_mul_ps(iq0,jq0);
136     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
137     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
138     qq01             = _mm_mul_ps(iq0,jq1);
139     qq02             = _mm_mul_ps(iq0,jq2);
140     qq10             = _mm_mul_ps(iq1,jq0);
141     qq11             = _mm_mul_ps(iq1,jq1);
142     qq12             = _mm_mul_ps(iq1,jq2);
143     qq20             = _mm_mul_ps(iq2,jq0);
144     qq21             = _mm_mul_ps(iq2,jq1);
145     qq22             = _mm_mul_ps(iq2,jq2);
146
147     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
148     rcutoff_scalar   = fr->rcoulomb;
149     rcutoff          = _mm_set1_ps(rcutoff_scalar);
150     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
151
152     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
153     rvdw             = _mm_set1_ps(fr->rvdw);
154
155     /* Avoid stupid compiler warnings */
156     jnrA = jnrB = jnrC = jnrD = 0;
157     j_coord_offsetA = 0;
158     j_coord_offsetB = 0;
159     j_coord_offsetC = 0;
160     j_coord_offsetD = 0;
161
162     outeriter        = 0;
163     inneriter        = 0;
164
165     /* Start outer loop over neighborlists */
166     for(iidx=0; iidx<nri; iidx++)
167     {
168         /* Load shift vector for this list */
169         i_shift_offset   = DIM*shiftidx[iidx];
170         shX              = shiftvec[i_shift_offset+XX];
171         shY              = shiftvec[i_shift_offset+YY];
172         shZ              = shiftvec[i_shift_offset+ZZ];
173
174         /* Load limits for loop over neighbors */
175         j_index_start    = jindex[iidx];
176         j_index_end      = jindex[iidx+1];
177
178         /* Get outer coordinate index */
179         inr              = iinr[iidx];
180         i_coord_offset   = DIM*inr;
181
182         /* Load i particle coords and add shift vector */
183         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
184         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
185         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
186         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
187         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
188         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
189         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
190         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
191         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
192
193         fix0             = _mm_setzero_ps();
194         fiy0             = _mm_setzero_ps();
195         fiz0             = _mm_setzero_ps();
196         fix1             = _mm_setzero_ps();
197         fiy1             = _mm_setzero_ps();
198         fiz1             = _mm_setzero_ps();
199         fix2             = _mm_setzero_ps();
200         fiy2             = _mm_setzero_ps();
201         fiz2             = _mm_setzero_ps();
202
203         /* Reset potential sums */
204         velecsum         = _mm_setzero_ps();
205         vvdwsum          = _mm_setzero_ps();
206
207         /* Start inner kernel loop */
208         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
209         {
210
211             /* Get j neighbor index, and coordinate index */
212             jnrA             = jjnr[jidx];
213             jnrB             = jjnr[jidx+1];
214             jnrC             = jjnr[jidx+2];
215             jnrD             = jjnr[jidx+3];
216
217             j_coord_offsetA  = DIM*jnrA;
218             j_coord_offsetB  = DIM*jnrB;
219             j_coord_offsetC  = DIM*jnrC;
220             j_coord_offsetD  = DIM*jnrD;
221
222             /* load j atom coordinates */
223             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
224                                               x+j_coord_offsetC,x+j_coord_offsetD,
225                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
226
227             /* Calculate displacement vector */
228             dx00             = _mm_sub_ps(ix0,jx0);
229             dy00             = _mm_sub_ps(iy0,jy0);
230             dz00             = _mm_sub_ps(iz0,jz0);
231             dx01             = _mm_sub_ps(ix0,jx1);
232             dy01             = _mm_sub_ps(iy0,jy1);
233             dz01             = _mm_sub_ps(iz0,jz1);
234             dx02             = _mm_sub_ps(ix0,jx2);
235             dy02             = _mm_sub_ps(iy0,jy2);
236             dz02             = _mm_sub_ps(iz0,jz2);
237             dx10             = _mm_sub_ps(ix1,jx0);
238             dy10             = _mm_sub_ps(iy1,jy0);
239             dz10             = _mm_sub_ps(iz1,jz0);
240             dx11             = _mm_sub_ps(ix1,jx1);
241             dy11             = _mm_sub_ps(iy1,jy1);
242             dz11             = _mm_sub_ps(iz1,jz1);
243             dx12             = _mm_sub_ps(ix1,jx2);
244             dy12             = _mm_sub_ps(iy1,jy2);
245             dz12             = _mm_sub_ps(iz1,jz2);
246             dx20             = _mm_sub_ps(ix2,jx0);
247             dy20             = _mm_sub_ps(iy2,jy0);
248             dz20             = _mm_sub_ps(iz2,jz0);
249             dx21             = _mm_sub_ps(ix2,jx1);
250             dy21             = _mm_sub_ps(iy2,jy1);
251             dz21             = _mm_sub_ps(iz2,jz1);
252             dx22             = _mm_sub_ps(ix2,jx2);
253             dy22             = _mm_sub_ps(iy2,jy2);
254             dz22             = _mm_sub_ps(iz2,jz2);
255
256             /* Calculate squared distance and things based on it */
257             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
258             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
259             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
260             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
261             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
262             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
263             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
264             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
265             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
266
267             rinv00           = gmx_mm_invsqrt_ps(rsq00);
268             rinv01           = gmx_mm_invsqrt_ps(rsq01);
269             rinv02           = gmx_mm_invsqrt_ps(rsq02);
270             rinv10           = gmx_mm_invsqrt_ps(rsq10);
271             rinv11           = gmx_mm_invsqrt_ps(rsq11);
272             rinv12           = gmx_mm_invsqrt_ps(rsq12);
273             rinv20           = gmx_mm_invsqrt_ps(rsq20);
274             rinv21           = gmx_mm_invsqrt_ps(rsq21);
275             rinv22           = gmx_mm_invsqrt_ps(rsq22);
276
277             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
278             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
279             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
280             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
281             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
282             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
283             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
284             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
285             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
286
287             fjx0             = _mm_setzero_ps();
288             fjy0             = _mm_setzero_ps();
289             fjz0             = _mm_setzero_ps();
290             fjx1             = _mm_setzero_ps();
291             fjy1             = _mm_setzero_ps();
292             fjz1             = _mm_setzero_ps();
293             fjx2             = _mm_setzero_ps();
294             fjy2             = _mm_setzero_ps();
295             fjz2             = _mm_setzero_ps();
296
297             /**************************
298              * CALCULATE INTERACTIONS *
299              **************************/
300
301             if (gmx_mm_any_lt(rsq00,rcutoff2))
302             {
303
304             r00              = _mm_mul_ps(rsq00,rinv00);
305
306             /* EWALD ELECTROSTATICS */
307
308             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
309             ewrt             = _mm_mul_ps(r00,ewtabscale);
310             ewitab           = _mm_cvttps_epi32(ewrt);
311             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
312             ewitab           = _mm_slli_epi32(ewitab,2);
313             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
314             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
315             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
316             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
317             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
318             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
319             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
320             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
321             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
322
323             /* LENNARD-JONES DISPERSION/REPULSION */
324
325             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
326             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
327             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
328             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
329                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
330             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
331
332             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
333
334             /* Update potential sum for this i atom from the interaction with this j atom. */
335             velec            = _mm_and_ps(velec,cutoff_mask);
336             velecsum         = _mm_add_ps(velecsum,velec);
337             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
338             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
339
340             fscal            = _mm_add_ps(felec,fvdw);
341
342             fscal            = _mm_and_ps(fscal,cutoff_mask);
343
344             /* Calculate temporary vectorial force */
345             tx               = _mm_mul_ps(fscal,dx00);
346             ty               = _mm_mul_ps(fscal,dy00);
347             tz               = _mm_mul_ps(fscal,dz00);
348
349             /* Update vectorial force */
350             fix0             = _mm_add_ps(fix0,tx);
351             fiy0             = _mm_add_ps(fiy0,ty);
352             fiz0             = _mm_add_ps(fiz0,tz);
353
354             fjx0             = _mm_add_ps(fjx0,tx);
355             fjy0             = _mm_add_ps(fjy0,ty);
356             fjz0             = _mm_add_ps(fjz0,tz);
357
358             }
359
360             /**************************
361              * CALCULATE INTERACTIONS *
362              **************************/
363
364             if (gmx_mm_any_lt(rsq01,rcutoff2))
365             {
366
367             r01              = _mm_mul_ps(rsq01,rinv01);
368
369             /* EWALD ELECTROSTATICS */
370
371             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
372             ewrt             = _mm_mul_ps(r01,ewtabscale);
373             ewitab           = _mm_cvttps_epi32(ewrt);
374             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
375             ewitab           = _mm_slli_epi32(ewitab,2);
376             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
377             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
378             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
379             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
380             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
381             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
382             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
383             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
384             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
385
386             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
387
388             /* Update potential sum for this i atom from the interaction with this j atom. */
389             velec            = _mm_and_ps(velec,cutoff_mask);
390             velecsum         = _mm_add_ps(velecsum,velec);
391
392             fscal            = felec;
393
394             fscal            = _mm_and_ps(fscal,cutoff_mask);
395
396             /* Calculate temporary vectorial force */
397             tx               = _mm_mul_ps(fscal,dx01);
398             ty               = _mm_mul_ps(fscal,dy01);
399             tz               = _mm_mul_ps(fscal,dz01);
400
401             /* Update vectorial force */
402             fix0             = _mm_add_ps(fix0,tx);
403             fiy0             = _mm_add_ps(fiy0,ty);
404             fiz0             = _mm_add_ps(fiz0,tz);
405
406             fjx1             = _mm_add_ps(fjx1,tx);
407             fjy1             = _mm_add_ps(fjy1,ty);
408             fjz1             = _mm_add_ps(fjz1,tz);
409
410             }
411
412             /**************************
413              * CALCULATE INTERACTIONS *
414              **************************/
415
416             if (gmx_mm_any_lt(rsq02,rcutoff2))
417             {
418
419             r02              = _mm_mul_ps(rsq02,rinv02);
420
421             /* EWALD ELECTROSTATICS */
422
423             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
424             ewrt             = _mm_mul_ps(r02,ewtabscale);
425             ewitab           = _mm_cvttps_epi32(ewrt);
426             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
427             ewitab           = _mm_slli_epi32(ewitab,2);
428             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
429             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
430             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
431             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
432             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
433             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
434             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
435             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
436             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
437
438             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
439
440             /* Update potential sum for this i atom from the interaction with this j atom. */
441             velec            = _mm_and_ps(velec,cutoff_mask);
442             velecsum         = _mm_add_ps(velecsum,velec);
443
444             fscal            = felec;
445
446             fscal            = _mm_and_ps(fscal,cutoff_mask);
447
448             /* Calculate temporary vectorial force */
449             tx               = _mm_mul_ps(fscal,dx02);
450             ty               = _mm_mul_ps(fscal,dy02);
451             tz               = _mm_mul_ps(fscal,dz02);
452
453             /* Update vectorial force */
454             fix0             = _mm_add_ps(fix0,tx);
455             fiy0             = _mm_add_ps(fiy0,ty);
456             fiz0             = _mm_add_ps(fiz0,tz);
457
458             fjx2             = _mm_add_ps(fjx2,tx);
459             fjy2             = _mm_add_ps(fjy2,ty);
460             fjz2             = _mm_add_ps(fjz2,tz);
461
462             }
463
464             /**************************
465              * CALCULATE INTERACTIONS *
466              **************************/
467
468             if (gmx_mm_any_lt(rsq10,rcutoff2))
469             {
470
471             r10              = _mm_mul_ps(rsq10,rinv10);
472
473             /* EWALD ELECTROSTATICS */
474
475             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
476             ewrt             = _mm_mul_ps(r10,ewtabscale);
477             ewitab           = _mm_cvttps_epi32(ewrt);
478             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
479             ewitab           = _mm_slli_epi32(ewitab,2);
480             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
481             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
482             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
483             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
484             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
485             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
486             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
487             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
488             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
489
490             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
491
492             /* Update potential sum for this i atom from the interaction with this j atom. */
493             velec            = _mm_and_ps(velec,cutoff_mask);
494             velecsum         = _mm_add_ps(velecsum,velec);
495
496             fscal            = felec;
497
498             fscal            = _mm_and_ps(fscal,cutoff_mask);
499
500             /* Calculate temporary vectorial force */
501             tx               = _mm_mul_ps(fscal,dx10);
502             ty               = _mm_mul_ps(fscal,dy10);
503             tz               = _mm_mul_ps(fscal,dz10);
504
505             /* Update vectorial force */
506             fix1             = _mm_add_ps(fix1,tx);
507             fiy1             = _mm_add_ps(fiy1,ty);
508             fiz1             = _mm_add_ps(fiz1,tz);
509
510             fjx0             = _mm_add_ps(fjx0,tx);
511             fjy0             = _mm_add_ps(fjy0,ty);
512             fjz0             = _mm_add_ps(fjz0,tz);
513
514             }
515
516             /**************************
517              * CALCULATE INTERACTIONS *
518              **************************/
519
520             if (gmx_mm_any_lt(rsq11,rcutoff2))
521             {
522
523             r11              = _mm_mul_ps(rsq11,rinv11);
524
525             /* EWALD ELECTROSTATICS */
526
527             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
528             ewrt             = _mm_mul_ps(r11,ewtabscale);
529             ewitab           = _mm_cvttps_epi32(ewrt);
530             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
531             ewitab           = _mm_slli_epi32(ewitab,2);
532             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
533             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
534             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
535             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
536             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
537             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
538             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
539             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
540             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
541
542             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
543
544             /* Update potential sum for this i atom from the interaction with this j atom. */
545             velec            = _mm_and_ps(velec,cutoff_mask);
546             velecsum         = _mm_add_ps(velecsum,velec);
547
548             fscal            = felec;
549
550             fscal            = _mm_and_ps(fscal,cutoff_mask);
551
552             /* Calculate temporary vectorial force */
553             tx               = _mm_mul_ps(fscal,dx11);
554             ty               = _mm_mul_ps(fscal,dy11);
555             tz               = _mm_mul_ps(fscal,dz11);
556
557             /* Update vectorial force */
558             fix1             = _mm_add_ps(fix1,tx);
559             fiy1             = _mm_add_ps(fiy1,ty);
560             fiz1             = _mm_add_ps(fiz1,tz);
561
562             fjx1             = _mm_add_ps(fjx1,tx);
563             fjy1             = _mm_add_ps(fjy1,ty);
564             fjz1             = _mm_add_ps(fjz1,tz);
565
566             }
567
568             /**************************
569              * CALCULATE INTERACTIONS *
570              **************************/
571
572             if (gmx_mm_any_lt(rsq12,rcutoff2))
573             {
574
575             r12              = _mm_mul_ps(rsq12,rinv12);
576
577             /* EWALD ELECTROSTATICS */
578
579             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
580             ewrt             = _mm_mul_ps(r12,ewtabscale);
581             ewitab           = _mm_cvttps_epi32(ewrt);
582             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
583             ewitab           = _mm_slli_epi32(ewitab,2);
584             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
585             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
586             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
587             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
588             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
589             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
590             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
591             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
592             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
593
594             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
595
596             /* Update potential sum for this i atom from the interaction with this j atom. */
597             velec            = _mm_and_ps(velec,cutoff_mask);
598             velecsum         = _mm_add_ps(velecsum,velec);
599
600             fscal            = felec;
601
602             fscal            = _mm_and_ps(fscal,cutoff_mask);
603
604             /* Calculate temporary vectorial force */
605             tx               = _mm_mul_ps(fscal,dx12);
606             ty               = _mm_mul_ps(fscal,dy12);
607             tz               = _mm_mul_ps(fscal,dz12);
608
609             /* Update vectorial force */
610             fix1             = _mm_add_ps(fix1,tx);
611             fiy1             = _mm_add_ps(fiy1,ty);
612             fiz1             = _mm_add_ps(fiz1,tz);
613
614             fjx2             = _mm_add_ps(fjx2,tx);
615             fjy2             = _mm_add_ps(fjy2,ty);
616             fjz2             = _mm_add_ps(fjz2,tz);
617
618             }
619
620             /**************************
621              * CALCULATE INTERACTIONS *
622              **************************/
623
624             if (gmx_mm_any_lt(rsq20,rcutoff2))
625             {
626
627             r20              = _mm_mul_ps(rsq20,rinv20);
628
629             /* EWALD ELECTROSTATICS */
630
631             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
632             ewrt             = _mm_mul_ps(r20,ewtabscale);
633             ewitab           = _mm_cvttps_epi32(ewrt);
634             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
635             ewitab           = _mm_slli_epi32(ewitab,2);
636             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
637             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
638             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
639             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
640             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
641             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
642             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
643             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
644             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
645
646             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
647
648             /* Update potential sum for this i atom from the interaction with this j atom. */
649             velec            = _mm_and_ps(velec,cutoff_mask);
650             velecsum         = _mm_add_ps(velecsum,velec);
651
652             fscal            = felec;
653
654             fscal            = _mm_and_ps(fscal,cutoff_mask);
655
656             /* Calculate temporary vectorial force */
657             tx               = _mm_mul_ps(fscal,dx20);
658             ty               = _mm_mul_ps(fscal,dy20);
659             tz               = _mm_mul_ps(fscal,dz20);
660
661             /* Update vectorial force */
662             fix2             = _mm_add_ps(fix2,tx);
663             fiy2             = _mm_add_ps(fiy2,ty);
664             fiz2             = _mm_add_ps(fiz2,tz);
665
666             fjx0             = _mm_add_ps(fjx0,tx);
667             fjy0             = _mm_add_ps(fjy0,ty);
668             fjz0             = _mm_add_ps(fjz0,tz);
669
670             }
671
672             /**************************
673              * CALCULATE INTERACTIONS *
674              **************************/
675
676             if (gmx_mm_any_lt(rsq21,rcutoff2))
677             {
678
679             r21              = _mm_mul_ps(rsq21,rinv21);
680
681             /* EWALD ELECTROSTATICS */
682
683             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
684             ewrt             = _mm_mul_ps(r21,ewtabscale);
685             ewitab           = _mm_cvttps_epi32(ewrt);
686             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
687             ewitab           = _mm_slli_epi32(ewitab,2);
688             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
689             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
690             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
691             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
692             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
693             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
694             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
695             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
696             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
697
698             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
699
700             /* Update potential sum for this i atom from the interaction with this j atom. */
701             velec            = _mm_and_ps(velec,cutoff_mask);
702             velecsum         = _mm_add_ps(velecsum,velec);
703
704             fscal            = felec;
705
706             fscal            = _mm_and_ps(fscal,cutoff_mask);
707
708             /* Calculate temporary vectorial force */
709             tx               = _mm_mul_ps(fscal,dx21);
710             ty               = _mm_mul_ps(fscal,dy21);
711             tz               = _mm_mul_ps(fscal,dz21);
712
713             /* Update vectorial force */
714             fix2             = _mm_add_ps(fix2,tx);
715             fiy2             = _mm_add_ps(fiy2,ty);
716             fiz2             = _mm_add_ps(fiz2,tz);
717
718             fjx1             = _mm_add_ps(fjx1,tx);
719             fjy1             = _mm_add_ps(fjy1,ty);
720             fjz1             = _mm_add_ps(fjz1,tz);
721
722             }
723
724             /**************************
725              * CALCULATE INTERACTIONS *
726              **************************/
727
728             if (gmx_mm_any_lt(rsq22,rcutoff2))
729             {
730
731             r22              = _mm_mul_ps(rsq22,rinv22);
732
733             /* EWALD ELECTROSTATICS */
734
735             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
736             ewrt             = _mm_mul_ps(r22,ewtabscale);
737             ewitab           = _mm_cvttps_epi32(ewrt);
738             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
739             ewitab           = _mm_slli_epi32(ewitab,2);
740             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
741             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
742             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
743             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
744             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
745             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
746             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
747             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
748             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
749
750             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
751
752             /* Update potential sum for this i atom from the interaction with this j atom. */
753             velec            = _mm_and_ps(velec,cutoff_mask);
754             velecsum         = _mm_add_ps(velecsum,velec);
755
756             fscal            = felec;
757
758             fscal            = _mm_and_ps(fscal,cutoff_mask);
759
760             /* Calculate temporary vectorial force */
761             tx               = _mm_mul_ps(fscal,dx22);
762             ty               = _mm_mul_ps(fscal,dy22);
763             tz               = _mm_mul_ps(fscal,dz22);
764
765             /* Update vectorial force */
766             fix2             = _mm_add_ps(fix2,tx);
767             fiy2             = _mm_add_ps(fiy2,ty);
768             fiz2             = _mm_add_ps(fiz2,tz);
769
770             fjx2             = _mm_add_ps(fjx2,tx);
771             fjy2             = _mm_add_ps(fjy2,ty);
772             fjz2             = _mm_add_ps(fjz2,tz);
773
774             }
775
776             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
777                                                    f+j_coord_offsetC,f+j_coord_offsetD,
778                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
779
780             /* Inner loop uses 432 flops */
781         }
782
783         if(jidx<j_index_end)
784         {
785
786             /* Get j neighbor index, and coordinate index */
787             jnrA             = jjnr[jidx];
788             jnrB             = jjnr[jidx+1];
789             jnrC             = jjnr[jidx+2];
790             jnrD             = jjnr[jidx+3];
791
792             /* Sign of each element will be negative for non-real atoms.
793              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
794              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
795              */
796             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
797             jnrA       = (jnrA>=0) ? jnrA : 0;
798             jnrB       = (jnrB>=0) ? jnrB : 0;
799             jnrC       = (jnrC>=0) ? jnrC : 0;
800             jnrD       = (jnrD>=0) ? jnrD : 0;
801
802             j_coord_offsetA  = DIM*jnrA;
803             j_coord_offsetB  = DIM*jnrB;
804             j_coord_offsetC  = DIM*jnrC;
805             j_coord_offsetD  = DIM*jnrD;
806
807             /* load j atom coordinates */
808             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
809                                               x+j_coord_offsetC,x+j_coord_offsetD,
810                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
811
812             /* Calculate displacement vector */
813             dx00             = _mm_sub_ps(ix0,jx0);
814             dy00             = _mm_sub_ps(iy0,jy0);
815             dz00             = _mm_sub_ps(iz0,jz0);
816             dx01             = _mm_sub_ps(ix0,jx1);
817             dy01             = _mm_sub_ps(iy0,jy1);
818             dz01             = _mm_sub_ps(iz0,jz1);
819             dx02             = _mm_sub_ps(ix0,jx2);
820             dy02             = _mm_sub_ps(iy0,jy2);
821             dz02             = _mm_sub_ps(iz0,jz2);
822             dx10             = _mm_sub_ps(ix1,jx0);
823             dy10             = _mm_sub_ps(iy1,jy0);
824             dz10             = _mm_sub_ps(iz1,jz0);
825             dx11             = _mm_sub_ps(ix1,jx1);
826             dy11             = _mm_sub_ps(iy1,jy1);
827             dz11             = _mm_sub_ps(iz1,jz1);
828             dx12             = _mm_sub_ps(ix1,jx2);
829             dy12             = _mm_sub_ps(iy1,jy2);
830             dz12             = _mm_sub_ps(iz1,jz2);
831             dx20             = _mm_sub_ps(ix2,jx0);
832             dy20             = _mm_sub_ps(iy2,jy0);
833             dz20             = _mm_sub_ps(iz2,jz0);
834             dx21             = _mm_sub_ps(ix2,jx1);
835             dy21             = _mm_sub_ps(iy2,jy1);
836             dz21             = _mm_sub_ps(iz2,jz1);
837             dx22             = _mm_sub_ps(ix2,jx2);
838             dy22             = _mm_sub_ps(iy2,jy2);
839             dz22             = _mm_sub_ps(iz2,jz2);
840
841             /* Calculate squared distance and things based on it */
842             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
843             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
844             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
845             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
846             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
847             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
848             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
849             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
850             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
851
852             rinv00           = gmx_mm_invsqrt_ps(rsq00);
853             rinv01           = gmx_mm_invsqrt_ps(rsq01);
854             rinv02           = gmx_mm_invsqrt_ps(rsq02);
855             rinv10           = gmx_mm_invsqrt_ps(rsq10);
856             rinv11           = gmx_mm_invsqrt_ps(rsq11);
857             rinv12           = gmx_mm_invsqrt_ps(rsq12);
858             rinv20           = gmx_mm_invsqrt_ps(rsq20);
859             rinv21           = gmx_mm_invsqrt_ps(rsq21);
860             rinv22           = gmx_mm_invsqrt_ps(rsq22);
861
862             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
863             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
864             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
865             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
866             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
867             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
868             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
869             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
870             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
871
872             fjx0             = _mm_setzero_ps();
873             fjy0             = _mm_setzero_ps();
874             fjz0             = _mm_setzero_ps();
875             fjx1             = _mm_setzero_ps();
876             fjy1             = _mm_setzero_ps();
877             fjz1             = _mm_setzero_ps();
878             fjx2             = _mm_setzero_ps();
879             fjy2             = _mm_setzero_ps();
880             fjz2             = _mm_setzero_ps();
881
882             /**************************
883              * CALCULATE INTERACTIONS *
884              **************************/
885
886             if (gmx_mm_any_lt(rsq00,rcutoff2))
887             {
888
889             r00              = _mm_mul_ps(rsq00,rinv00);
890             r00              = _mm_andnot_ps(dummy_mask,r00);
891
892             /* EWALD ELECTROSTATICS */
893
894             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
895             ewrt             = _mm_mul_ps(r00,ewtabscale);
896             ewitab           = _mm_cvttps_epi32(ewrt);
897             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
898             ewitab           = _mm_slli_epi32(ewitab,2);
899             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
900             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
901             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
902             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
903             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
904             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
905             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
906             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
907             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
908
909             /* LENNARD-JONES DISPERSION/REPULSION */
910
911             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
912             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
913             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
914             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
915                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
916             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
917
918             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
919
920             /* Update potential sum for this i atom from the interaction with this j atom. */
921             velec            = _mm_and_ps(velec,cutoff_mask);
922             velec            = _mm_andnot_ps(dummy_mask,velec);
923             velecsum         = _mm_add_ps(velecsum,velec);
924             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
925             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
926             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
927
928             fscal            = _mm_add_ps(felec,fvdw);
929
930             fscal            = _mm_and_ps(fscal,cutoff_mask);
931
932             fscal            = _mm_andnot_ps(dummy_mask,fscal);
933
934             /* Calculate temporary vectorial force */
935             tx               = _mm_mul_ps(fscal,dx00);
936             ty               = _mm_mul_ps(fscal,dy00);
937             tz               = _mm_mul_ps(fscal,dz00);
938
939             /* Update vectorial force */
940             fix0             = _mm_add_ps(fix0,tx);
941             fiy0             = _mm_add_ps(fiy0,ty);
942             fiz0             = _mm_add_ps(fiz0,tz);
943
944             fjx0             = _mm_add_ps(fjx0,tx);
945             fjy0             = _mm_add_ps(fjy0,ty);
946             fjz0             = _mm_add_ps(fjz0,tz);
947
948             }
949
950             /**************************
951              * CALCULATE INTERACTIONS *
952              **************************/
953
954             if (gmx_mm_any_lt(rsq01,rcutoff2))
955             {
956
957             r01              = _mm_mul_ps(rsq01,rinv01);
958             r01              = _mm_andnot_ps(dummy_mask,r01);
959
960             /* EWALD ELECTROSTATICS */
961
962             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
963             ewrt             = _mm_mul_ps(r01,ewtabscale);
964             ewitab           = _mm_cvttps_epi32(ewrt);
965             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
966             ewitab           = _mm_slli_epi32(ewitab,2);
967             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
968             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
969             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
970             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
971             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
972             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
973             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
974             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
975             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
976
977             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
978
979             /* Update potential sum for this i atom from the interaction with this j atom. */
980             velec            = _mm_and_ps(velec,cutoff_mask);
981             velec            = _mm_andnot_ps(dummy_mask,velec);
982             velecsum         = _mm_add_ps(velecsum,velec);
983
984             fscal            = felec;
985
986             fscal            = _mm_and_ps(fscal,cutoff_mask);
987
988             fscal            = _mm_andnot_ps(dummy_mask,fscal);
989
990             /* Calculate temporary vectorial force */
991             tx               = _mm_mul_ps(fscal,dx01);
992             ty               = _mm_mul_ps(fscal,dy01);
993             tz               = _mm_mul_ps(fscal,dz01);
994
995             /* Update vectorial force */
996             fix0             = _mm_add_ps(fix0,tx);
997             fiy0             = _mm_add_ps(fiy0,ty);
998             fiz0             = _mm_add_ps(fiz0,tz);
999
1000             fjx1             = _mm_add_ps(fjx1,tx);
1001             fjy1             = _mm_add_ps(fjy1,ty);
1002             fjz1             = _mm_add_ps(fjz1,tz);
1003
1004             }
1005
1006             /**************************
1007              * CALCULATE INTERACTIONS *
1008              **************************/
1009
1010             if (gmx_mm_any_lt(rsq02,rcutoff2))
1011             {
1012
1013             r02              = _mm_mul_ps(rsq02,rinv02);
1014             r02              = _mm_andnot_ps(dummy_mask,r02);
1015
1016             /* EWALD ELECTROSTATICS */
1017
1018             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1019             ewrt             = _mm_mul_ps(r02,ewtabscale);
1020             ewitab           = _mm_cvttps_epi32(ewrt);
1021             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1022             ewitab           = _mm_slli_epi32(ewitab,2);
1023             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1024             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1025             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1026             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1027             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1028             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1029             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1030             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
1031             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1032
1033             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1034
1035             /* Update potential sum for this i atom from the interaction with this j atom. */
1036             velec            = _mm_and_ps(velec,cutoff_mask);
1037             velec            = _mm_andnot_ps(dummy_mask,velec);
1038             velecsum         = _mm_add_ps(velecsum,velec);
1039
1040             fscal            = felec;
1041
1042             fscal            = _mm_and_ps(fscal,cutoff_mask);
1043
1044             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1045
1046             /* Calculate temporary vectorial force */
1047             tx               = _mm_mul_ps(fscal,dx02);
1048             ty               = _mm_mul_ps(fscal,dy02);
1049             tz               = _mm_mul_ps(fscal,dz02);
1050
1051             /* Update vectorial force */
1052             fix0             = _mm_add_ps(fix0,tx);
1053             fiy0             = _mm_add_ps(fiy0,ty);
1054             fiz0             = _mm_add_ps(fiz0,tz);
1055
1056             fjx2             = _mm_add_ps(fjx2,tx);
1057             fjy2             = _mm_add_ps(fjy2,ty);
1058             fjz2             = _mm_add_ps(fjz2,tz);
1059
1060             }
1061
1062             /**************************
1063              * CALCULATE INTERACTIONS *
1064              **************************/
1065
1066             if (gmx_mm_any_lt(rsq10,rcutoff2))
1067             {
1068
1069             r10              = _mm_mul_ps(rsq10,rinv10);
1070             r10              = _mm_andnot_ps(dummy_mask,r10);
1071
1072             /* EWALD ELECTROSTATICS */
1073
1074             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1075             ewrt             = _mm_mul_ps(r10,ewtabscale);
1076             ewitab           = _mm_cvttps_epi32(ewrt);
1077             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1078             ewitab           = _mm_slli_epi32(ewitab,2);
1079             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1080             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1081             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1082             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1083             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1084             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1085             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1086             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
1087             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1088
1089             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1090
1091             /* Update potential sum for this i atom from the interaction with this j atom. */
1092             velec            = _mm_and_ps(velec,cutoff_mask);
1093             velec            = _mm_andnot_ps(dummy_mask,velec);
1094             velecsum         = _mm_add_ps(velecsum,velec);
1095
1096             fscal            = felec;
1097
1098             fscal            = _mm_and_ps(fscal,cutoff_mask);
1099
1100             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1101
1102             /* Calculate temporary vectorial force */
1103             tx               = _mm_mul_ps(fscal,dx10);
1104             ty               = _mm_mul_ps(fscal,dy10);
1105             tz               = _mm_mul_ps(fscal,dz10);
1106
1107             /* Update vectorial force */
1108             fix1             = _mm_add_ps(fix1,tx);
1109             fiy1             = _mm_add_ps(fiy1,ty);
1110             fiz1             = _mm_add_ps(fiz1,tz);
1111
1112             fjx0             = _mm_add_ps(fjx0,tx);
1113             fjy0             = _mm_add_ps(fjy0,ty);
1114             fjz0             = _mm_add_ps(fjz0,tz);
1115
1116             }
1117
1118             /**************************
1119              * CALCULATE INTERACTIONS *
1120              **************************/
1121
1122             if (gmx_mm_any_lt(rsq11,rcutoff2))
1123             {
1124
1125             r11              = _mm_mul_ps(rsq11,rinv11);
1126             r11              = _mm_andnot_ps(dummy_mask,r11);
1127
1128             /* EWALD ELECTROSTATICS */
1129
1130             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1131             ewrt             = _mm_mul_ps(r11,ewtabscale);
1132             ewitab           = _mm_cvttps_epi32(ewrt);
1133             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1134             ewitab           = _mm_slli_epi32(ewitab,2);
1135             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1136             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1137             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1138             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1139             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1140             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1141             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1142             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1143             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1144
1145             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1146
1147             /* Update potential sum for this i atom from the interaction with this j atom. */
1148             velec            = _mm_and_ps(velec,cutoff_mask);
1149             velec            = _mm_andnot_ps(dummy_mask,velec);
1150             velecsum         = _mm_add_ps(velecsum,velec);
1151
1152             fscal            = felec;
1153
1154             fscal            = _mm_and_ps(fscal,cutoff_mask);
1155
1156             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1157
1158             /* Calculate temporary vectorial force */
1159             tx               = _mm_mul_ps(fscal,dx11);
1160             ty               = _mm_mul_ps(fscal,dy11);
1161             tz               = _mm_mul_ps(fscal,dz11);
1162
1163             /* Update vectorial force */
1164             fix1             = _mm_add_ps(fix1,tx);
1165             fiy1             = _mm_add_ps(fiy1,ty);
1166             fiz1             = _mm_add_ps(fiz1,tz);
1167
1168             fjx1             = _mm_add_ps(fjx1,tx);
1169             fjy1             = _mm_add_ps(fjy1,ty);
1170             fjz1             = _mm_add_ps(fjz1,tz);
1171
1172             }
1173
1174             /**************************
1175              * CALCULATE INTERACTIONS *
1176              **************************/
1177
1178             if (gmx_mm_any_lt(rsq12,rcutoff2))
1179             {
1180
1181             r12              = _mm_mul_ps(rsq12,rinv12);
1182             r12              = _mm_andnot_ps(dummy_mask,r12);
1183
1184             /* EWALD ELECTROSTATICS */
1185
1186             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1187             ewrt             = _mm_mul_ps(r12,ewtabscale);
1188             ewitab           = _mm_cvttps_epi32(ewrt);
1189             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1190             ewitab           = _mm_slli_epi32(ewitab,2);
1191             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1192             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1193             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1194             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1195             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1196             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1197             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1198             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1199             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1200
1201             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1202
1203             /* Update potential sum for this i atom from the interaction with this j atom. */
1204             velec            = _mm_and_ps(velec,cutoff_mask);
1205             velec            = _mm_andnot_ps(dummy_mask,velec);
1206             velecsum         = _mm_add_ps(velecsum,velec);
1207
1208             fscal            = felec;
1209
1210             fscal            = _mm_and_ps(fscal,cutoff_mask);
1211
1212             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1213
1214             /* Calculate temporary vectorial force */
1215             tx               = _mm_mul_ps(fscal,dx12);
1216             ty               = _mm_mul_ps(fscal,dy12);
1217             tz               = _mm_mul_ps(fscal,dz12);
1218
1219             /* Update vectorial force */
1220             fix1             = _mm_add_ps(fix1,tx);
1221             fiy1             = _mm_add_ps(fiy1,ty);
1222             fiz1             = _mm_add_ps(fiz1,tz);
1223
1224             fjx2             = _mm_add_ps(fjx2,tx);
1225             fjy2             = _mm_add_ps(fjy2,ty);
1226             fjz2             = _mm_add_ps(fjz2,tz);
1227
1228             }
1229
1230             /**************************
1231              * CALCULATE INTERACTIONS *
1232              **************************/
1233
1234             if (gmx_mm_any_lt(rsq20,rcutoff2))
1235             {
1236
1237             r20              = _mm_mul_ps(rsq20,rinv20);
1238             r20              = _mm_andnot_ps(dummy_mask,r20);
1239
1240             /* EWALD ELECTROSTATICS */
1241
1242             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1243             ewrt             = _mm_mul_ps(r20,ewtabscale);
1244             ewitab           = _mm_cvttps_epi32(ewrt);
1245             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1246             ewitab           = _mm_slli_epi32(ewitab,2);
1247             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1248             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1249             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1250             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1251             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1252             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1253             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1254             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
1255             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1256
1257             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1258
1259             /* Update potential sum for this i atom from the interaction with this j atom. */
1260             velec            = _mm_and_ps(velec,cutoff_mask);
1261             velec            = _mm_andnot_ps(dummy_mask,velec);
1262             velecsum         = _mm_add_ps(velecsum,velec);
1263
1264             fscal            = felec;
1265
1266             fscal            = _mm_and_ps(fscal,cutoff_mask);
1267
1268             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1269
1270             /* Calculate temporary vectorial force */
1271             tx               = _mm_mul_ps(fscal,dx20);
1272             ty               = _mm_mul_ps(fscal,dy20);
1273             tz               = _mm_mul_ps(fscal,dz20);
1274
1275             /* Update vectorial force */
1276             fix2             = _mm_add_ps(fix2,tx);
1277             fiy2             = _mm_add_ps(fiy2,ty);
1278             fiz2             = _mm_add_ps(fiz2,tz);
1279
1280             fjx0             = _mm_add_ps(fjx0,tx);
1281             fjy0             = _mm_add_ps(fjy0,ty);
1282             fjz0             = _mm_add_ps(fjz0,tz);
1283
1284             }
1285
1286             /**************************
1287              * CALCULATE INTERACTIONS *
1288              **************************/
1289
1290             if (gmx_mm_any_lt(rsq21,rcutoff2))
1291             {
1292
1293             r21              = _mm_mul_ps(rsq21,rinv21);
1294             r21              = _mm_andnot_ps(dummy_mask,r21);
1295
1296             /* EWALD ELECTROSTATICS */
1297
1298             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1299             ewrt             = _mm_mul_ps(r21,ewtabscale);
1300             ewitab           = _mm_cvttps_epi32(ewrt);
1301             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1302             ewitab           = _mm_slli_epi32(ewitab,2);
1303             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1304             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1305             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1306             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1307             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1308             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1309             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1310             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1311             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1312
1313             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1314
1315             /* Update potential sum for this i atom from the interaction with this j atom. */
1316             velec            = _mm_and_ps(velec,cutoff_mask);
1317             velec            = _mm_andnot_ps(dummy_mask,velec);
1318             velecsum         = _mm_add_ps(velecsum,velec);
1319
1320             fscal            = felec;
1321
1322             fscal            = _mm_and_ps(fscal,cutoff_mask);
1323
1324             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1325
1326             /* Calculate temporary vectorial force */
1327             tx               = _mm_mul_ps(fscal,dx21);
1328             ty               = _mm_mul_ps(fscal,dy21);
1329             tz               = _mm_mul_ps(fscal,dz21);
1330
1331             /* Update vectorial force */
1332             fix2             = _mm_add_ps(fix2,tx);
1333             fiy2             = _mm_add_ps(fiy2,ty);
1334             fiz2             = _mm_add_ps(fiz2,tz);
1335
1336             fjx1             = _mm_add_ps(fjx1,tx);
1337             fjy1             = _mm_add_ps(fjy1,ty);
1338             fjz1             = _mm_add_ps(fjz1,tz);
1339
1340             }
1341
1342             /**************************
1343              * CALCULATE INTERACTIONS *
1344              **************************/
1345
1346             if (gmx_mm_any_lt(rsq22,rcutoff2))
1347             {
1348
1349             r22              = _mm_mul_ps(rsq22,rinv22);
1350             r22              = _mm_andnot_ps(dummy_mask,r22);
1351
1352             /* EWALD ELECTROSTATICS */
1353
1354             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1355             ewrt             = _mm_mul_ps(r22,ewtabscale);
1356             ewitab           = _mm_cvttps_epi32(ewrt);
1357             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1358             ewitab           = _mm_slli_epi32(ewitab,2);
1359             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1360             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1361             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1362             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1363             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1364             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1365             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1366             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1367             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1368
1369             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1370
1371             /* Update potential sum for this i atom from the interaction with this j atom. */
1372             velec            = _mm_and_ps(velec,cutoff_mask);
1373             velec            = _mm_andnot_ps(dummy_mask,velec);
1374             velecsum         = _mm_add_ps(velecsum,velec);
1375
1376             fscal            = felec;
1377
1378             fscal            = _mm_and_ps(fscal,cutoff_mask);
1379
1380             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1381
1382             /* Calculate temporary vectorial force */
1383             tx               = _mm_mul_ps(fscal,dx22);
1384             ty               = _mm_mul_ps(fscal,dy22);
1385             tz               = _mm_mul_ps(fscal,dz22);
1386
1387             /* Update vectorial force */
1388             fix2             = _mm_add_ps(fix2,tx);
1389             fiy2             = _mm_add_ps(fiy2,ty);
1390             fiz2             = _mm_add_ps(fiz2,tz);
1391
1392             fjx2             = _mm_add_ps(fjx2,tx);
1393             fjy2             = _mm_add_ps(fjy2,ty);
1394             fjz2             = _mm_add_ps(fjz2,tz);
1395
1396             }
1397
1398             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1399                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1400                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1401
1402             /* Inner loop uses 441 flops */
1403         }
1404
1405         /* End of innermost loop */
1406
1407         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1408                                               f+i_coord_offset,fshift+i_shift_offset);
1409
1410         ggid                        = gid[iidx];
1411         /* Update potential energies */
1412         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1413         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1414
1415         /* Increment number of inner iterations */
1416         inneriter                  += j_index_end - j_index_start;
1417
1418         /* Outer loop uses 29 flops */
1419     }
1420
1421     /* Increment number of outer iterations */
1422     outeriter        += nri;
1423
1424     /* Update outer/inner flops */
1425
1426     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*441);
1427 }
1428 /*
1429  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single
1430  * Electrostatics interaction: Ewald
1431  * VdW interaction:            LennardJones
1432  * Geometry:                   Water3-Water3
1433  * Calculate force/pot:        Force
1434  */
1435 void
1436 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single
1437                     (t_nblist * gmx_restrict                nlist,
1438                      rvec * gmx_restrict                    xx,
1439                      rvec * gmx_restrict                    ff,
1440                      t_forcerec * gmx_restrict              fr,
1441                      t_mdatoms * gmx_restrict               mdatoms,
1442                      nb_kernel_data_t * gmx_restrict        kernel_data,
1443                      t_nrnb * gmx_restrict                  nrnb)
1444 {
1445     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1446      * just 0 for non-waters.
1447      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1448      * jnr indices corresponding to data put in the four positions in the SIMD register.
1449      */
1450     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1451     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1452     int              jnrA,jnrB,jnrC,jnrD;
1453     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1454     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1455     real             shX,shY,shZ,rcutoff_scalar;
1456     real             *shiftvec,*fshift,*x,*f;
1457     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1458     int              vdwioffset0;
1459     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1460     int              vdwioffset1;
1461     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1462     int              vdwioffset2;
1463     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1464     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1465     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1466     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1467     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1468     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1469     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1470     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1471     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1472     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1473     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1474     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1475     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1476     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1477     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1478     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1479     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1480     real             *charge;
1481     int              nvdwtype;
1482     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1483     int              *vdwtype;
1484     real             *vdwparam;
1485     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1486     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1487     __m128i          ewitab;
1488     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1489     real             *ewtab;
1490     __m128           dummy_mask,cutoff_mask;
1491     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1492     __m128           one     = _mm_set1_ps(1.0);
1493     __m128           two     = _mm_set1_ps(2.0);
1494     x                = xx[0];
1495     f                = ff[0];
1496
1497     nri              = nlist->nri;
1498     iinr             = nlist->iinr;
1499     jindex           = nlist->jindex;
1500     jjnr             = nlist->jjnr;
1501     shiftidx         = nlist->shift;
1502     gid              = nlist->gid;
1503     shiftvec         = fr->shift_vec[0];
1504     fshift           = fr->fshift[0];
1505     facel            = _mm_set1_ps(fr->epsfac);
1506     charge           = mdatoms->chargeA;
1507     nvdwtype         = fr->ntype;
1508     vdwparam         = fr->nbfp;
1509     vdwtype          = mdatoms->typeA;
1510
1511     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1512     ewtab            = fr->ic->tabq_coul_F;
1513     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1514     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1515
1516     /* Setup water-specific parameters */
1517     inr              = nlist->iinr[0];
1518     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1519     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1520     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1521     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1522
1523     jq0              = _mm_set1_ps(charge[inr+0]);
1524     jq1              = _mm_set1_ps(charge[inr+1]);
1525     jq2              = _mm_set1_ps(charge[inr+2]);
1526     vdwjidx0A        = 2*vdwtype[inr+0];
1527     qq00             = _mm_mul_ps(iq0,jq0);
1528     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1529     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1530     qq01             = _mm_mul_ps(iq0,jq1);
1531     qq02             = _mm_mul_ps(iq0,jq2);
1532     qq10             = _mm_mul_ps(iq1,jq0);
1533     qq11             = _mm_mul_ps(iq1,jq1);
1534     qq12             = _mm_mul_ps(iq1,jq2);
1535     qq20             = _mm_mul_ps(iq2,jq0);
1536     qq21             = _mm_mul_ps(iq2,jq1);
1537     qq22             = _mm_mul_ps(iq2,jq2);
1538
1539     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1540     rcutoff_scalar   = fr->rcoulomb;
1541     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1542     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1543
1544     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
1545     rvdw             = _mm_set1_ps(fr->rvdw);
1546
1547     /* Avoid stupid compiler warnings */
1548     jnrA = jnrB = jnrC = jnrD = 0;
1549     j_coord_offsetA = 0;
1550     j_coord_offsetB = 0;
1551     j_coord_offsetC = 0;
1552     j_coord_offsetD = 0;
1553
1554     outeriter        = 0;
1555     inneriter        = 0;
1556
1557     /* Start outer loop over neighborlists */
1558     for(iidx=0; iidx<nri; iidx++)
1559     {
1560         /* Load shift vector for this list */
1561         i_shift_offset   = DIM*shiftidx[iidx];
1562         shX              = shiftvec[i_shift_offset+XX];
1563         shY              = shiftvec[i_shift_offset+YY];
1564         shZ              = shiftvec[i_shift_offset+ZZ];
1565
1566         /* Load limits for loop over neighbors */
1567         j_index_start    = jindex[iidx];
1568         j_index_end      = jindex[iidx+1];
1569
1570         /* Get outer coordinate index */
1571         inr              = iinr[iidx];
1572         i_coord_offset   = DIM*inr;
1573
1574         /* Load i particle coords and add shift vector */
1575         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1576         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1577         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1578         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1579         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1580         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1581         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1582         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1583         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1584
1585         fix0             = _mm_setzero_ps();
1586         fiy0             = _mm_setzero_ps();
1587         fiz0             = _mm_setzero_ps();
1588         fix1             = _mm_setzero_ps();
1589         fiy1             = _mm_setzero_ps();
1590         fiz1             = _mm_setzero_ps();
1591         fix2             = _mm_setzero_ps();
1592         fiy2             = _mm_setzero_ps();
1593         fiz2             = _mm_setzero_ps();
1594
1595         /* Start inner kernel loop */
1596         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1597         {
1598
1599             /* Get j neighbor index, and coordinate index */
1600             jnrA             = jjnr[jidx];
1601             jnrB             = jjnr[jidx+1];
1602             jnrC             = jjnr[jidx+2];
1603             jnrD             = jjnr[jidx+3];
1604
1605             j_coord_offsetA  = DIM*jnrA;
1606             j_coord_offsetB  = DIM*jnrB;
1607             j_coord_offsetC  = DIM*jnrC;
1608             j_coord_offsetD  = DIM*jnrD;
1609
1610             /* load j atom coordinates */
1611             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1612                                               x+j_coord_offsetC,x+j_coord_offsetD,
1613                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1614
1615             /* Calculate displacement vector */
1616             dx00             = _mm_sub_ps(ix0,jx0);
1617             dy00             = _mm_sub_ps(iy0,jy0);
1618             dz00             = _mm_sub_ps(iz0,jz0);
1619             dx01             = _mm_sub_ps(ix0,jx1);
1620             dy01             = _mm_sub_ps(iy0,jy1);
1621             dz01             = _mm_sub_ps(iz0,jz1);
1622             dx02             = _mm_sub_ps(ix0,jx2);
1623             dy02             = _mm_sub_ps(iy0,jy2);
1624             dz02             = _mm_sub_ps(iz0,jz2);
1625             dx10             = _mm_sub_ps(ix1,jx0);
1626             dy10             = _mm_sub_ps(iy1,jy0);
1627             dz10             = _mm_sub_ps(iz1,jz0);
1628             dx11             = _mm_sub_ps(ix1,jx1);
1629             dy11             = _mm_sub_ps(iy1,jy1);
1630             dz11             = _mm_sub_ps(iz1,jz1);
1631             dx12             = _mm_sub_ps(ix1,jx2);
1632             dy12             = _mm_sub_ps(iy1,jy2);
1633             dz12             = _mm_sub_ps(iz1,jz2);
1634             dx20             = _mm_sub_ps(ix2,jx0);
1635             dy20             = _mm_sub_ps(iy2,jy0);
1636             dz20             = _mm_sub_ps(iz2,jz0);
1637             dx21             = _mm_sub_ps(ix2,jx1);
1638             dy21             = _mm_sub_ps(iy2,jy1);
1639             dz21             = _mm_sub_ps(iz2,jz1);
1640             dx22             = _mm_sub_ps(ix2,jx2);
1641             dy22             = _mm_sub_ps(iy2,jy2);
1642             dz22             = _mm_sub_ps(iz2,jz2);
1643
1644             /* Calculate squared distance and things based on it */
1645             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1646             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1647             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1648             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1649             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1650             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1651             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1652             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1653             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1654
1655             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1656             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1657             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1658             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1659             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1660             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1661             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1662             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1663             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1664
1665             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1666             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1667             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1668             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1669             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1670             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1671             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1672             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1673             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1674
1675             fjx0             = _mm_setzero_ps();
1676             fjy0             = _mm_setzero_ps();
1677             fjz0             = _mm_setzero_ps();
1678             fjx1             = _mm_setzero_ps();
1679             fjy1             = _mm_setzero_ps();
1680             fjz1             = _mm_setzero_ps();
1681             fjx2             = _mm_setzero_ps();
1682             fjy2             = _mm_setzero_ps();
1683             fjz2             = _mm_setzero_ps();
1684
1685             /**************************
1686              * CALCULATE INTERACTIONS *
1687              **************************/
1688
1689             if (gmx_mm_any_lt(rsq00,rcutoff2))
1690             {
1691
1692             r00              = _mm_mul_ps(rsq00,rinv00);
1693
1694             /* EWALD ELECTROSTATICS */
1695
1696             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1697             ewrt             = _mm_mul_ps(r00,ewtabscale);
1698             ewitab           = _mm_cvttps_epi32(ewrt);
1699             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1700             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1701                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1702                                          &ewtabF,&ewtabFn);
1703             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1704             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1705
1706             /* LENNARD-JONES DISPERSION/REPULSION */
1707
1708             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1709             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1710
1711             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1712
1713             fscal            = _mm_add_ps(felec,fvdw);
1714
1715             fscal            = _mm_and_ps(fscal,cutoff_mask);
1716
1717             /* Calculate temporary vectorial force */
1718             tx               = _mm_mul_ps(fscal,dx00);
1719             ty               = _mm_mul_ps(fscal,dy00);
1720             tz               = _mm_mul_ps(fscal,dz00);
1721
1722             /* Update vectorial force */
1723             fix0             = _mm_add_ps(fix0,tx);
1724             fiy0             = _mm_add_ps(fiy0,ty);
1725             fiz0             = _mm_add_ps(fiz0,tz);
1726
1727             fjx0             = _mm_add_ps(fjx0,tx);
1728             fjy0             = _mm_add_ps(fjy0,ty);
1729             fjz0             = _mm_add_ps(fjz0,tz);
1730
1731             }
1732
1733             /**************************
1734              * CALCULATE INTERACTIONS *
1735              **************************/
1736
1737             if (gmx_mm_any_lt(rsq01,rcutoff2))
1738             {
1739
1740             r01              = _mm_mul_ps(rsq01,rinv01);
1741
1742             /* EWALD ELECTROSTATICS */
1743
1744             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1745             ewrt             = _mm_mul_ps(r01,ewtabscale);
1746             ewitab           = _mm_cvttps_epi32(ewrt);
1747             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1748             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1749                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1750                                          &ewtabF,&ewtabFn);
1751             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1752             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1753
1754             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1755
1756             fscal            = felec;
1757
1758             fscal            = _mm_and_ps(fscal,cutoff_mask);
1759
1760             /* Calculate temporary vectorial force */
1761             tx               = _mm_mul_ps(fscal,dx01);
1762             ty               = _mm_mul_ps(fscal,dy01);
1763             tz               = _mm_mul_ps(fscal,dz01);
1764
1765             /* Update vectorial force */
1766             fix0             = _mm_add_ps(fix0,tx);
1767             fiy0             = _mm_add_ps(fiy0,ty);
1768             fiz0             = _mm_add_ps(fiz0,tz);
1769
1770             fjx1             = _mm_add_ps(fjx1,tx);
1771             fjy1             = _mm_add_ps(fjy1,ty);
1772             fjz1             = _mm_add_ps(fjz1,tz);
1773
1774             }
1775
1776             /**************************
1777              * CALCULATE INTERACTIONS *
1778              **************************/
1779
1780             if (gmx_mm_any_lt(rsq02,rcutoff2))
1781             {
1782
1783             r02              = _mm_mul_ps(rsq02,rinv02);
1784
1785             /* EWALD ELECTROSTATICS */
1786
1787             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1788             ewrt             = _mm_mul_ps(r02,ewtabscale);
1789             ewitab           = _mm_cvttps_epi32(ewrt);
1790             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1791             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1792                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1793                                          &ewtabF,&ewtabFn);
1794             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1795             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1796
1797             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1798
1799             fscal            = felec;
1800
1801             fscal            = _mm_and_ps(fscal,cutoff_mask);
1802
1803             /* Calculate temporary vectorial force */
1804             tx               = _mm_mul_ps(fscal,dx02);
1805             ty               = _mm_mul_ps(fscal,dy02);
1806             tz               = _mm_mul_ps(fscal,dz02);
1807
1808             /* Update vectorial force */
1809             fix0             = _mm_add_ps(fix0,tx);
1810             fiy0             = _mm_add_ps(fiy0,ty);
1811             fiz0             = _mm_add_ps(fiz0,tz);
1812
1813             fjx2             = _mm_add_ps(fjx2,tx);
1814             fjy2             = _mm_add_ps(fjy2,ty);
1815             fjz2             = _mm_add_ps(fjz2,tz);
1816
1817             }
1818
1819             /**************************
1820              * CALCULATE INTERACTIONS *
1821              **************************/
1822
1823             if (gmx_mm_any_lt(rsq10,rcutoff2))
1824             {
1825
1826             r10              = _mm_mul_ps(rsq10,rinv10);
1827
1828             /* EWALD ELECTROSTATICS */
1829
1830             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1831             ewrt             = _mm_mul_ps(r10,ewtabscale);
1832             ewitab           = _mm_cvttps_epi32(ewrt);
1833             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1834             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1835                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1836                                          &ewtabF,&ewtabFn);
1837             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1838             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1839
1840             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1841
1842             fscal            = felec;
1843
1844             fscal            = _mm_and_ps(fscal,cutoff_mask);
1845
1846             /* Calculate temporary vectorial force */
1847             tx               = _mm_mul_ps(fscal,dx10);
1848             ty               = _mm_mul_ps(fscal,dy10);
1849             tz               = _mm_mul_ps(fscal,dz10);
1850
1851             /* Update vectorial force */
1852             fix1             = _mm_add_ps(fix1,tx);
1853             fiy1             = _mm_add_ps(fiy1,ty);
1854             fiz1             = _mm_add_ps(fiz1,tz);
1855
1856             fjx0             = _mm_add_ps(fjx0,tx);
1857             fjy0             = _mm_add_ps(fjy0,ty);
1858             fjz0             = _mm_add_ps(fjz0,tz);
1859
1860             }
1861
1862             /**************************
1863              * CALCULATE INTERACTIONS *
1864              **************************/
1865
1866             if (gmx_mm_any_lt(rsq11,rcutoff2))
1867             {
1868
1869             r11              = _mm_mul_ps(rsq11,rinv11);
1870
1871             /* EWALD ELECTROSTATICS */
1872
1873             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1874             ewrt             = _mm_mul_ps(r11,ewtabscale);
1875             ewitab           = _mm_cvttps_epi32(ewrt);
1876             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1877             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1878                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1879                                          &ewtabF,&ewtabFn);
1880             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1881             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1882
1883             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1884
1885             fscal            = felec;
1886
1887             fscal            = _mm_and_ps(fscal,cutoff_mask);
1888
1889             /* Calculate temporary vectorial force */
1890             tx               = _mm_mul_ps(fscal,dx11);
1891             ty               = _mm_mul_ps(fscal,dy11);
1892             tz               = _mm_mul_ps(fscal,dz11);
1893
1894             /* Update vectorial force */
1895             fix1             = _mm_add_ps(fix1,tx);
1896             fiy1             = _mm_add_ps(fiy1,ty);
1897             fiz1             = _mm_add_ps(fiz1,tz);
1898
1899             fjx1             = _mm_add_ps(fjx1,tx);
1900             fjy1             = _mm_add_ps(fjy1,ty);
1901             fjz1             = _mm_add_ps(fjz1,tz);
1902
1903             }
1904
1905             /**************************
1906              * CALCULATE INTERACTIONS *
1907              **************************/
1908
1909             if (gmx_mm_any_lt(rsq12,rcutoff2))
1910             {
1911
1912             r12              = _mm_mul_ps(rsq12,rinv12);
1913
1914             /* EWALD ELECTROSTATICS */
1915
1916             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1917             ewrt             = _mm_mul_ps(r12,ewtabscale);
1918             ewitab           = _mm_cvttps_epi32(ewrt);
1919             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1920             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1921                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1922                                          &ewtabF,&ewtabFn);
1923             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1924             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1925
1926             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1927
1928             fscal            = felec;
1929
1930             fscal            = _mm_and_ps(fscal,cutoff_mask);
1931
1932             /* Calculate temporary vectorial force */
1933             tx               = _mm_mul_ps(fscal,dx12);
1934             ty               = _mm_mul_ps(fscal,dy12);
1935             tz               = _mm_mul_ps(fscal,dz12);
1936
1937             /* Update vectorial force */
1938             fix1             = _mm_add_ps(fix1,tx);
1939             fiy1             = _mm_add_ps(fiy1,ty);
1940             fiz1             = _mm_add_ps(fiz1,tz);
1941
1942             fjx2             = _mm_add_ps(fjx2,tx);
1943             fjy2             = _mm_add_ps(fjy2,ty);
1944             fjz2             = _mm_add_ps(fjz2,tz);
1945
1946             }
1947
1948             /**************************
1949              * CALCULATE INTERACTIONS *
1950              **************************/
1951
1952             if (gmx_mm_any_lt(rsq20,rcutoff2))
1953             {
1954
1955             r20              = _mm_mul_ps(rsq20,rinv20);
1956
1957             /* EWALD ELECTROSTATICS */
1958
1959             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1960             ewrt             = _mm_mul_ps(r20,ewtabscale);
1961             ewitab           = _mm_cvttps_epi32(ewrt);
1962             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1963             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1964                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1965                                          &ewtabF,&ewtabFn);
1966             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1967             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1968
1969             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1970
1971             fscal            = felec;
1972
1973             fscal            = _mm_and_ps(fscal,cutoff_mask);
1974
1975             /* Calculate temporary vectorial force */
1976             tx               = _mm_mul_ps(fscal,dx20);
1977             ty               = _mm_mul_ps(fscal,dy20);
1978             tz               = _mm_mul_ps(fscal,dz20);
1979
1980             /* Update vectorial force */
1981             fix2             = _mm_add_ps(fix2,tx);
1982             fiy2             = _mm_add_ps(fiy2,ty);
1983             fiz2             = _mm_add_ps(fiz2,tz);
1984
1985             fjx0             = _mm_add_ps(fjx0,tx);
1986             fjy0             = _mm_add_ps(fjy0,ty);
1987             fjz0             = _mm_add_ps(fjz0,tz);
1988
1989             }
1990
1991             /**************************
1992              * CALCULATE INTERACTIONS *
1993              **************************/
1994
1995             if (gmx_mm_any_lt(rsq21,rcutoff2))
1996             {
1997
1998             r21              = _mm_mul_ps(rsq21,rinv21);
1999
2000             /* EWALD ELECTROSTATICS */
2001
2002             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2003             ewrt             = _mm_mul_ps(r21,ewtabscale);
2004             ewitab           = _mm_cvttps_epi32(ewrt);
2005             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2006             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2007                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2008                                          &ewtabF,&ewtabFn);
2009             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2010             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2011
2012             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2013
2014             fscal            = felec;
2015
2016             fscal            = _mm_and_ps(fscal,cutoff_mask);
2017
2018             /* Calculate temporary vectorial force */
2019             tx               = _mm_mul_ps(fscal,dx21);
2020             ty               = _mm_mul_ps(fscal,dy21);
2021             tz               = _mm_mul_ps(fscal,dz21);
2022
2023             /* Update vectorial force */
2024             fix2             = _mm_add_ps(fix2,tx);
2025             fiy2             = _mm_add_ps(fiy2,ty);
2026             fiz2             = _mm_add_ps(fiz2,tz);
2027
2028             fjx1             = _mm_add_ps(fjx1,tx);
2029             fjy1             = _mm_add_ps(fjy1,ty);
2030             fjz1             = _mm_add_ps(fjz1,tz);
2031
2032             }
2033
2034             /**************************
2035              * CALCULATE INTERACTIONS *
2036              **************************/
2037
2038             if (gmx_mm_any_lt(rsq22,rcutoff2))
2039             {
2040
2041             r22              = _mm_mul_ps(rsq22,rinv22);
2042
2043             /* EWALD ELECTROSTATICS */
2044
2045             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2046             ewrt             = _mm_mul_ps(r22,ewtabscale);
2047             ewitab           = _mm_cvttps_epi32(ewrt);
2048             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2049             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2050                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2051                                          &ewtabF,&ewtabFn);
2052             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2053             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2054
2055             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2056
2057             fscal            = felec;
2058
2059             fscal            = _mm_and_ps(fscal,cutoff_mask);
2060
2061             /* Calculate temporary vectorial force */
2062             tx               = _mm_mul_ps(fscal,dx22);
2063             ty               = _mm_mul_ps(fscal,dy22);
2064             tz               = _mm_mul_ps(fscal,dz22);
2065
2066             /* Update vectorial force */
2067             fix2             = _mm_add_ps(fix2,tx);
2068             fiy2             = _mm_add_ps(fiy2,ty);
2069             fiz2             = _mm_add_ps(fiz2,tz);
2070
2071             fjx2             = _mm_add_ps(fjx2,tx);
2072             fjy2             = _mm_add_ps(fjy2,ty);
2073             fjz2             = _mm_add_ps(fjz2,tz);
2074
2075             }
2076
2077             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2078                                                    f+j_coord_offsetC,f+j_coord_offsetD,
2079                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2080
2081             /* Inner loop uses 358 flops */
2082         }
2083
2084         if(jidx<j_index_end)
2085         {
2086
2087             /* Get j neighbor index, and coordinate index */
2088             jnrA             = jjnr[jidx];
2089             jnrB             = jjnr[jidx+1];
2090             jnrC             = jjnr[jidx+2];
2091             jnrD             = jjnr[jidx+3];
2092
2093             /* Sign of each element will be negative for non-real atoms.
2094              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2095              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2096              */
2097             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2098             jnrA       = (jnrA>=0) ? jnrA : 0;
2099             jnrB       = (jnrB>=0) ? jnrB : 0;
2100             jnrC       = (jnrC>=0) ? jnrC : 0;
2101             jnrD       = (jnrD>=0) ? jnrD : 0;
2102
2103             j_coord_offsetA  = DIM*jnrA;
2104             j_coord_offsetB  = DIM*jnrB;
2105             j_coord_offsetC  = DIM*jnrC;
2106             j_coord_offsetD  = DIM*jnrD;
2107
2108             /* load j atom coordinates */
2109             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2110                                               x+j_coord_offsetC,x+j_coord_offsetD,
2111                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2112
2113             /* Calculate displacement vector */
2114             dx00             = _mm_sub_ps(ix0,jx0);
2115             dy00             = _mm_sub_ps(iy0,jy0);
2116             dz00             = _mm_sub_ps(iz0,jz0);
2117             dx01             = _mm_sub_ps(ix0,jx1);
2118             dy01             = _mm_sub_ps(iy0,jy1);
2119             dz01             = _mm_sub_ps(iz0,jz1);
2120             dx02             = _mm_sub_ps(ix0,jx2);
2121             dy02             = _mm_sub_ps(iy0,jy2);
2122             dz02             = _mm_sub_ps(iz0,jz2);
2123             dx10             = _mm_sub_ps(ix1,jx0);
2124             dy10             = _mm_sub_ps(iy1,jy0);
2125             dz10             = _mm_sub_ps(iz1,jz0);
2126             dx11             = _mm_sub_ps(ix1,jx1);
2127             dy11             = _mm_sub_ps(iy1,jy1);
2128             dz11             = _mm_sub_ps(iz1,jz1);
2129             dx12             = _mm_sub_ps(ix1,jx2);
2130             dy12             = _mm_sub_ps(iy1,jy2);
2131             dz12             = _mm_sub_ps(iz1,jz2);
2132             dx20             = _mm_sub_ps(ix2,jx0);
2133             dy20             = _mm_sub_ps(iy2,jy0);
2134             dz20             = _mm_sub_ps(iz2,jz0);
2135             dx21             = _mm_sub_ps(ix2,jx1);
2136             dy21             = _mm_sub_ps(iy2,jy1);
2137             dz21             = _mm_sub_ps(iz2,jz1);
2138             dx22             = _mm_sub_ps(ix2,jx2);
2139             dy22             = _mm_sub_ps(iy2,jy2);
2140             dz22             = _mm_sub_ps(iz2,jz2);
2141
2142             /* Calculate squared distance and things based on it */
2143             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2144             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2145             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2146             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2147             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2148             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2149             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2150             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2151             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2152
2153             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2154             rinv01           = gmx_mm_invsqrt_ps(rsq01);
2155             rinv02           = gmx_mm_invsqrt_ps(rsq02);
2156             rinv10           = gmx_mm_invsqrt_ps(rsq10);
2157             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2158             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2159             rinv20           = gmx_mm_invsqrt_ps(rsq20);
2160             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2161             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2162
2163             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2164             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
2165             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
2166             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
2167             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2168             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2169             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
2170             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2171             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2172
2173             fjx0             = _mm_setzero_ps();
2174             fjy0             = _mm_setzero_ps();
2175             fjz0             = _mm_setzero_ps();
2176             fjx1             = _mm_setzero_ps();
2177             fjy1             = _mm_setzero_ps();
2178             fjz1             = _mm_setzero_ps();
2179             fjx2             = _mm_setzero_ps();
2180             fjy2             = _mm_setzero_ps();
2181             fjz2             = _mm_setzero_ps();
2182
2183             /**************************
2184              * CALCULATE INTERACTIONS *
2185              **************************/
2186
2187             if (gmx_mm_any_lt(rsq00,rcutoff2))
2188             {
2189
2190             r00              = _mm_mul_ps(rsq00,rinv00);
2191             r00              = _mm_andnot_ps(dummy_mask,r00);
2192
2193             /* EWALD ELECTROSTATICS */
2194
2195             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2196             ewrt             = _mm_mul_ps(r00,ewtabscale);
2197             ewitab           = _mm_cvttps_epi32(ewrt);
2198             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2199             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2200                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2201                                          &ewtabF,&ewtabFn);
2202             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2203             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2204
2205             /* LENNARD-JONES DISPERSION/REPULSION */
2206
2207             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2208             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2209
2210             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2211
2212             fscal            = _mm_add_ps(felec,fvdw);
2213
2214             fscal            = _mm_and_ps(fscal,cutoff_mask);
2215
2216             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2217
2218             /* Calculate temporary vectorial force */
2219             tx               = _mm_mul_ps(fscal,dx00);
2220             ty               = _mm_mul_ps(fscal,dy00);
2221             tz               = _mm_mul_ps(fscal,dz00);
2222
2223             /* Update vectorial force */
2224             fix0             = _mm_add_ps(fix0,tx);
2225             fiy0             = _mm_add_ps(fiy0,ty);
2226             fiz0             = _mm_add_ps(fiz0,tz);
2227
2228             fjx0             = _mm_add_ps(fjx0,tx);
2229             fjy0             = _mm_add_ps(fjy0,ty);
2230             fjz0             = _mm_add_ps(fjz0,tz);
2231
2232             }
2233
2234             /**************************
2235              * CALCULATE INTERACTIONS *
2236              **************************/
2237
2238             if (gmx_mm_any_lt(rsq01,rcutoff2))
2239             {
2240
2241             r01              = _mm_mul_ps(rsq01,rinv01);
2242             r01              = _mm_andnot_ps(dummy_mask,r01);
2243
2244             /* EWALD ELECTROSTATICS */
2245
2246             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2247             ewrt             = _mm_mul_ps(r01,ewtabscale);
2248             ewitab           = _mm_cvttps_epi32(ewrt);
2249             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2250             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2251                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2252                                          &ewtabF,&ewtabFn);
2253             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2254             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2255
2256             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
2257
2258             fscal            = felec;
2259
2260             fscal            = _mm_and_ps(fscal,cutoff_mask);
2261
2262             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2263
2264             /* Calculate temporary vectorial force */
2265             tx               = _mm_mul_ps(fscal,dx01);
2266             ty               = _mm_mul_ps(fscal,dy01);
2267             tz               = _mm_mul_ps(fscal,dz01);
2268
2269             /* Update vectorial force */
2270             fix0             = _mm_add_ps(fix0,tx);
2271             fiy0             = _mm_add_ps(fiy0,ty);
2272             fiz0             = _mm_add_ps(fiz0,tz);
2273
2274             fjx1             = _mm_add_ps(fjx1,tx);
2275             fjy1             = _mm_add_ps(fjy1,ty);
2276             fjz1             = _mm_add_ps(fjz1,tz);
2277
2278             }
2279
2280             /**************************
2281              * CALCULATE INTERACTIONS *
2282              **************************/
2283
2284             if (gmx_mm_any_lt(rsq02,rcutoff2))
2285             {
2286
2287             r02              = _mm_mul_ps(rsq02,rinv02);
2288             r02              = _mm_andnot_ps(dummy_mask,r02);
2289
2290             /* EWALD ELECTROSTATICS */
2291
2292             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2293             ewrt             = _mm_mul_ps(r02,ewtabscale);
2294             ewitab           = _mm_cvttps_epi32(ewrt);
2295             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2296             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2297                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2298                                          &ewtabF,&ewtabFn);
2299             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2300             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2301
2302             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2303
2304             fscal            = felec;
2305
2306             fscal            = _mm_and_ps(fscal,cutoff_mask);
2307
2308             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2309
2310             /* Calculate temporary vectorial force */
2311             tx               = _mm_mul_ps(fscal,dx02);
2312             ty               = _mm_mul_ps(fscal,dy02);
2313             tz               = _mm_mul_ps(fscal,dz02);
2314
2315             /* Update vectorial force */
2316             fix0             = _mm_add_ps(fix0,tx);
2317             fiy0             = _mm_add_ps(fiy0,ty);
2318             fiz0             = _mm_add_ps(fiz0,tz);
2319
2320             fjx2             = _mm_add_ps(fjx2,tx);
2321             fjy2             = _mm_add_ps(fjy2,ty);
2322             fjz2             = _mm_add_ps(fjz2,tz);
2323
2324             }
2325
2326             /**************************
2327              * CALCULATE INTERACTIONS *
2328              **************************/
2329
2330             if (gmx_mm_any_lt(rsq10,rcutoff2))
2331             {
2332
2333             r10              = _mm_mul_ps(rsq10,rinv10);
2334             r10              = _mm_andnot_ps(dummy_mask,r10);
2335
2336             /* EWALD ELECTROSTATICS */
2337
2338             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2339             ewrt             = _mm_mul_ps(r10,ewtabscale);
2340             ewitab           = _mm_cvttps_epi32(ewrt);
2341             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2342             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2343                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2344                                          &ewtabF,&ewtabFn);
2345             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2346             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2347
2348             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2349
2350             fscal            = felec;
2351
2352             fscal            = _mm_and_ps(fscal,cutoff_mask);
2353
2354             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2355
2356             /* Calculate temporary vectorial force */
2357             tx               = _mm_mul_ps(fscal,dx10);
2358             ty               = _mm_mul_ps(fscal,dy10);
2359             tz               = _mm_mul_ps(fscal,dz10);
2360
2361             /* Update vectorial force */
2362             fix1             = _mm_add_ps(fix1,tx);
2363             fiy1             = _mm_add_ps(fiy1,ty);
2364             fiz1             = _mm_add_ps(fiz1,tz);
2365
2366             fjx0             = _mm_add_ps(fjx0,tx);
2367             fjy0             = _mm_add_ps(fjy0,ty);
2368             fjz0             = _mm_add_ps(fjz0,tz);
2369
2370             }
2371
2372             /**************************
2373              * CALCULATE INTERACTIONS *
2374              **************************/
2375
2376             if (gmx_mm_any_lt(rsq11,rcutoff2))
2377             {
2378
2379             r11              = _mm_mul_ps(rsq11,rinv11);
2380             r11              = _mm_andnot_ps(dummy_mask,r11);
2381
2382             /* EWALD ELECTROSTATICS */
2383
2384             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2385             ewrt             = _mm_mul_ps(r11,ewtabscale);
2386             ewitab           = _mm_cvttps_epi32(ewrt);
2387             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2388             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2389                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2390                                          &ewtabF,&ewtabFn);
2391             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2392             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2393
2394             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2395
2396             fscal            = felec;
2397
2398             fscal            = _mm_and_ps(fscal,cutoff_mask);
2399
2400             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2401
2402             /* Calculate temporary vectorial force */
2403             tx               = _mm_mul_ps(fscal,dx11);
2404             ty               = _mm_mul_ps(fscal,dy11);
2405             tz               = _mm_mul_ps(fscal,dz11);
2406
2407             /* Update vectorial force */
2408             fix1             = _mm_add_ps(fix1,tx);
2409             fiy1             = _mm_add_ps(fiy1,ty);
2410             fiz1             = _mm_add_ps(fiz1,tz);
2411
2412             fjx1             = _mm_add_ps(fjx1,tx);
2413             fjy1             = _mm_add_ps(fjy1,ty);
2414             fjz1             = _mm_add_ps(fjz1,tz);
2415
2416             }
2417
2418             /**************************
2419              * CALCULATE INTERACTIONS *
2420              **************************/
2421
2422             if (gmx_mm_any_lt(rsq12,rcutoff2))
2423             {
2424
2425             r12              = _mm_mul_ps(rsq12,rinv12);
2426             r12              = _mm_andnot_ps(dummy_mask,r12);
2427
2428             /* EWALD ELECTROSTATICS */
2429
2430             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2431             ewrt             = _mm_mul_ps(r12,ewtabscale);
2432             ewitab           = _mm_cvttps_epi32(ewrt);
2433             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2434             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2435                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2436                                          &ewtabF,&ewtabFn);
2437             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2438             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2439
2440             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2441
2442             fscal            = felec;
2443
2444             fscal            = _mm_and_ps(fscal,cutoff_mask);
2445
2446             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2447
2448             /* Calculate temporary vectorial force */
2449             tx               = _mm_mul_ps(fscal,dx12);
2450             ty               = _mm_mul_ps(fscal,dy12);
2451             tz               = _mm_mul_ps(fscal,dz12);
2452
2453             /* Update vectorial force */
2454             fix1             = _mm_add_ps(fix1,tx);
2455             fiy1             = _mm_add_ps(fiy1,ty);
2456             fiz1             = _mm_add_ps(fiz1,tz);
2457
2458             fjx2             = _mm_add_ps(fjx2,tx);
2459             fjy2             = _mm_add_ps(fjy2,ty);
2460             fjz2             = _mm_add_ps(fjz2,tz);
2461
2462             }
2463
2464             /**************************
2465              * CALCULATE INTERACTIONS *
2466              **************************/
2467
2468             if (gmx_mm_any_lt(rsq20,rcutoff2))
2469             {
2470
2471             r20              = _mm_mul_ps(rsq20,rinv20);
2472             r20              = _mm_andnot_ps(dummy_mask,r20);
2473
2474             /* EWALD ELECTROSTATICS */
2475
2476             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2477             ewrt             = _mm_mul_ps(r20,ewtabscale);
2478             ewitab           = _mm_cvttps_epi32(ewrt);
2479             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2480             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2481                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2482                                          &ewtabF,&ewtabFn);
2483             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2484             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2485
2486             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2487
2488             fscal            = felec;
2489
2490             fscal            = _mm_and_ps(fscal,cutoff_mask);
2491
2492             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2493
2494             /* Calculate temporary vectorial force */
2495             tx               = _mm_mul_ps(fscal,dx20);
2496             ty               = _mm_mul_ps(fscal,dy20);
2497             tz               = _mm_mul_ps(fscal,dz20);
2498
2499             /* Update vectorial force */
2500             fix2             = _mm_add_ps(fix2,tx);
2501             fiy2             = _mm_add_ps(fiy2,ty);
2502             fiz2             = _mm_add_ps(fiz2,tz);
2503
2504             fjx0             = _mm_add_ps(fjx0,tx);
2505             fjy0             = _mm_add_ps(fjy0,ty);
2506             fjz0             = _mm_add_ps(fjz0,tz);
2507
2508             }
2509
2510             /**************************
2511              * CALCULATE INTERACTIONS *
2512              **************************/
2513
2514             if (gmx_mm_any_lt(rsq21,rcutoff2))
2515             {
2516
2517             r21              = _mm_mul_ps(rsq21,rinv21);
2518             r21              = _mm_andnot_ps(dummy_mask,r21);
2519
2520             /* EWALD ELECTROSTATICS */
2521
2522             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2523             ewrt             = _mm_mul_ps(r21,ewtabscale);
2524             ewitab           = _mm_cvttps_epi32(ewrt);
2525             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2526             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2527                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2528                                          &ewtabF,&ewtabFn);
2529             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2530             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2531
2532             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2533
2534             fscal            = felec;
2535
2536             fscal            = _mm_and_ps(fscal,cutoff_mask);
2537
2538             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2539
2540             /* Calculate temporary vectorial force */
2541             tx               = _mm_mul_ps(fscal,dx21);
2542             ty               = _mm_mul_ps(fscal,dy21);
2543             tz               = _mm_mul_ps(fscal,dz21);
2544
2545             /* Update vectorial force */
2546             fix2             = _mm_add_ps(fix2,tx);
2547             fiy2             = _mm_add_ps(fiy2,ty);
2548             fiz2             = _mm_add_ps(fiz2,tz);
2549
2550             fjx1             = _mm_add_ps(fjx1,tx);
2551             fjy1             = _mm_add_ps(fjy1,ty);
2552             fjz1             = _mm_add_ps(fjz1,tz);
2553
2554             }
2555
2556             /**************************
2557              * CALCULATE INTERACTIONS *
2558              **************************/
2559
2560             if (gmx_mm_any_lt(rsq22,rcutoff2))
2561             {
2562
2563             r22              = _mm_mul_ps(rsq22,rinv22);
2564             r22              = _mm_andnot_ps(dummy_mask,r22);
2565
2566             /* EWALD ELECTROSTATICS */
2567
2568             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2569             ewrt             = _mm_mul_ps(r22,ewtabscale);
2570             ewitab           = _mm_cvttps_epi32(ewrt);
2571             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2572             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2573                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2574                                          &ewtabF,&ewtabFn);
2575             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2576             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2577
2578             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2579
2580             fscal            = felec;
2581
2582             fscal            = _mm_and_ps(fscal,cutoff_mask);
2583
2584             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2585
2586             /* Calculate temporary vectorial force */
2587             tx               = _mm_mul_ps(fscal,dx22);
2588             ty               = _mm_mul_ps(fscal,dy22);
2589             tz               = _mm_mul_ps(fscal,dz22);
2590
2591             /* Update vectorial force */
2592             fix2             = _mm_add_ps(fix2,tx);
2593             fiy2             = _mm_add_ps(fiy2,ty);
2594             fiz2             = _mm_add_ps(fiz2,tz);
2595
2596             fjx2             = _mm_add_ps(fjx2,tx);
2597             fjy2             = _mm_add_ps(fjy2,ty);
2598             fjz2             = _mm_add_ps(fjz2,tz);
2599
2600             }
2601
2602             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2603                                                    f+j_coord_offsetC,f+j_coord_offsetD,
2604                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2605
2606             /* Inner loop uses 367 flops */
2607         }
2608
2609         /* End of innermost loop */
2610
2611         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2612                                               f+i_coord_offset,fshift+i_shift_offset);
2613
2614         /* Increment number of inner iterations */
2615         inneriter                  += j_index_end - j_index_start;
2616
2617         /* Outer loop uses 27 flops */
2618     }
2619
2620     /* Increment number of outer iterations */
2621     outeriter        += nri;
2622
2623     /* Update outer/inner flops */
2624
2625     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*367);
2626 }