Merge "removed group non-boneded call with verlet scheme" into release-4-6
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_sse4_1_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse4_1_single.h"
34 #include "kernelutil_x86_sse4_1_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse4_1_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse4_1_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwioffset3;
76     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
77     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
78     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
79     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
80     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
81     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
82     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
83     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
84     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
85     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
86     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
89     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
92     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
93     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
94     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
95     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
96     real             *charge;
97     int              nvdwtype;
98     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99     int              *vdwtype;
100     real             *vdwparam;
101     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
102     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
103     __m128i          ewitab;
104     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
105     real             *ewtab;
106     __m128           dummy_mask,cutoff_mask;
107     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
108     __m128           one     = _mm_set1_ps(1.0);
109     __m128           two     = _mm_set1_ps(2.0);
110     x                = xx[0];
111     f                = ff[0];
112
113     nri              = nlist->nri;
114     iinr             = nlist->iinr;
115     jindex           = nlist->jindex;
116     jjnr             = nlist->jjnr;
117     shiftidx         = nlist->shift;
118     gid              = nlist->gid;
119     shiftvec         = fr->shift_vec[0];
120     fshift           = fr->fshift[0];
121     facel            = _mm_set1_ps(fr->epsfac);
122     charge           = mdatoms->chargeA;
123     nvdwtype         = fr->ntype;
124     vdwparam         = fr->nbfp;
125     vdwtype          = mdatoms->typeA;
126
127     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
128     ewtab            = fr->ic->tabq_coul_FDV0;
129     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
130     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
131
132     /* Setup water-specific parameters */
133     inr              = nlist->iinr[0];
134     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
135     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
136     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
137     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
138
139     jq1              = _mm_set1_ps(charge[inr+1]);
140     jq2              = _mm_set1_ps(charge[inr+2]);
141     jq3              = _mm_set1_ps(charge[inr+3]);
142     vdwjidx0A        = 2*vdwtype[inr+0];
143     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
144     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
145     qq11             = _mm_mul_ps(iq1,jq1);
146     qq12             = _mm_mul_ps(iq1,jq2);
147     qq13             = _mm_mul_ps(iq1,jq3);
148     qq21             = _mm_mul_ps(iq2,jq1);
149     qq22             = _mm_mul_ps(iq2,jq2);
150     qq23             = _mm_mul_ps(iq2,jq3);
151     qq31             = _mm_mul_ps(iq3,jq1);
152     qq32             = _mm_mul_ps(iq3,jq2);
153     qq33             = _mm_mul_ps(iq3,jq3);
154
155     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
156     rcutoff_scalar   = fr->rcoulomb;
157     rcutoff          = _mm_set1_ps(rcutoff_scalar);
158     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
159
160     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
161     rvdw             = _mm_set1_ps(fr->rvdw);
162
163     /* Avoid stupid compiler warnings */
164     jnrA = jnrB = jnrC = jnrD = 0;
165     j_coord_offsetA = 0;
166     j_coord_offsetB = 0;
167     j_coord_offsetC = 0;
168     j_coord_offsetD = 0;
169
170     outeriter        = 0;
171     inneriter        = 0;
172
173     for(iidx=0;iidx<4*DIM;iidx++)
174     {
175         scratch[iidx] = 0.0;
176     }
177
178     /* Start outer loop over neighborlists */
179     for(iidx=0; iidx<nri; iidx++)
180     {
181         /* Load shift vector for this list */
182         i_shift_offset   = DIM*shiftidx[iidx];
183
184         /* Load limits for loop over neighbors */
185         j_index_start    = jindex[iidx];
186         j_index_end      = jindex[iidx+1];
187
188         /* Get outer coordinate index */
189         inr              = iinr[iidx];
190         i_coord_offset   = DIM*inr;
191
192         /* Load i particle coords and add shift vector */
193         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
194                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
195
196         fix0             = _mm_setzero_ps();
197         fiy0             = _mm_setzero_ps();
198         fiz0             = _mm_setzero_ps();
199         fix1             = _mm_setzero_ps();
200         fiy1             = _mm_setzero_ps();
201         fiz1             = _mm_setzero_ps();
202         fix2             = _mm_setzero_ps();
203         fiy2             = _mm_setzero_ps();
204         fiz2             = _mm_setzero_ps();
205         fix3             = _mm_setzero_ps();
206         fiy3             = _mm_setzero_ps();
207         fiz3             = _mm_setzero_ps();
208
209         /* Reset potential sums */
210         velecsum         = _mm_setzero_ps();
211         vvdwsum          = _mm_setzero_ps();
212
213         /* Start inner kernel loop */
214         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
215         {
216
217             /* Get j neighbor index, and coordinate index */
218             jnrA             = jjnr[jidx];
219             jnrB             = jjnr[jidx+1];
220             jnrC             = jjnr[jidx+2];
221             jnrD             = jjnr[jidx+3];
222             j_coord_offsetA  = DIM*jnrA;
223             j_coord_offsetB  = DIM*jnrB;
224             j_coord_offsetC  = DIM*jnrC;
225             j_coord_offsetD  = DIM*jnrD;
226
227             /* load j atom coordinates */
228             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
229                                               x+j_coord_offsetC,x+j_coord_offsetD,
230                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
231                                               &jy2,&jz2,&jx3,&jy3,&jz3);
232
233             /* Calculate displacement vector */
234             dx00             = _mm_sub_ps(ix0,jx0);
235             dy00             = _mm_sub_ps(iy0,jy0);
236             dz00             = _mm_sub_ps(iz0,jz0);
237             dx11             = _mm_sub_ps(ix1,jx1);
238             dy11             = _mm_sub_ps(iy1,jy1);
239             dz11             = _mm_sub_ps(iz1,jz1);
240             dx12             = _mm_sub_ps(ix1,jx2);
241             dy12             = _mm_sub_ps(iy1,jy2);
242             dz12             = _mm_sub_ps(iz1,jz2);
243             dx13             = _mm_sub_ps(ix1,jx3);
244             dy13             = _mm_sub_ps(iy1,jy3);
245             dz13             = _mm_sub_ps(iz1,jz3);
246             dx21             = _mm_sub_ps(ix2,jx1);
247             dy21             = _mm_sub_ps(iy2,jy1);
248             dz21             = _mm_sub_ps(iz2,jz1);
249             dx22             = _mm_sub_ps(ix2,jx2);
250             dy22             = _mm_sub_ps(iy2,jy2);
251             dz22             = _mm_sub_ps(iz2,jz2);
252             dx23             = _mm_sub_ps(ix2,jx3);
253             dy23             = _mm_sub_ps(iy2,jy3);
254             dz23             = _mm_sub_ps(iz2,jz3);
255             dx31             = _mm_sub_ps(ix3,jx1);
256             dy31             = _mm_sub_ps(iy3,jy1);
257             dz31             = _mm_sub_ps(iz3,jz1);
258             dx32             = _mm_sub_ps(ix3,jx2);
259             dy32             = _mm_sub_ps(iy3,jy2);
260             dz32             = _mm_sub_ps(iz3,jz2);
261             dx33             = _mm_sub_ps(ix3,jx3);
262             dy33             = _mm_sub_ps(iy3,jy3);
263             dz33             = _mm_sub_ps(iz3,jz3);
264
265             /* Calculate squared distance and things based on it */
266             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
267             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
268             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
269             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
270             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
271             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
272             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
273             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
274             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
275             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
276
277             rinv11           = gmx_mm_invsqrt_ps(rsq11);
278             rinv12           = gmx_mm_invsqrt_ps(rsq12);
279             rinv13           = gmx_mm_invsqrt_ps(rsq13);
280             rinv21           = gmx_mm_invsqrt_ps(rsq21);
281             rinv22           = gmx_mm_invsqrt_ps(rsq22);
282             rinv23           = gmx_mm_invsqrt_ps(rsq23);
283             rinv31           = gmx_mm_invsqrt_ps(rsq31);
284             rinv32           = gmx_mm_invsqrt_ps(rsq32);
285             rinv33           = gmx_mm_invsqrt_ps(rsq33);
286
287             rinvsq00         = gmx_mm_inv_ps(rsq00);
288             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
289             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
290             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
291             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
292             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
293             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
294             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
295             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
296             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
297
298             fjx0             = _mm_setzero_ps();
299             fjy0             = _mm_setzero_ps();
300             fjz0             = _mm_setzero_ps();
301             fjx1             = _mm_setzero_ps();
302             fjy1             = _mm_setzero_ps();
303             fjz1             = _mm_setzero_ps();
304             fjx2             = _mm_setzero_ps();
305             fjy2             = _mm_setzero_ps();
306             fjz2             = _mm_setzero_ps();
307             fjx3             = _mm_setzero_ps();
308             fjy3             = _mm_setzero_ps();
309             fjz3             = _mm_setzero_ps();
310
311             /**************************
312              * CALCULATE INTERACTIONS *
313              **************************/
314
315             if (gmx_mm_any_lt(rsq00,rcutoff2))
316             {
317
318             /* LENNARD-JONES DISPERSION/REPULSION */
319
320             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
321             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
322             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
323             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
324                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
325             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
326
327             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
328
329             /* Update potential sum for this i atom from the interaction with this j atom. */
330             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
331             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
332
333             fscal            = fvdw;
334
335             fscal            = _mm_and_ps(fscal,cutoff_mask);
336
337             /* Calculate temporary vectorial force */
338             tx               = _mm_mul_ps(fscal,dx00);
339             ty               = _mm_mul_ps(fscal,dy00);
340             tz               = _mm_mul_ps(fscal,dz00);
341
342             /* Update vectorial force */
343             fix0             = _mm_add_ps(fix0,tx);
344             fiy0             = _mm_add_ps(fiy0,ty);
345             fiz0             = _mm_add_ps(fiz0,tz);
346
347             fjx0             = _mm_add_ps(fjx0,tx);
348             fjy0             = _mm_add_ps(fjy0,ty);
349             fjz0             = _mm_add_ps(fjz0,tz);
350
351             }
352
353             /**************************
354              * CALCULATE INTERACTIONS *
355              **************************/
356
357             if (gmx_mm_any_lt(rsq11,rcutoff2))
358             {
359
360             r11              = _mm_mul_ps(rsq11,rinv11);
361
362             /* EWALD ELECTROSTATICS */
363
364             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
365             ewrt             = _mm_mul_ps(r11,ewtabscale);
366             ewitab           = _mm_cvttps_epi32(ewrt);
367             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
368             ewitab           = _mm_slli_epi32(ewitab,2);
369             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
370             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
371             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
372             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
373             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
374             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
375             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
376             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
377             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
378
379             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
380
381             /* Update potential sum for this i atom from the interaction with this j atom. */
382             velec            = _mm_and_ps(velec,cutoff_mask);
383             velecsum         = _mm_add_ps(velecsum,velec);
384
385             fscal            = felec;
386
387             fscal            = _mm_and_ps(fscal,cutoff_mask);
388
389             /* Calculate temporary vectorial force */
390             tx               = _mm_mul_ps(fscal,dx11);
391             ty               = _mm_mul_ps(fscal,dy11);
392             tz               = _mm_mul_ps(fscal,dz11);
393
394             /* Update vectorial force */
395             fix1             = _mm_add_ps(fix1,tx);
396             fiy1             = _mm_add_ps(fiy1,ty);
397             fiz1             = _mm_add_ps(fiz1,tz);
398
399             fjx1             = _mm_add_ps(fjx1,tx);
400             fjy1             = _mm_add_ps(fjy1,ty);
401             fjz1             = _mm_add_ps(fjz1,tz);
402
403             }
404
405             /**************************
406              * CALCULATE INTERACTIONS *
407              **************************/
408
409             if (gmx_mm_any_lt(rsq12,rcutoff2))
410             {
411
412             r12              = _mm_mul_ps(rsq12,rinv12);
413
414             /* EWALD ELECTROSTATICS */
415
416             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
417             ewrt             = _mm_mul_ps(r12,ewtabscale);
418             ewitab           = _mm_cvttps_epi32(ewrt);
419             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
420             ewitab           = _mm_slli_epi32(ewitab,2);
421             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
422             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
423             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
424             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
425             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
426             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
427             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
428             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
429             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
430
431             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
432
433             /* Update potential sum for this i atom from the interaction with this j atom. */
434             velec            = _mm_and_ps(velec,cutoff_mask);
435             velecsum         = _mm_add_ps(velecsum,velec);
436
437             fscal            = felec;
438
439             fscal            = _mm_and_ps(fscal,cutoff_mask);
440
441             /* Calculate temporary vectorial force */
442             tx               = _mm_mul_ps(fscal,dx12);
443             ty               = _mm_mul_ps(fscal,dy12);
444             tz               = _mm_mul_ps(fscal,dz12);
445
446             /* Update vectorial force */
447             fix1             = _mm_add_ps(fix1,tx);
448             fiy1             = _mm_add_ps(fiy1,ty);
449             fiz1             = _mm_add_ps(fiz1,tz);
450
451             fjx2             = _mm_add_ps(fjx2,tx);
452             fjy2             = _mm_add_ps(fjy2,ty);
453             fjz2             = _mm_add_ps(fjz2,tz);
454
455             }
456
457             /**************************
458              * CALCULATE INTERACTIONS *
459              **************************/
460
461             if (gmx_mm_any_lt(rsq13,rcutoff2))
462             {
463
464             r13              = _mm_mul_ps(rsq13,rinv13);
465
466             /* EWALD ELECTROSTATICS */
467
468             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
469             ewrt             = _mm_mul_ps(r13,ewtabscale);
470             ewitab           = _mm_cvttps_epi32(ewrt);
471             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
472             ewitab           = _mm_slli_epi32(ewitab,2);
473             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
474             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
475             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
476             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
477             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
478             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
479             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
480             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
481             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
482
483             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
484
485             /* Update potential sum for this i atom from the interaction with this j atom. */
486             velec            = _mm_and_ps(velec,cutoff_mask);
487             velecsum         = _mm_add_ps(velecsum,velec);
488
489             fscal            = felec;
490
491             fscal            = _mm_and_ps(fscal,cutoff_mask);
492
493             /* Calculate temporary vectorial force */
494             tx               = _mm_mul_ps(fscal,dx13);
495             ty               = _mm_mul_ps(fscal,dy13);
496             tz               = _mm_mul_ps(fscal,dz13);
497
498             /* Update vectorial force */
499             fix1             = _mm_add_ps(fix1,tx);
500             fiy1             = _mm_add_ps(fiy1,ty);
501             fiz1             = _mm_add_ps(fiz1,tz);
502
503             fjx3             = _mm_add_ps(fjx3,tx);
504             fjy3             = _mm_add_ps(fjy3,ty);
505             fjz3             = _mm_add_ps(fjz3,tz);
506
507             }
508
509             /**************************
510              * CALCULATE INTERACTIONS *
511              **************************/
512
513             if (gmx_mm_any_lt(rsq21,rcutoff2))
514             {
515
516             r21              = _mm_mul_ps(rsq21,rinv21);
517
518             /* EWALD ELECTROSTATICS */
519
520             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
521             ewrt             = _mm_mul_ps(r21,ewtabscale);
522             ewitab           = _mm_cvttps_epi32(ewrt);
523             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
524             ewitab           = _mm_slli_epi32(ewitab,2);
525             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
526             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
527             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
528             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
529             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
530             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
531             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
532             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
533             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
534
535             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
536
537             /* Update potential sum for this i atom from the interaction with this j atom. */
538             velec            = _mm_and_ps(velec,cutoff_mask);
539             velecsum         = _mm_add_ps(velecsum,velec);
540
541             fscal            = felec;
542
543             fscal            = _mm_and_ps(fscal,cutoff_mask);
544
545             /* Calculate temporary vectorial force */
546             tx               = _mm_mul_ps(fscal,dx21);
547             ty               = _mm_mul_ps(fscal,dy21);
548             tz               = _mm_mul_ps(fscal,dz21);
549
550             /* Update vectorial force */
551             fix2             = _mm_add_ps(fix2,tx);
552             fiy2             = _mm_add_ps(fiy2,ty);
553             fiz2             = _mm_add_ps(fiz2,tz);
554
555             fjx1             = _mm_add_ps(fjx1,tx);
556             fjy1             = _mm_add_ps(fjy1,ty);
557             fjz1             = _mm_add_ps(fjz1,tz);
558
559             }
560
561             /**************************
562              * CALCULATE INTERACTIONS *
563              **************************/
564
565             if (gmx_mm_any_lt(rsq22,rcutoff2))
566             {
567
568             r22              = _mm_mul_ps(rsq22,rinv22);
569
570             /* EWALD ELECTROSTATICS */
571
572             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
573             ewrt             = _mm_mul_ps(r22,ewtabscale);
574             ewitab           = _mm_cvttps_epi32(ewrt);
575             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
576             ewitab           = _mm_slli_epi32(ewitab,2);
577             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
578             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
579             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
580             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
581             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
582             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
583             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
584             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
585             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
586
587             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
588
589             /* Update potential sum for this i atom from the interaction with this j atom. */
590             velec            = _mm_and_ps(velec,cutoff_mask);
591             velecsum         = _mm_add_ps(velecsum,velec);
592
593             fscal            = felec;
594
595             fscal            = _mm_and_ps(fscal,cutoff_mask);
596
597             /* Calculate temporary vectorial force */
598             tx               = _mm_mul_ps(fscal,dx22);
599             ty               = _mm_mul_ps(fscal,dy22);
600             tz               = _mm_mul_ps(fscal,dz22);
601
602             /* Update vectorial force */
603             fix2             = _mm_add_ps(fix2,tx);
604             fiy2             = _mm_add_ps(fiy2,ty);
605             fiz2             = _mm_add_ps(fiz2,tz);
606
607             fjx2             = _mm_add_ps(fjx2,tx);
608             fjy2             = _mm_add_ps(fjy2,ty);
609             fjz2             = _mm_add_ps(fjz2,tz);
610
611             }
612
613             /**************************
614              * CALCULATE INTERACTIONS *
615              **************************/
616
617             if (gmx_mm_any_lt(rsq23,rcutoff2))
618             {
619
620             r23              = _mm_mul_ps(rsq23,rinv23);
621
622             /* EWALD ELECTROSTATICS */
623
624             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
625             ewrt             = _mm_mul_ps(r23,ewtabscale);
626             ewitab           = _mm_cvttps_epi32(ewrt);
627             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
628             ewitab           = _mm_slli_epi32(ewitab,2);
629             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
630             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
631             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
632             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
633             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
634             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
635             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
636             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
637             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
638
639             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
640
641             /* Update potential sum for this i atom from the interaction with this j atom. */
642             velec            = _mm_and_ps(velec,cutoff_mask);
643             velecsum         = _mm_add_ps(velecsum,velec);
644
645             fscal            = felec;
646
647             fscal            = _mm_and_ps(fscal,cutoff_mask);
648
649             /* Calculate temporary vectorial force */
650             tx               = _mm_mul_ps(fscal,dx23);
651             ty               = _mm_mul_ps(fscal,dy23);
652             tz               = _mm_mul_ps(fscal,dz23);
653
654             /* Update vectorial force */
655             fix2             = _mm_add_ps(fix2,tx);
656             fiy2             = _mm_add_ps(fiy2,ty);
657             fiz2             = _mm_add_ps(fiz2,tz);
658
659             fjx3             = _mm_add_ps(fjx3,tx);
660             fjy3             = _mm_add_ps(fjy3,ty);
661             fjz3             = _mm_add_ps(fjz3,tz);
662
663             }
664
665             /**************************
666              * CALCULATE INTERACTIONS *
667              **************************/
668
669             if (gmx_mm_any_lt(rsq31,rcutoff2))
670             {
671
672             r31              = _mm_mul_ps(rsq31,rinv31);
673
674             /* EWALD ELECTROSTATICS */
675
676             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
677             ewrt             = _mm_mul_ps(r31,ewtabscale);
678             ewitab           = _mm_cvttps_epi32(ewrt);
679             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
680             ewitab           = _mm_slli_epi32(ewitab,2);
681             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
682             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
683             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
684             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
685             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
686             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
687             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
688             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
689             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
690
691             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
692
693             /* Update potential sum for this i atom from the interaction with this j atom. */
694             velec            = _mm_and_ps(velec,cutoff_mask);
695             velecsum         = _mm_add_ps(velecsum,velec);
696
697             fscal            = felec;
698
699             fscal            = _mm_and_ps(fscal,cutoff_mask);
700
701             /* Calculate temporary vectorial force */
702             tx               = _mm_mul_ps(fscal,dx31);
703             ty               = _mm_mul_ps(fscal,dy31);
704             tz               = _mm_mul_ps(fscal,dz31);
705
706             /* Update vectorial force */
707             fix3             = _mm_add_ps(fix3,tx);
708             fiy3             = _mm_add_ps(fiy3,ty);
709             fiz3             = _mm_add_ps(fiz3,tz);
710
711             fjx1             = _mm_add_ps(fjx1,tx);
712             fjy1             = _mm_add_ps(fjy1,ty);
713             fjz1             = _mm_add_ps(fjz1,tz);
714
715             }
716
717             /**************************
718              * CALCULATE INTERACTIONS *
719              **************************/
720
721             if (gmx_mm_any_lt(rsq32,rcutoff2))
722             {
723
724             r32              = _mm_mul_ps(rsq32,rinv32);
725
726             /* EWALD ELECTROSTATICS */
727
728             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
729             ewrt             = _mm_mul_ps(r32,ewtabscale);
730             ewitab           = _mm_cvttps_epi32(ewrt);
731             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
732             ewitab           = _mm_slli_epi32(ewitab,2);
733             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
734             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
735             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
736             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
737             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
738             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
739             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
740             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
741             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
742
743             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
744
745             /* Update potential sum for this i atom from the interaction with this j atom. */
746             velec            = _mm_and_ps(velec,cutoff_mask);
747             velecsum         = _mm_add_ps(velecsum,velec);
748
749             fscal            = felec;
750
751             fscal            = _mm_and_ps(fscal,cutoff_mask);
752
753             /* Calculate temporary vectorial force */
754             tx               = _mm_mul_ps(fscal,dx32);
755             ty               = _mm_mul_ps(fscal,dy32);
756             tz               = _mm_mul_ps(fscal,dz32);
757
758             /* Update vectorial force */
759             fix3             = _mm_add_ps(fix3,tx);
760             fiy3             = _mm_add_ps(fiy3,ty);
761             fiz3             = _mm_add_ps(fiz3,tz);
762
763             fjx2             = _mm_add_ps(fjx2,tx);
764             fjy2             = _mm_add_ps(fjy2,ty);
765             fjz2             = _mm_add_ps(fjz2,tz);
766
767             }
768
769             /**************************
770              * CALCULATE INTERACTIONS *
771              **************************/
772
773             if (gmx_mm_any_lt(rsq33,rcutoff2))
774             {
775
776             r33              = _mm_mul_ps(rsq33,rinv33);
777
778             /* EWALD ELECTROSTATICS */
779
780             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
781             ewrt             = _mm_mul_ps(r33,ewtabscale);
782             ewitab           = _mm_cvttps_epi32(ewrt);
783             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
784             ewitab           = _mm_slli_epi32(ewitab,2);
785             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
786             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
787             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
788             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
789             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
790             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
791             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
792             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
793             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
794
795             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
796
797             /* Update potential sum for this i atom from the interaction with this j atom. */
798             velec            = _mm_and_ps(velec,cutoff_mask);
799             velecsum         = _mm_add_ps(velecsum,velec);
800
801             fscal            = felec;
802
803             fscal            = _mm_and_ps(fscal,cutoff_mask);
804
805             /* Calculate temporary vectorial force */
806             tx               = _mm_mul_ps(fscal,dx33);
807             ty               = _mm_mul_ps(fscal,dy33);
808             tz               = _mm_mul_ps(fscal,dz33);
809
810             /* Update vectorial force */
811             fix3             = _mm_add_ps(fix3,tx);
812             fiy3             = _mm_add_ps(fiy3,ty);
813             fiz3             = _mm_add_ps(fiz3,tz);
814
815             fjx3             = _mm_add_ps(fjx3,tx);
816             fjy3             = _mm_add_ps(fjy3,ty);
817             fjz3             = _mm_add_ps(fjz3,tz);
818
819             }
820
821             fjptrA             = f+j_coord_offsetA;
822             fjptrB             = f+j_coord_offsetB;
823             fjptrC             = f+j_coord_offsetC;
824             fjptrD             = f+j_coord_offsetD;
825
826             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
827                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
828                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
829
830             /* Inner loop uses 458 flops */
831         }
832
833         if(jidx<j_index_end)
834         {
835
836             /* Get j neighbor index, and coordinate index */
837             jnrlistA         = jjnr[jidx];
838             jnrlistB         = jjnr[jidx+1];
839             jnrlistC         = jjnr[jidx+2];
840             jnrlistD         = jjnr[jidx+3];
841             /* Sign of each element will be negative for non-real atoms.
842              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
843              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
844              */
845             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
846             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
847             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
848             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
849             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
850             j_coord_offsetA  = DIM*jnrA;
851             j_coord_offsetB  = DIM*jnrB;
852             j_coord_offsetC  = DIM*jnrC;
853             j_coord_offsetD  = DIM*jnrD;
854
855             /* load j atom coordinates */
856             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
857                                               x+j_coord_offsetC,x+j_coord_offsetD,
858                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
859                                               &jy2,&jz2,&jx3,&jy3,&jz3);
860
861             /* Calculate displacement vector */
862             dx00             = _mm_sub_ps(ix0,jx0);
863             dy00             = _mm_sub_ps(iy0,jy0);
864             dz00             = _mm_sub_ps(iz0,jz0);
865             dx11             = _mm_sub_ps(ix1,jx1);
866             dy11             = _mm_sub_ps(iy1,jy1);
867             dz11             = _mm_sub_ps(iz1,jz1);
868             dx12             = _mm_sub_ps(ix1,jx2);
869             dy12             = _mm_sub_ps(iy1,jy2);
870             dz12             = _mm_sub_ps(iz1,jz2);
871             dx13             = _mm_sub_ps(ix1,jx3);
872             dy13             = _mm_sub_ps(iy1,jy3);
873             dz13             = _mm_sub_ps(iz1,jz3);
874             dx21             = _mm_sub_ps(ix2,jx1);
875             dy21             = _mm_sub_ps(iy2,jy1);
876             dz21             = _mm_sub_ps(iz2,jz1);
877             dx22             = _mm_sub_ps(ix2,jx2);
878             dy22             = _mm_sub_ps(iy2,jy2);
879             dz22             = _mm_sub_ps(iz2,jz2);
880             dx23             = _mm_sub_ps(ix2,jx3);
881             dy23             = _mm_sub_ps(iy2,jy3);
882             dz23             = _mm_sub_ps(iz2,jz3);
883             dx31             = _mm_sub_ps(ix3,jx1);
884             dy31             = _mm_sub_ps(iy3,jy1);
885             dz31             = _mm_sub_ps(iz3,jz1);
886             dx32             = _mm_sub_ps(ix3,jx2);
887             dy32             = _mm_sub_ps(iy3,jy2);
888             dz32             = _mm_sub_ps(iz3,jz2);
889             dx33             = _mm_sub_ps(ix3,jx3);
890             dy33             = _mm_sub_ps(iy3,jy3);
891             dz33             = _mm_sub_ps(iz3,jz3);
892
893             /* Calculate squared distance and things based on it */
894             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
895             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
896             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
897             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
898             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
899             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
900             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
901             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
902             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
903             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
904
905             rinv11           = gmx_mm_invsqrt_ps(rsq11);
906             rinv12           = gmx_mm_invsqrt_ps(rsq12);
907             rinv13           = gmx_mm_invsqrt_ps(rsq13);
908             rinv21           = gmx_mm_invsqrt_ps(rsq21);
909             rinv22           = gmx_mm_invsqrt_ps(rsq22);
910             rinv23           = gmx_mm_invsqrt_ps(rsq23);
911             rinv31           = gmx_mm_invsqrt_ps(rsq31);
912             rinv32           = gmx_mm_invsqrt_ps(rsq32);
913             rinv33           = gmx_mm_invsqrt_ps(rsq33);
914
915             rinvsq00         = gmx_mm_inv_ps(rsq00);
916             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
917             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
918             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
919             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
920             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
921             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
922             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
923             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
924             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
925
926             fjx0             = _mm_setzero_ps();
927             fjy0             = _mm_setzero_ps();
928             fjz0             = _mm_setzero_ps();
929             fjx1             = _mm_setzero_ps();
930             fjy1             = _mm_setzero_ps();
931             fjz1             = _mm_setzero_ps();
932             fjx2             = _mm_setzero_ps();
933             fjy2             = _mm_setzero_ps();
934             fjz2             = _mm_setzero_ps();
935             fjx3             = _mm_setzero_ps();
936             fjy3             = _mm_setzero_ps();
937             fjz3             = _mm_setzero_ps();
938
939             /**************************
940              * CALCULATE INTERACTIONS *
941              **************************/
942
943             if (gmx_mm_any_lt(rsq00,rcutoff2))
944             {
945
946             /* LENNARD-JONES DISPERSION/REPULSION */
947
948             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
949             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
950             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
951             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
952                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
953             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
954
955             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
956
957             /* Update potential sum for this i atom from the interaction with this j atom. */
958             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
959             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
960             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
961
962             fscal            = fvdw;
963
964             fscal            = _mm_and_ps(fscal,cutoff_mask);
965
966             fscal            = _mm_andnot_ps(dummy_mask,fscal);
967
968             /* Calculate temporary vectorial force */
969             tx               = _mm_mul_ps(fscal,dx00);
970             ty               = _mm_mul_ps(fscal,dy00);
971             tz               = _mm_mul_ps(fscal,dz00);
972
973             /* Update vectorial force */
974             fix0             = _mm_add_ps(fix0,tx);
975             fiy0             = _mm_add_ps(fiy0,ty);
976             fiz0             = _mm_add_ps(fiz0,tz);
977
978             fjx0             = _mm_add_ps(fjx0,tx);
979             fjy0             = _mm_add_ps(fjy0,ty);
980             fjz0             = _mm_add_ps(fjz0,tz);
981
982             }
983
984             /**************************
985              * CALCULATE INTERACTIONS *
986              **************************/
987
988             if (gmx_mm_any_lt(rsq11,rcutoff2))
989             {
990
991             r11              = _mm_mul_ps(rsq11,rinv11);
992             r11              = _mm_andnot_ps(dummy_mask,r11);
993
994             /* EWALD ELECTROSTATICS */
995
996             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
997             ewrt             = _mm_mul_ps(r11,ewtabscale);
998             ewitab           = _mm_cvttps_epi32(ewrt);
999             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1000             ewitab           = _mm_slli_epi32(ewitab,2);
1001             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1002             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1003             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1004             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1005             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1006             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1007             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1008             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1009             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1010
1011             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1012
1013             /* Update potential sum for this i atom from the interaction with this j atom. */
1014             velec            = _mm_and_ps(velec,cutoff_mask);
1015             velec            = _mm_andnot_ps(dummy_mask,velec);
1016             velecsum         = _mm_add_ps(velecsum,velec);
1017
1018             fscal            = felec;
1019
1020             fscal            = _mm_and_ps(fscal,cutoff_mask);
1021
1022             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1023
1024             /* Calculate temporary vectorial force */
1025             tx               = _mm_mul_ps(fscal,dx11);
1026             ty               = _mm_mul_ps(fscal,dy11);
1027             tz               = _mm_mul_ps(fscal,dz11);
1028
1029             /* Update vectorial force */
1030             fix1             = _mm_add_ps(fix1,tx);
1031             fiy1             = _mm_add_ps(fiy1,ty);
1032             fiz1             = _mm_add_ps(fiz1,tz);
1033
1034             fjx1             = _mm_add_ps(fjx1,tx);
1035             fjy1             = _mm_add_ps(fjy1,ty);
1036             fjz1             = _mm_add_ps(fjz1,tz);
1037
1038             }
1039
1040             /**************************
1041              * CALCULATE INTERACTIONS *
1042              **************************/
1043
1044             if (gmx_mm_any_lt(rsq12,rcutoff2))
1045             {
1046
1047             r12              = _mm_mul_ps(rsq12,rinv12);
1048             r12              = _mm_andnot_ps(dummy_mask,r12);
1049
1050             /* EWALD ELECTROSTATICS */
1051
1052             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1053             ewrt             = _mm_mul_ps(r12,ewtabscale);
1054             ewitab           = _mm_cvttps_epi32(ewrt);
1055             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1056             ewitab           = _mm_slli_epi32(ewitab,2);
1057             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1058             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1059             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1060             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1061             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1062             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1063             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1064             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1065             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1066
1067             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1068
1069             /* Update potential sum for this i atom from the interaction with this j atom. */
1070             velec            = _mm_and_ps(velec,cutoff_mask);
1071             velec            = _mm_andnot_ps(dummy_mask,velec);
1072             velecsum         = _mm_add_ps(velecsum,velec);
1073
1074             fscal            = felec;
1075
1076             fscal            = _mm_and_ps(fscal,cutoff_mask);
1077
1078             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1079
1080             /* Calculate temporary vectorial force */
1081             tx               = _mm_mul_ps(fscal,dx12);
1082             ty               = _mm_mul_ps(fscal,dy12);
1083             tz               = _mm_mul_ps(fscal,dz12);
1084
1085             /* Update vectorial force */
1086             fix1             = _mm_add_ps(fix1,tx);
1087             fiy1             = _mm_add_ps(fiy1,ty);
1088             fiz1             = _mm_add_ps(fiz1,tz);
1089
1090             fjx2             = _mm_add_ps(fjx2,tx);
1091             fjy2             = _mm_add_ps(fjy2,ty);
1092             fjz2             = _mm_add_ps(fjz2,tz);
1093
1094             }
1095
1096             /**************************
1097              * CALCULATE INTERACTIONS *
1098              **************************/
1099
1100             if (gmx_mm_any_lt(rsq13,rcutoff2))
1101             {
1102
1103             r13              = _mm_mul_ps(rsq13,rinv13);
1104             r13              = _mm_andnot_ps(dummy_mask,r13);
1105
1106             /* EWALD ELECTROSTATICS */
1107
1108             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1109             ewrt             = _mm_mul_ps(r13,ewtabscale);
1110             ewitab           = _mm_cvttps_epi32(ewrt);
1111             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1112             ewitab           = _mm_slli_epi32(ewitab,2);
1113             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1114             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1115             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1116             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1117             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1118             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1119             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1120             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
1121             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1122
1123             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1124
1125             /* Update potential sum for this i atom from the interaction with this j atom. */
1126             velec            = _mm_and_ps(velec,cutoff_mask);
1127             velec            = _mm_andnot_ps(dummy_mask,velec);
1128             velecsum         = _mm_add_ps(velecsum,velec);
1129
1130             fscal            = felec;
1131
1132             fscal            = _mm_and_ps(fscal,cutoff_mask);
1133
1134             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1135
1136             /* Calculate temporary vectorial force */
1137             tx               = _mm_mul_ps(fscal,dx13);
1138             ty               = _mm_mul_ps(fscal,dy13);
1139             tz               = _mm_mul_ps(fscal,dz13);
1140
1141             /* Update vectorial force */
1142             fix1             = _mm_add_ps(fix1,tx);
1143             fiy1             = _mm_add_ps(fiy1,ty);
1144             fiz1             = _mm_add_ps(fiz1,tz);
1145
1146             fjx3             = _mm_add_ps(fjx3,tx);
1147             fjy3             = _mm_add_ps(fjy3,ty);
1148             fjz3             = _mm_add_ps(fjz3,tz);
1149
1150             }
1151
1152             /**************************
1153              * CALCULATE INTERACTIONS *
1154              **************************/
1155
1156             if (gmx_mm_any_lt(rsq21,rcutoff2))
1157             {
1158
1159             r21              = _mm_mul_ps(rsq21,rinv21);
1160             r21              = _mm_andnot_ps(dummy_mask,r21);
1161
1162             /* EWALD ELECTROSTATICS */
1163
1164             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1165             ewrt             = _mm_mul_ps(r21,ewtabscale);
1166             ewitab           = _mm_cvttps_epi32(ewrt);
1167             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1168             ewitab           = _mm_slli_epi32(ewitab,2);
1169             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1170             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1171             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1172             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1173             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1174             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1175             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1176             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1177             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1178
1179             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1180
1181             /* Update potential sum for this i atom from the interaction with this j atom. */
1182             velec            = _mm_and_ps(velec,cutoff_mask);
1183             velec            = _mm_andnot_ps(dummy_mask,velec);
1184             velecsum         = _mm_add_ps(velecsum,velec);
1185
1186             fscal            = felec;
1187
1188             fscal            = _mm_and_ps(fscal,cutoff_mask);
1189
1190             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1191
1192             /* Calculate temporary vectorial force */
1193             tx               = _mm_mul_ps(fscal,dx21);
1194             ty               = _mm_mul_ps(fscal,dy21);
1195             tz               = _mm_mul_ps(fscal,dz21);
1196
1197             /* Update vectorial force */
1198             fix2             = _mm_add_ps(fix2,tx);
1199             fiy2             = _mm_add_ps(fiy2,ty);
1200             fiz2             = _mm_add_ps(fiz2,tz);
1201
1202             fjx1             = _mm_add_ps(fjx1,tx);
1203             fjy1             = _mm_add_ps(fjy1,ty);
1204             fjz1             = _mm_add_ps(fjz1,tz);
1205
1206             }
1207
1208             /**************************
1209              * CALCULATE INTERACTIONS *
1210              **************************/
1211
1212             if (gmx_mm_any_lt(rsq22,rcutoff2))
1213             {
1214
1215             r22              = _mm_mul_ps(rsq22,rinv22);
1216             r22              = _mm_andnot_ps(dummy_mask,r22);
1217
1218             /* EWALD ELECTROSTATICS */
1219
1220             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1221             ewrt             = _mm_mul_ps(r22,ewtabscale);
1222             ewitab           = _mm_cvttps_epi32(ewrt);
1223             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1224             ewitab           = _mm_slli_epi32(ewitab,2);
1225             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1226             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1227             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1228             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1229             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1230             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1231             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1232             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1233             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1234
1235             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1236
1237             /* Update potential sum for this i atom from the interaction with this j atom. */
1238             velec            = _mm_and_ps(velec,cutoff_mask);
1239             velec            = _mm_andnot_ps(dummy_mask,velec);
1240             velecsum         = _mm_add_ps(velecsum,velec);
1241
1242             fscal            = felec;
1243
1244             fscal            = _mm_and_ps(fscal,cutoff_mask);
1245
1246             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1247
1248             /* Calculate temporary vectorial force */
1249             tx               = _mm_mul_ps(fscal,dx22);
1250             ty               = _mm_mul_ps(fscal,dy22);
1251             tz               = _mm_mul_ps(fscal,dz22);
1252
1253             /* Update vectorial force */
1254             fix2             = _mm_add_ps(fix2,tx);
1255             fiy2             = _mm_add_ps(fiy2,ty);
1256             fiz2             = _mm_add_ps(fiz2,tz);
1257
1258             fjx2             = _mm_add_ps(fjx2,tx);
1259             fjy2             = _mm_add_ps(fjy2,ty);
1260             fjz2             = _mm_add_ps(fjz2,tz);
1261
1262             }
1263
1264             /**************************
1265              * CALCULATE INTERACTIONS *
1266              **************************/
1267
1268             if (gmx_mm_any_lt(rsq23,rcutoff2))
1269             {
1270
1271             r23              = _mm_mul_ps(rsq23,rinv23);
1272             r23              = _mm_andnot_ps(dummy_mask,r23);
1273
1274             /* EWALD ELECTROSTATICS */
1275
1276             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1277             ewrt             = _mm_mul_ps(r23,ewtabscale);
1278             ewitab           = _mm_cvttps_epi32(ewrt);
1279             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1280             ewitab           = _mm_slli_epi32(ewitab,2);
1281             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1282             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1283             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1284             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1285             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1286             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1287             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1288             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
1289             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1290
1291             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1292
1293             /* Update potential sum for this i atom from the interaction with this j atom. */
1294             velec            = _mm_and_ps(velec,cutoff_mask);
1295             velec            = _mm_andnot_ps(dummy_mask,velec);
1296             velecsum         = _mm_add_ps(velecsum,velec);
1297
1298             fscal            = felec;
1299
1300             fscal            = _mm_and_ps(fscal,cutoff_mask);
1301
1302             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1303
1304             /* Calculate temporary vectorial force */
1305             tx               = _mm_mul_ps(fscal,dx23);
1306             ty               = _mm_mul_ps(fscal,dy23);
1307             tz               = _mm_mul_ps(fscal,dz23);
1308
1309             /* Update vectorial force */
1310             fix2             = _mm_add_ps(fix2,tx);
1311             fiy2             = _mm_add_ps(fiy2,ty);
1312             fiz2             = _mm_add_ps(fiz2,tz);
1313
1314             fjx3             = _mm_add_ps(fjx3,tx);
1315             fjy3             = _mm_add_ps(fjy3,ty);
1316             fjz3             = _mm_add_ps(fjz3,tz);
1317
1318             }
1319
1320             /**************************
1321              * CALCULATE INTERACTIONS *
1322              **************************/
1323
1324             if (gmx_mm_any_lt(rsq31,rcutoff2))
1325             {
1326
1327             r31              = _mm_mul_ps(rsq31,rinv31);
1328             r31              = _mm_andnot_ps(dummy_mask,r31);
1329
1330             /* EWALD ELECTROSTATICS */
1331
1332             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1333             ewrt             = _mm_mul_ps(r31,ewtabscale);
1334             ewitab           = _mm_cvttps_epi32(ewrt);
1335             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1336             ewitab           = _mm_slli_epi32(ewitab,2);
1337             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1338             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1339             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1340             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1341             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1342             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1343             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1344             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
1345             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1346
1347             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1348
1349             /* Update potential sum for this i atom from the interaction with this j atom. */
1350             velec            = _mm_and_ps(velec,cutoff_mask);
1351             velec            = _mm_andnot_ps(dummy_mask,velec);
1352             velecsum         = _mm_add_ps(velecsum,velec);
1353
1354             fscal            = felec;
1355
1356             fscal            = _mm_and_ps(fscal,cutoff_mask);
1357
1358             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1359
1360             /* Calculate temporary vectorial force */
1361             tx               = _mm_mul_ps(fscal,dx31);
1362             ty               = _mm_mul_ps(fscal,dy31);
1363             tz               = _mm_mul_ps(fscal,dz31);
1364
1365             /* Update vectorial force */
1366             fix3             = _mm_add_ps(fix3,tx);
1367             fiy3             = _mm_add_ps(fiy3,ty);
1368             fiz3             = _mm_add_ps(fiz3,tz);
1369
1370             fjx1             = _mm_add_ps(fjx1,tx);
1371             fjy1             = _mm_add_ps(fjy1,ty);
1372             fjz1             = _mm_add_ps(fjz1,tz);
1373
1374             }
1375
1376             /**************************
1377              * CALCULATE INTERACTIONS *
1378              **************************/
1379
1380             if (gmx_mm_any_lt(rsq32,rcutoff2))
1381             {
1382
1383             r32              = _mm_mul_ps(rsq32,rinv32);
1384             r32              = _mm_andnot_ps(dummy_mask,r32);
1385
1386             /* EWALD ELECTROSTATICS */
1387
1388             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1389             ewrt             = _mm_mul_ps(r32,ewtabscale);
1390             ewitab           = _mm_cvttps_epi32(ewrt);
1391             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1392             ewitab           = _mm_slli_epi32(ewitab,2);
1393             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1394             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1395             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1396             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1397             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1398             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1399             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1400             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
1401             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1402
1403             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1404
1405             /* Update potential sum for this i atom from the interaction with this j atom. */
1406             velec            = _mm_and_ps(velec,cutoff_mask);
1407             velec            = _mm_andnot_ps(dummy_mask,velec);
1408             velecsum         = _mm_add_ps(velecsum,velec);
1409
1410             fscal            = felec;
1411
1412             fscal            = _mm_and_ps(fscal,cutoff_mask);
1413
1414             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1415
1416             /* Calculate temporary vectorial force */
1417             tx               = _mm_mul_ps(fscal,dx32);
1418             ty               = _mm_mul_ps(fscal,dy32);
1419             tz               = _mm_mul_ps(fscal,dz32);
1420
1421             /* Update vectorial force */
1422             fix3             = _mm_add_ps(fix3,tx);
1423             fiy3             = _mm_add_ps(fiy3,ty);
1424             fiz3             = _mm_add_ps(fiz3,tz);
1425
1426             fjx2             = _mm_add_ps(fjx2,tx);
1427             fjy2             = _mm_add_ps(fjy2,ty);
1428             fjz2             = _mm_add_ps(fjz2,tz);
1429
1430             }
1431
1432             /**************************
1433              * CALCULATE INTERACTIONS *
1434              **************************/
1435
1436             if (gmx_mm_any_lt(rsq33,rcutoff2))
1437             {
1438
1439             r33              = _mm_mul_ps(rsq33,rinv33);
1440             r33              = _mm_andnot_ps(dummy_mask,r33);
1441
1442             /* EWALD ELECTROSTATICS */
1443
1444             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1445             ewrt             = _mm_mul_ps(r33,ewtabscale);
1446             ewitab           = _mm_cvttps_epi32(ewrt);
1447             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1448             ewitab           = _mm_slli_epi32(ewitab,2);
1449             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1450             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1451             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1452             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1453             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1454             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1455             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1456             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
1457             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1458
1459             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1460
1461             /* Update potential sum for this i atom from the interaction with this j atom. */
1462             velec            = _mm_and_ps(velec,cutoff_mask);
1463             velec            = _mm_andnot_ps(dummy_mask,velec);
1464             velecsum         = _mm_add_ps(velecsum,velec);
1465
1466             fscal            = felec;
1467
1468             fscal            = _mm_and_ps(fscal,cutoff_mask);
1469
1470             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1471
1472             /* Calculate temporary vectorial force */
1473             tx               = _mm_mul_ps(fscal,dx33);
1474             ty               = _mm_mul_ps(fscal,dy33);
1475             tz               = _mm_mul_ps(fscal,dz33);
1476
1477             /* Update vectorial force */
1478             fix3             = _mm_add_ps(fix3,tx);
1479             fiy3             = _mm_add_ps(fiy3,ty);
1480             fiz3             = _mm_add_ps(fiz3,tz);
1481
1482             fjx3             = _mm_add_ps(fjx3,tx);
1483             fjy3             = _mm_add_ps(fjy3,ty);
1484             fjz3             = _mm_add_ps(fjz3,tz);
1485
1486             }
1487
1488             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1489             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1490             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1491             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1492
1493             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1494                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1495                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1496
1497             /* Inner loop uses 467 flops */
1498         }
1499
1500         /* End of innermost loop */
1501
1502         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1503                                               f+i_coord_offset,fshift+i_shift_offset);
1504
1505         ggid                        = gid[iidx];
1506         /* Update potential energies */
1507         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1508         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1509
1510         /* Increment number of inner iterations */
1511         inneriter                  += j_index_end - j_index_start;
1512
1513         /* Outer loop uses 26 flops */
1514     }
1515
1516     /* Increment number of outer iterations */
1517     outeriter        += nri;
1518
1519     /* Update outer/inner flops */
1520
1521     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*467);
1522 }
1523 /*
1524  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse4_1_single
1525  * Electrostatics interaction: Ewald
1526  * VdW interaction:            LennardJones
1527  * Geometry:                   Water4-Water4
1528  * Calculate force/pot:        Force
1529  */
1530 void
1531 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse4_1_single
1532                     (t_nblist * gmx_restrict                nlist,
1533                      rvec * gmx_restrict                    xx,
1534                      rvec * gmx_restrict                    ff,
1535                      t_forcerec * gmx_restrict              fr,
1536                      t_mdatoms * gmx_restrict               mdatoms,
1537                      nb_kernel_data_t * gmx_restrict        kernel_data,
1538                      t_nrnb * gmx_restrict                  nrnb)
1539 {
1540     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1541      * just 0 for non-waters.
1542      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1543      * jnr indices corresponding to data put in the four positions in the SIMD register.
1544      */
1545     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1546     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1547     int              jnrA,jnrB,jnrC,jnrD;
1548     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1549     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1550     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1551     real             rcutoff_scalar;
1552     real             *shiftvec,*fshift,*x,*f;
1553     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1554     real             scratch[4*DIM];
1555     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1556     int              vdwioffset0;
1557     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1558     int              vdwioffset1;
1559     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1560     int              vdwioffset2;
1561     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1562     int              vdwioffset3;
1563     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1564     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1565     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1566     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1567     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1568     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1569     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1570     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1571     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1572     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1573     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1574     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1575     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1576     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1577     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1578     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1579     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1580     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1581     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1582     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1583     real             *charge;
1584     int              nvdwtype;
1585     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1586     int              *vdwtype;
1587     real             *vdwparam;
1588     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1589     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1590     __m128i          ewitab;
1591     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1592     real             *ewtab;
1593     __m128           dummy_mask,cutoff_mask;
1594     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1595     __m128           one     = _mm_set1_ps(1.0);
1596     __m128           two     = _mm_set1_ps(2.0);
1597     x                = xx[0];
1598     f                = ff[0];
1599
1600     nri              = nlist->nri;
1601     iinr             = nlist->iinr;
1602     jindex           = nlist->jindex;
1603     jjnr             = nlist->jjnr;
1604     shiftidx         = nlist->shift;
1605     gid              = nlist->gid;
1606     shiftvec         = fr->shift_vec[0];
1607     fshift           = fr->fshift[0];
1608     facel            = _mm_set1_ps(fr->epsfac);
1609     charge           = mdatoms->chargeA;
1610     nvdwtype         = fr->ntype;
1611     vdwparam         = fr->nbfp;
1612     vdwtype          = mdatoms->typeA;
1613
1614     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1615     ewtab            = fr->ic->tabq_coul_F;
1616     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1617     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1618
1619     /* Setup water-specific parameters */
1620     inr              = nlist->iinr[0];
1621     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1622     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1623     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1624     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1625
1626     jq1              = _mm_set1_ps(charge[inr+1]);
1627     jq2              = _mm_set1_ps(charge[inr+2]);
1628     jq3              = _mm_set1_ps(charge[inr+3]);
1629     vdwjidx0A        = 2*vdwtype[inr+0];
1630     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1631     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1632     qq11             = _mm_mul_ps(iq1,jq1);
1633     qq12             = _mm_mul_ps(iq1,jq2);
1634     qq13             = _mm_mul_ps(iq1,jq3);
1635     qq21             = _mm_mul_ps(iq2,jq1);
1636     qq22             = _mm_mul_ps(iq2,jq2);
1637     qq23             = _mm_mul_ps(iq2,jq3);
1638     qq31             = _mm_mul_ps(iq3,jq1);
1639     qq32             = _mm_mul_ps(iq3,jq2);
1640     qq33             = _mm_mul_ps(iq3,jq3);
1641
1642     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1643     rcutoff_scalar   = fr->rcoulomb;
1644     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1645     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1646
1647     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
1648     rvdw             = _mm_set1_ps(fr->rvdw);
1649
1650     /* Avoid stupid compiler warnings */
1651     jnrA = jnrB = jnrC = jnrD = 0;
1652     j_coord_offsetA = 0;
1653     j_coord_offsetB = 0;
1654     j_coord_offsetC = 0;
1655     j_coord_offsetD = 0;
1656
1657     outeriter        = 0;
1658     inneriter        = 0;
1659
1660     for(iidx=0;iidx<4*DIM;iidx++)
1661     {
1662         scratch[iidx] = 0.0;
1663     }
1664
1665     /* Start outer loop over neighborlists */
1666     for(iidx=0; iidx<nri; iidx++)
1667     {
1668         /* Load shift vector for this list */
1669         i_shift_offset   = DIM*shiftidx[iidx];
1670
1671         /* Load limits for loop over neighbors */
1672         j_index_start    = jindex[iidx];
1673         j_index_end      = jindex[iidx+1];
1674
1675         /* Get outer coordinate index */
1676         inr              = iinr[iidx];
1677         i_coord_offset   = DIM*inr;
1678
1679         /* Load i particle coords and add shift vector */
1680         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1681                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1682
1683         fix0             = _mm_setzero_ps();
1684         fiy0             = _mm_setzero_ps();
1685         fiz0             = _mm_setzero_ps();
1686         fix1             = _mm_setzero_ps();
1687         fiy1             = _mm_setzero_ps();
1688         fiz1             = _mm_setzero_ps();
1689         fix2             = _mm_setzero_ps();
1690         fiy2             = _mm_setzero_ps();
1691         fiz2             = _mm_setzero_ps();
1692         fix3             = _mm_setzero_ps();
1693         fiy3             = _mm_setzero_ps();
1694         fiz3             = _mm_setzero_ps();
1695
1696         /* Start inner kernel loop */
1697         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1698         {
1699
1700             /* Get j neighbor index, and coordinate index */
1701             jnrA             = jjnr[jidx];
1702             jnrB             = jjnr[jidx+1];
1703             jnrC             = jjnr[jidx+2];
1704             jnrD             = jjnr[jidx+3];
1705             j_coord_offsetA  = DIM*jnrA;
1706             j_coord_offsetB  = DIM*jnrB;
1707             j_coord_offsetC  = DIM*jnrC;
1708             j_coord_offsetD  = DIM*jnrD;
1709
1710             /* load j atom coordinates */
1711             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1712                                               x+j_coord_offsetC,x+j_coord_offsetD,
1713                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1714                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1715
1716             /* Calculate displacement vector */
1717             dx00             = _mm_sub_ps(ix0,jx0);
1718             dy00             = _mm_sub_ps(iy0,jy0);
1719             dz00             = _mm_sub_ps(iz0,jz0);
1720             dx11             = _mm_sub_ps(ix1,jx1);
1721             dy11             = _mm_sub_ps(iy1,jy1);
1722             dz11             = _mm_sub_ps(iz1,jz1);
1723             dx12             = _mm_sub_ps(ix1,jx2);
1724             dy12             = _mm_sub_ps(iy1,jy2);
1725             dz12             = _mm_sub_ps(iz1,jz2);
1726             dx13             = _mm_sub_ps(ix1,jx3);
1727             dy13             = _mm_sub_ps(iy1,jy3);
1728             dz13             = _mm_sub_ps(iz1,jz3);
1729             dx21             = _mm_sub_ps(ix2,jx1);
1730             dy21             = _mm_sub_ps(iy2,jy1);
1731             dz21             = _mm_sub_ps(iz2,jz1);
1732             dx22             = _mm_sub_ps(ix2,jx2);
1733             dy22             = _mm_sub_ps(iy2,jy2);
1734             dz22             = _mm_sub_ps(iz2,jz2);
1735             dx23             = _mm_sub_ps(ix2,jx3);
1736             dy23             = _mm_sub_ps(iy2,jy3);
1737             dz23             = _mm_sub_ps(iz2,jz3);
1738             dx31             = _mm_sub_ps(ix3,jx1);
1739             dy31             = _mm_sub_ps(iy3,jy1);
1740             dz31             = _mm_sub_ps(iz3,jz1);
1741             dx32             = _mm_sub_ps(ix3,jx2);
1742             dy32             = _mm_sub_ps(iy3,jy2);
1743             dz32             = _mm_sub_ps(iz3,jz2);
1744             dx33             = _mm_sub_ps(ix3,jx3);
1745             dy33             = _mm_sub_ps(iy3,jy3);
1746             dz33             = _mm_sub_ps(iz3,jz3);
1747
1748             /* Calculate squared distance and things based on it */
1749             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1750             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1751             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1752             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1753             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1754             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1755             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1756             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1757             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1758             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1759
1760             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1761             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1762             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1763             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1764             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1765             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1766             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1767             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1768             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1769
1770             rinvsq00         = gmx_mm_inv_ps(rsq00);
1771             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1772             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1773             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1774             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1775             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1776             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1777             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1778             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1779             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1780
1781             fjx0             = _mm_setzero_ps();
1782             fjy0             = _mm_setzero_ps();
1783             fjz0             = _mm_setzero_ps();
1784             fjx1             = _mm_setzero_ps();
1785             fjy1             = _mm_setzero_ps();
1786             fjz1             = _mm_setzero_ps();
1787             fjx2             = _mm_setzero_ps();
1788             fjy2             = _mm_setzero_ps();
1789             fjz2             = _mm_setzero_ps();
1790             fjx3             = _mm_setzero_ps();
1791             fjy3             = _mm_setzero_ps();
1792             fjz3             = _mm_setzero_ps();
1793
1794             /**************************
1795              * CALCULATE INTERACTIONS *
1796              **************************/
1797
1798             if (gmx_mm_any_lt(rsq00,rcutoff2))
1799             {
1800
1801             /* LENNARD-JONES DISPERSION/REPULSION */
1802
1803             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1804             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1805
1806             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1807
1808             fscal            = fvdw;
1809
1810             fscal            = _mm_and_ps(fscal,cutoff_mask);
1811
1812             /* Calculate temporary vectorial force */
1813             tx               = _mm_mul_ps(fscal,dx00);
1814             ty               = _mm_mul_ps(fscal,dy00);
1815             tz               = _mm_mul_ps(fscal,dz00);
1816
1817             /* Update vectorial force */
1818             fix0             = _mm_add_ps(fix0,tx);
1819             fiy0             = _mm_add_ps(fiy0,ty);
1820             fiz0             = _mm_add_ps(fiz0,tz);
1821
1822             fjx0             = _mm_add_ps(fjx0,tx);
1823             fjy0             = _mm_add_ps(fjy0,ty);
1824             fjz0             = _mm_add_ps(fjz0,tz);
1825
1826             }
1827
1828             /**************************
1829              * CALCULATE INTERACTIONS *
1830              **************************/
1831
1832             if (gmx_mm_any_lt(rsq11,rcutoff2))
1833             {
1834
1835             r11              = _mm_mul_ps(rsq11,rinv11);
1836
1837             /* EWALD ELECTROSTATICS */
1838
1839             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1840             ewrt             = _mm_mul_ps(r11,ewtabscale);
1841             ewitab           = _mm_cvttps_epi32(ewrt);
1842             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1843             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1844                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1845                                          &ewtabF,&ewtabFn);
1846             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1847             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1848
1849             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1850
1851             fscal            = felec;
1852
1853             fscal            = _mm_and_ps(fscal,cutoff_mask);
1854
1855             /* Calculate temporary vectorial force */
1856             tx               = _mm_mul_ps(fscal,dx11);
1857             ty               = _mm_mul_ps(fscal,dy11);
1858             tz               = _mm_mul_ps(fscal,dz11);
1859
1860             /* Update vectorial force */
1861             fix1             = _mm_add_ps(fix1,tx);
1862             fiy1             = _mm_add_ps(fiy1,ty);
1863             fiz1             = _mm_add_ps(fiz1,tz);
1864
1865             fjx1             = _mm_add_ps(fjx1,tx);
1866             fjy1             = _mm_add_ps(fjy1,ty);
1867             fjz1             = _mm_add_ps(fjz1,tz);
1868
1869             }
1870
1871             /**************************
1872              * CALCULATE INTERACTIONS *
1873              **************************/
1874
1875             if (gmx_mm_any_lt(rsq12,rcutoff2))
1876             {
1877
1878             r12              = _mm_mul_ps(rsq12,rinv12);
1879
1880             /* EWALD ELECTROSTATICS */
1881
1882             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1883             ewrt             = _mm_mul_ps(r12,ewtabscale);
1884             ewitab           = _mm_cvttps_epi32(ewrt);
1885             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1886             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1887                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1888                                          &ewtabF,&ewtabFn);
1889             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1890             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1891
1892             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1893
1894             fscal            = felec;
1895
1896             fscal            = _mm_and_ps(fscal,cutoff_mask);
1897
1898             /* Calculate temporary vectorial force */
1899             tx               = _mm_mul_ps(fscal,dx12);
1900             ty               = _mm_mul_ps(fscal,dy12);
1901             tz               = _mm_mul_ps(fscal,dz12);
1902
1903             /* Update vectorial force */
1904             fix1             = _mm_add_ps(fix1,tx);
1905             fiy1             = _mm_add_ps(fiy1,ty);
1906             fiz1             = _mm_add_ps(fiz1,tz);
1907
1908             fjx2             = _mm_add_ps(fjx2,tx);
1909             fjy2             = _mm_add_ps(fjy2,ty);
1910             fjz2             = _mm_add_ps(fjz2,tz);
1911
1912             }
1913
1914             /**************************
1915              * CALCULATE INTERACTIONS *
1916              **************************/
1917
1918             if (gmx_mm_any_lt(rsq13,rcutoff2))
1919             {
1920
1921             r13              = _mm_mul_ps(rsq13,rinv13);
1922
1923             /* EWALD ELECTROSTATICS */
1924
1925             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1926             ewrt             = _mm_mul_ps(r13,ewtabscale);
1927             ewitab           = _mm_cvttps_epi32(ewrt);
1928             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1929             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1930                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1931                                          &ewtabF,&ewtabFn);
1932             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1933             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1934
1935             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1936
1937             fscal            = felec;
1938
1939             fscal            = _mm_and_ps(fscal,cutoff_mask);
1940
1941             /* Calculate temporary vectorial force */
1942             tx               = _mm_mul_ps(fscal,dx13);
1943             ty               = _mm_mul_ps(fscal,dy13);
1944             tz               = _mm_mul_ps(fscal,dz13);
1945
1946             /* Update vectorial force */
1947             fix1             = _mm_add_ps(fix1,tx);
1948             fiy1             = _mm_add_ps(fiy1,ty);
1949             fiz1             = _mm_add_ps(fiz1,tz);
1950
1951             fjx3             = _mm_add_ps(fjx3,tx);
1952             fjy3             = _mm_add_ps(fjy3,ty);
1953             fjz3             = _mm_add_ps(fjz3,tz);
1954
1955             }
1956
1957             /**************************
1958              * CALCULATE INTERACTIONS *
1959              **************************/
1960
1961             if (gmx_mm_any_lt(rsq21,rcutoff2))
1962             {
1963
1964             r21              = _mm_mul_ps(rsq21,rinv21);
1965
1966             /* EWALD ELECTROSTATICS */
1967
1968             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1969             ewrt             = _mm_mul_ps(r21,ewtabscale);
1970             ewitab           = _mm_cvttps_epi32(ewrt);
1971             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1972             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1973                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1974                                          &ewtabF,&ewtabFn);
1975             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1976             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1977
1978             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1979
1980             fscal            = felec;
1981
1982             fscal            = _mm_and_ps(fscal,cutoff_mask);
1983
1984             /* Calculate temporary vectorial force */
1985             tx               = _mm_mul_ps(fscal,dx21);
1986             ty               = _mm_mul_ps(fscal,dy21);
1987             tz               = _mm_mul_ps(fscal,dz21);
1988
1989             /* Update vectorial force */
1990             fix2             = _mm_add_ps(fix2,tx);
1991             fiy2             = _mm_add_ps(fiy2,ty);
1992             fiz2             = _mm_add_ps(fiz2,tz);
1993
1994             fjx1             = _mm_add_ps(fjx1,tx);
1995             fjy1             = _mm_add_ps(fjy1,ty);
1996             fjz1             = _mm_add_ps(fjz1,tz);
1997
1998             }
1999
2000             /**************************
2001              * CALCULATE INTERACTIONS *
2002              **************************/
2003
2004             if (gmx_mm_any_lt(rsq22,rcutoff2))
2005             {
2006
2007             r22              = _mm_mul_ps(rsq22,rinv22);
2008
2009             /* EWALD ELECTROSTATICS */
2010
2011             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2012             ewrt             = _mm_mul_ps(r22,ewtabscale);
2013             ewitab           = _mm_cvttps_epi32(ewrt);
2014             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2015             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2016                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2017                                          &ewtabF,&ewtabFn);
2018             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2019             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2020
2021             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2022
2023             fscal            = felec;
2024
2025             fscal            = _mm_and_ps(fscal,cutoff_mask);
2026
2027             /* Calculate temporary vectorial force */
2028             tx               = _mm_mul_ps(fscal,dx22);
2029             ty               = _mm_mul_ps(fscal,dy22);
2030             tz               = _mm_mul_ps(fscal,dz22);
2031
2032             /* Update vectorial force */
2033             fix2             = _mm_add_ps(fix2,tx);
2034             fiy2             = _mm_add_ps(fiy2,ty);
2035             fiz2             = _mm_add_ps(fiz2,tz);
2036
2037             fjx2             = _mm_add_ps(fjx2,tx);
2038             fjy2             = _mm_add_ps(fjy2,ty);
2039             fjz2             = _mm_add_ps(fjz2,tz);
2040
2041             }
2042
2043             /**************************
2044              * CALCULATE INTERACTIONS *
2045              **************************/
2046
2047             if (gmx_mm_any_lt(rsq23,rcutoff2))
2048             {
2049
2050             r23              = _mm_mul_ps(rsq23,rinv23);
2051
2052             /* EWALD ELECTROSTATICS */
2053
2054             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2055             ewrt             = _mm_mul_ps(r23,ewtabscale);
2056             ewitab           = _mm_cvttps_epi32(ewrt);
2057             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2058             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2059                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2060                                          &ewtabF,&ewtabFn);
2061             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2062             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2063
2064             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2065
2066             fscal            = felec;
2067
2068             fscal            = _mm_and_ps(fscal,cutoff_mask);
2069
2070             /* Calculate temporary vectorial force */
2071             tx               = _mm_mul_ps(fscal,dx23);
2072             ty               = _mm_mul_ps(fscal,dy23);
2073             tz               = _mm_mul_ps(fscal,dz23);
2074
2075             /* Update vectorial force */
2076             fix2             = _mm_add_ps(fix2,tx);
2077             fiy2             = _mm_add_ps(fiy2,ty);
2078             fiz2             = _mm_add_ps(fiz2,tz);
2079
2080             fjx3             = _mm_add_ps(fjx3,tx);
2081             fjy3             = _mm_add_ps(fjy3,ty);
2082             fjz3             = _mm_add_ps(fjz3,tz);
2083
2084             }
2085
2086             /**************************
2087              * CALCULATE INTERACTIONS *
2088              **************************/
2089
2090             if (gmx_mm_any_lt(rsq31,rcutoff2))
2091             {
2092
2093             r31              = _mm_mul_ps(rsq31,rinv31);
2094
2095             /* EWALD ELECTROSTATICS */
2096
2097             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2098             ewrt             = _mm_mul_ps(r31,ewtabscale);
2099             ewitab           = _mm_cvttps_epi32(ewrt);
2100             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2101             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2102                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2103                                          &ewtabF,&ewtabFn);
2104             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2105             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2106
2107             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2108
2109             fscal            = felec;
2110
2111             fscal            = _mm_and_ps(fscal,cutoff_mask);
2112
2113             /* Calculate temporary vectorial force */
2114             tx               = _mm_mul_ps(fscal,dx31);
2115             ty               = _mm_mul_ps(fscal,dy31);
2116             tz               = _mm_mul_ps(fscal,dz31);
2117
2118             /* Update vectorial force */
2119             fix3             = _mm_add_ps(fix3,tx);
2120             fiy3             = _mm_add_ps(fiy3,ty);
2121             fiz3             = _mm_add_ps(fiz3,tz);
2122
2123             fjx1             = _mm_add_ps(fjx1,tx);
2124             fjy1             = _mm_add_ps(fjy1,ty);
2125             fjz1             = _mm_add_ps(fjz1,tz);
2126
2127             }
2128
2129             /**************************
2130              * CALCULATE INTERACTIONS *
2131              **************************/
2132
2133             if (gmx_mm_any_lt(rsq32,rcutoff2))
2134             {
2135
2136             r32              = _mm_mul_ps(rsq32,rinv32);
2137
2138             /* EWALD ELECTROSTATICS */
2139
2140             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2141             ewrt             = _mm_mul_ps(r32,ewtabscale);
2142             ewitab           = _mm_cvttps_epi32(ewrt);
2143             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2144             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2145                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2146                                          &ewtabF,&ewtabFn);
2147             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2148             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2149
2150             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2151
2152             fscal            = felec;
2153
2154             fscal            = _mm_and_ps(fscal,cutoff_mask);
2155
2156             /* Calculate temporary vectorial force */
2157             tx               = _mm_mul_ps(fscal,dx32);
2158             ty               = _mm_mul_ps(fscal,dy32);
2159             tz               = _mm_mul_ps(fscal,dz32);
2160
2161             /* Update vectorial force */
2162             fix3             = _mm_add_ps(fix3,tx);
2163             fiy3             = _mm_add_ps(fiy3,ty);
2164             fiz3             = _mm_add_ps(fiz3,tz);
2165
2166             fjx2             = _mm_add_ps(fjx2,tx);
2167             fjy2             = _mm_add_ps(fjy2,ty);
2168             fjz2             = _mm_add_ps(fjz2,tz);
2169
2170             }
2171
2172             /**************************
2173              * CALCULATE INTERACTIONS *
2174              **************************/
2175
2176             if (gmx_mm_any_lt(rsq33,rcutoff2))
2177             {
2178
2179             r33              = _mm_mul_ps(rsq33,rinv33);
2180
2181             /* EWALD ELECTROSTATICS */
2182
2183             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2184             ewrt             = _mm_mul_ps(r33,ewtabscale);
2185             ewitab           = _mm_cvttps_epi32(ewrt);
2186             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2187             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2188                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2189                                          &ewtabF,&ewtabFn);
2190             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2191             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2192
2193             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2194
2195             fscal            = felec;
2196
2197             fscal            = _mm_and_ps(fscal,cutoff_mask);
2198
2199             /* Calculate temporary vectorial force */
2200             tx               = _mm_mul_ps(fscal,dx33);
2201             ty               = _mm_mul_ps(fscal,dy33);
2202             tz               = _mm_mul_ps(fscal,dz33);
2203
2204             /* Update vectorial force */
2205             fix3             = _mm_add_ps(fix3,tx);
2206             fiy3             = _mm_add_ps(fiy3,ty);
2207             fiz3             = _mm_add_ps(fiz3,tz);
2208
2209             fjx3             = _mm_add_ps(fjx3,tx);
2210             fjy3             = _mm_add_ps(fjy3,ty);
2211             fjz3             = _mm_add_ps(fjz3,tz);
2212
2213             }
2214
2215             fjptrA             = f+j_coord_offsetA;
2216             fjptrB             = f+j_coord_offsetB;
2217             fjptrC             = f+j_coord_offsetC;
2218             fjptrD             = f+j_coord_offsetD;
2219
2220             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2221                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2222                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2223
2224             /* Inner loop uses 384 flops */
2225         }
2226
2227         if(jidx<j_index_end)
2228         {
2229
2230             /* Get j neighbor index, and coordinate index */
2231             jnrlistA         = jjnr[jidx];
2232             jnrlistB         = jjnr[jidx+1];
2233             jnrlistC         = jjnr[jidx+2];
2234             jnrlistD         = jjnr[jidx+3];
2235             /* Sign of each element will be negative for non-real atoms.
2236              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2237              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2238              */
2239             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2240             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2241             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2242             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2243             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2244             j_coord_offsetA  = DIM*jnrA;
2245             j_coord_offsetB  = DIM*jnrB;
2246             j_coord_offsetC  = DIM*jnrC;
2247             j_coord_offsetD  = DIM*jnrD;
2248
2249             /* load j atom coordinates */
2250             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2251                                               x+j_coord_offsetC,x+j_coord_offsetD,
2252                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2253                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2254
2255             /* Calculate displacement vector */
2256             dx00             = _mm_sub_ps(ix0,jx0);
2257             dy00             = _mm_sub_ps(iy0,jy0);
2258             dz00             = _mm_sub_ps(iz0,jz0);
2259             dx11             = _mm_sub_ps(ix1,jx1);
2260             dy11             = _mm_sub_ps(iy1,jy1);
2261             dz11             = _mm_sub_ps(iz1,jz1);
2262             dx12             = _mm_sub_ps(ix1,jx2);
2263             dy12             = _mm_sub_ps(iy1,jy2);
2264             dz12             = _mm_sub_ps(iz1,jz2);
2265             dx13             = _mm_sub_ps(ix1,jx3);
2266             dy13             = _mm_sub_ps(iy1,jy3);
2267             dz13             = _mm_sub_ps(iz1,jz3);
2268             dx21             = _mm_sub_ps(ix2,jx1);
2269             dy21             = _mm_sub_ps(iy2,jy1);
2270             dz21             = _mm_sub_ps(iz2,jz1);
2271             dx22             = _mm_sub_ps(ix2,jx2);
2272             dy22             = _mm_sub_ps(iy2,jy2);
2273             dz22             = _mm_sub_ps(iz2,jz2);
2274             dx23             = _mm_sub_ps(ix2,jx3);
2275             dy23             = _mm_sub_ps(iy2,jy3);
2276             dz23             = _mm_sub_ps(iz2,jz3);
2277             dx31             = _mm_sub_ps(ix3,jx1);
2278             dy31             = _mm_sub_ps(iy3,jy1);
2279             dz31             = _mm_sub_ps(iz3,jz1);
2280             dx32             = _mm_sub_ps(ix3,jx2);
2281             dy32             = _mm_sub_ps(iy3,jy2);
2282             dz32             = _mm_sub_ps(iz3,jz2);
2283             dx33             = _mm_sub_ps(ix3,jx3);
2284             dy33             = _mm_sub_ps(iy3,jy3);
2285             dz33             = _mm_sub_ps(iz3,jz3);
2286
2287             /* Calculate squared distance and things based on it */
2288             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2289             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2290             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2291             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2292             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2293             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2294             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2295             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2296             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2297             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2298
2299             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2300             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2301             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2302             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2303             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2304             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2305             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2306             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2307             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2308
2309             rinvsq00         = gmx_mm_inv_ps(rsq00);
2310             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2311             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2312             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2313             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2314             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2315             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2316             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2317             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2318             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2319
2320             fjx0             = _mm_setzero_ps();
2321             fjy0             = _mm_setzero_ps();
2322             fjz0             = _mm_setzero_ps();
2323             fjx1             = _mm_setzero_ps();
2324             fjy1             = _mm_setzero_ps();
2325             fjz1             = _mm_setzero_ps();
2326             fjx2             = _mm_setzero_ps();
2327             fjy2             = _mm_setzero_ps();
2328             fjz2             = _mm_setzero_ps();
2329             fjx3             = _mm_setzero_ps();
2330             fjy3             = _mm_setzero_ps();
2331             fjz3             = _mm_setzero_ps();
2332
2333             /**************************
2334              * CALCULATE INTERACTIONS *
2335              **************************/
2336
2337             if (gmx_mm_any_lt(rsq00,rcutoff2))
2338             {
2339
2340             /* LENNARD-JONES DISPERSION/REPULSION */
2341
2342             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2343             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2344
2345             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2346
2347             fscal            = fvdw;
2348
2349             fscal            = _mm_and_ps(fscal,cutoff_mask);
2350
2351             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2352
2353             /* Calculate temporary vectorial force */
2354             tx               = _mm_mul_ps(fscal,dx00);
2355             ty               = _mm_mul_ps(fscal,dy00);
2356             tz               = _mm_mul_ps(fscal,dz00);
2357
2358             /* Update vectorial force */
2359             fix0             = _mm_add_ps(fix0,tx);
2360             fiy0             = _mm_add_ps(fiy0,ty);
2361             fiz0             = _mm_add_ps(fiz0,tz);
2362
2363             fjx0             = _mm_add_ps(fjx0,tx);
2364             fjy0             = _mm_add_ps(fjy0,ty);
2365             fjz0             = _mm_add_ps(fjz0,tz);
2366
2367             }
2368
2369             /**************************
2370              * CALCULATE INTERACTIONS *
2371              **************************/
2372
2373             if (gmx_mm_any_lt(rsq11,rcutoff2))
2374             {
2375
2376             r11              = _mm_mul_ps(rsq11,rinv11);
2377             r11              = _mm_andnot_ps(dummy_mask,r11);
2378
2379             /* EWALD ELECTROSTATICS */
2380
2381             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2382             ewrt             = _mm_mul_ps(r11,ewtabscale);
2383             ewitab           = _mm_cvttps_epi32(ewrt);
2384             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2385             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2386                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2387                                          &ewtabF,&ewtabFn);
2388             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2389             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2390
2391             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2392
2393             fscal            = felec;
2394
2395             fscal            = _mm_and_ps(fscal,cutoff_mask);
2396
2397             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2398
2399             /* Calculate temporary vectorial force */
2400             tx               = _mm_mul_ps(fscal,dx11);
2401             ty               = _mm_mul_ps(fscal,dy11);
2402             tz               = _mm_mul_ps(fscal,dz11);
2403
2404             /* Update vectorial force */
2405             fix1             = _mm_add_ps(fix1,tx);
2406             fiy1             = _mm_add_ps(fiy1,ty);
2407             fiz1             = _mm_add_ps(fiz1,tz);
2408
2409             fjx1             = _mm_add_ps(fjx1,tx);
2410             fjy1             = _mm_add_ps(fjy1,ty);
2411             fjz1             = _mm_add_ps(fjz1,tz);
2412
2413             }
2414
2415             /**************************
2416              * CALCULATE INTERACTIONS *
2417              **************************/
2418
2419             if (gmx_mm_any_lt(rsq12,rcutoff2))
2420             {
2421
2422             r12              = _mm_mul_ps(rsq12,rinv12);
2423             r12              = _mm_andnot_ps(dummy_mask,r12);
2424
2425             /* EWALD ELECTROSTATICS */
2426
2427             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2428             ewrt             = _mm_mul_ps(r12,ewtabscale);
2429             ewitab           = _mm_cvttps_epi32(ewrt);
2430             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2431             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2432                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2433                                          &ewtabF,&ewtabFn);
2434             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2435             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2436
2437             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2438
2439             fscal            = felec;
2440
2441             fscal            = _mm_and_ps(fscal,cutoff_mask);
2442
2443             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2444
2445             /* Calculate temporary vectorial force */
2446             tx               = _mm_mul_ps(fscal,dx12);
2447             ty               = _mm_mul_ps(fscal,dy12);
2448             tz               = _mm_mul_ps(fscal,dz12);
2449
2450             /* Update vectorial force */
2451             fix1             = _mm_add_ps(fix1,tx);
2452             fiy1             = _mm_add_ps(fiy1,ty);
2453             fiz1             = _mm_add_ps(fiz1,tz);
2454
2455             fjx2             = _mm_add_ps(fjx2,tx);
2456             fjy2             = _mm_add_ps(fjy2,ty);
2457             fjz2             = _mm_add_ps(fjz2,tz);
2458
2459             }
2460
2461             /**************************
2462              * CALCULATE INTERACTIONS *
2463              **************************/
2464
2465             if (gmx_mm_any_lt(rsq13,rcutoff2))
2466             {
2467
2468             r13              = _mm_mul_ps(rsq13,rinv13);
2469             r13              = _mm_andnot_ps(dummy_mask,r13);
2470
2471             /* EWALD ELECTROSTATICS */
2472
2473             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2474             ewrt             = _mm_mul_ps(r13,ewtabscale);
2475             ewitab           = _mm_cvttps_epi32(ewrt);
2476             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2477             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2478                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2479                                          &ewtabF,&ewtabFn);
2480             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2481             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2482
2483             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2484
2485             fscal            = felec;
2486
2487             fscal            = _mm_and_ps(fscal,cutoff_mask);
2488
2489             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2490
2491             /* Calculate temporary vectorial force */
2492             tx               = _mm_mul_ps(fscal,dx13);
2493             ty               = _mm_mul_ps(fscal,dy13);
2494             tz               = _mm_mul_ps(fscal,dz13);
2495
2496             /* Update vectorial force */
2497             fix1             = _mm_add_ps(fix1,tx);
2498             fiy1             = _mm_add_ps(fiy1,ty);
2499             fiz1             = _mm_add_ps(fiz1,tz);
2500
2501             fjx3             = _mm_add_ps(fjx3,tx);
2502             fjy3             = _mm_add_ps(fjy3,ty);
2503             fjz3             = _mm_add_ps(fjz3,tz);
2504
2505             }
2506
2507             /**************************
2508              * CALCULATE INTERACTIONS *
2509              **************************/
2510
2511             if (gmx_mm_any_lt(rsq21,rcutoff2))
2512             {
2513
2514             r21              = _mm_mul_ps(rsq21,rinv21);
2515             r21              = _mm_andnot_ps(dummy_mask,r21);
2516
2517             /* EWALD ELECTROSTATICS */
2518
2519             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2520             ewrt             = _mm_mul_ps(r21,ewtabscale);
2521             ewitab           = _mm_cvttps_epi32(ewrt);
2522             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2523             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2524                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2525                                          &ewtabF,&ewtabFn);
2526             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2527             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2528
2529             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2530
2531             fscal            = felec;
2532
2533             fscal            = _mm_and_ps(fscal,cutoff_mask);
2534
2535             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2536
2537             /* Calculate temporary vectorial force */
2538             tx               = _mm_mul_ps(fscal,dx21);
2539             ty               = _mm_mul_ps(fscal,dy21);
2540             tz               = _mm_mul_ps(fscal,dz21);
2541
2542             /* Update vectorial force */
2543             fix2             = _mm_add_ps(fix2,tx);
2544             fiy2             = _mm_add_ps(fiy2,ty);
2545             fiz2             = _mm_add_ps(fiz2,tz);
2546
2547             fjx1             = _mm_add_ps(fjx1,tx);
2548             fjy1             = _mm_add_ps(fjy1,ty);
2549             fjz1             = _mm_add_ps(fjz1,tz);
2550
2551             }
2552
2553             /**************************
2554              * CALCULATE INTERACTIONS *
2555              **************************/
2556
2557             if (gmx_mm_any_lt(rsq22,rcutoff2))
2558             {
2559
2560             r22              = _mm_mul_ps(rsq22,rinv22);
2561             r22              = _mm_andnot_ps(dummy_mask,r22);
2562
2563             /* EWALD ELECTROSTATICS */
2564
2565             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2566             ewrt             = _mm_mul_ps(r22,ewtabscale);
2567             ewitab           = _mm_cvttps_epi32(ewrt);
2568             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2569             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2570                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2571                                          &ewtabF,&ewtabFn);
2572             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2573             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2574
2575             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2576
2577             fscal            = felec;
2578
2579             fscal            = _mm_and_ps(fscal,cutoff_mask);
2580
2581             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2582
2583             /* Calculate temporary vectorial force */
2584             tx               = _mm_mul_ps(fscal,dx22);
2585             ty               = _mm_mul_ps(fscal,dy22);
2586             tz               = _mm_mul_ps(fscal,dz22);
2587
2588             /* Update vectorial force */
2589             fix2             = _mm_add_ps(fix2,tx);
2590             fiy2             = _mm_add_ps(fiy2,ty);
2591             fiz2             = _mm_add_ps(fiz2,tz);
2592
2593             fjx2             = _mm_add_ps(fjx2,tx);
2594             fjy2             = _mm_add_ps(fjy2,ty);
2595             fjz2             = _mm_add_ps(fjz2,tz);
2596
2597             }
2598
2599             /**************************
2600              * CALCULATE INTERACTIONS *
2601              **************************/
2602
2603             if (gmx_mm_any_lt(rsq23,rcutoff2))
2604             {
2605
2606             r23              = _mm_mul_ps(rsq23,rinv23);
2607             r23              = _mm_andnot_ps(dummy_mask,r23);
2608
2609             /* EWALD ELECTROSTATICS */
2610
2611             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2612             ewrt             = _mm_mul_ps(r23,ewtabscale);
2613             ewitab           = _mm_cvttps_epi32(ewrt);
2614             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2615             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2616                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2617                                          &ewtabF,&ewtabFn);
2618             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2619             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2620
2621             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2622
2623             fscal            = felec;
2624
2625             fscal            = _mm_and_ps(fscal,cutoff_mask);
2626
2627             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2628
2629             /* Calculate temporary vectorial force */
2630             tx               = _mm_mul_ps(fscal,dx23);
2631             ty               = _mm_mul_ps(fscal,dy23);
2632             tz               = _mm_mul_ps(fscal,dz23);
2633
2634             /* Update vectorial force */
2635             fix2             = _mm_add_ps(fix2,tx);
2636             fiy2             = _mm_add_ps(fiy2,ty);
2637             fiz2             = _mm_add_ps(fiz2,tz);
2638
2639             fjx3             = _mm_add_ps(fjx3,tx);
2640             fjy3             = _mm_add_ps(fjy3,ty);
2641             fjz3             = _mm_add_ps(fjz3,tz);
2642
2643             }
2644
2645             /**************************
2646              * CALCULATE INTERACTIONS *
2647              **************************/
2648
2649             if (gmx_mm_any_lt(rsq31,rcutoff2))
2650             {
2651
2652             r31              = _mm_mul_ps(rsq31,rinv31);
2653             r31              = _mm_andnot_ps(dummy_mask,r31);
2654
2655             /* EWALD ELECTROSTATICS */
2656
2657             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2658             ewrt             = _mm_mul_ps(r31,ewtabscale);
2659             ewitab           = _mm_cvttps_epi32(ewrt);
2660             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2661             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2662                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2663                                          &ewtabF,&ewtabFn);
2664             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2665             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2666
2667             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2668
2669             fscal            = felec;
2670
2671             fscal            = _mm_and_ps(fscal,cutoff_mask);
2672
2673             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2674
2675             /* Calculate temporary vectorial force */
2676             tx               = _mm_mul_ps(fscal,dx31);
2677             ty               = _mm_mul_ps(fscal,dy31);
2678             tz               = _mm_mul_ps(fscal,dz31);
2679
2680             /* Update vectorial force */
2681             fix3             = _mm_add_ps(fix3,tx);
2682             fiy3             = _mm_add_ps(fiy3,ty);
2683             fiz3             = _mm_add_ps(fiz3,tz);
2684
2685             fjx1             = _mm_add_ps(fjx1,tx);
2686             fjy1             = _mm_add_ps(fjy1,ty);
2687             fjz1             = _mm_add_ps(fjz1,tz);
2688
2689             }
2690
2691             /**************************
2692              * CALCULATE INTERACTIONS *
2693              **************************/
2694
2695             if (gmx_mm_any_lt(rsq32,rcutoff2))
2696             {
2697
2698             r32              = _mm_mul_ps(rsq32,rinv32);
2699             r32              = _mm_andnot_ps(dummy_mask,r32);
2700
2701             /* EWALD ELECTROSTATICS */
2702
2703             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2704             ewrt             = _mm_mul_ps(r32,ewtabscale);
2705             ewitab           = _mm_cvttps_epi32(ewrt);
2706             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2707             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2708                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2709                                          &ewtabF,&ewtabFn);
2710             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2711             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2712
2713             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2714
2715             fscal            = felec;
2716
2717             fscal            = _mm_and_ps(fscal,cutoff_mask);
2718
2719             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2720
2721             /* Calculate temporary vectorial force */
2722             tx               = _mm_mul_ps(fscal,dx32);
2723             ty               = _mm_mul_ps(fscal,dy32);
2724             tz               = _mm_mul_ps(fscal,dz32);
2725
2726             /* Update vectorial force */
2727             fix3             = _mm_add_ps(fix3,tx);
2728             fiy3             = _mm_add_ps(fiy3,ty);
2729             fiz3             = _mm_add_ps(fiz3,tz);
2730
2731             fjx2             = _mm_add_ps(fjx2,tx);
2732             fjy2             = _mm_add_ps(fjy2,ty);
2733             fjz2             = _mm_add_ps(fjz2,tz);
2734
2735             }
2736
2737             /**************************
2738              * CALCULATE INTERACTIONS *
2739              **************************/
2740
2741             if (gmx_mm_any_lt(rsq33,rcutoff2))
2742             {
2743
2744             r33              = _mm_mul_ps(rsq33,rinv33);
2745             r33              = _mm_andnot_ps(dummy_mask,r33);
2746
2747             /* EWALD ELECTROSTATICS */
2748
2749             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2750             ewrt             = _mm_mul_ps(r33,ewtabscale);
2751             ewitab           = _mm_cvttps_epi32(ewrt);
2752             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2753             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2754                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2755                                          &ewtabF,&ewtabFn);
2756             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2757             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2758
2759             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2760
2761             fscal            = felec;
2762
2763             fscal            = _mm_and_ps(fscal,cutoff_mask);
2764
2765             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2766
2767             /* Calculate temporary vectorial force */
2768             tx               = _mm_mul_ps(fscal,dx33);
2769             ty               = _mm_mul_ps(fscal,dy33);
2770             tz               = _mm_mul_ps(fscal,dz33);
2771
2772             /* Update vectorial force */
2773             fix3             = _mm_add_ps(fix3,tx);
2774             fiy3             = _mm_add_ps(fiy3,ty);
2775             fiz3             = _mm_add_ps(fiz3,tz);
2776
2777             fjx3             = _mm_add_ps(fjx3,tx);
2778             fjy3             = _mm_add_ps(fjy3,ty);
2779             fjz3             = _mm_add_ps(fjz3,tz);
2780
2781             }
2782
2783             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2784             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2785             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2786             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2787
2788             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2789                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2790                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2791
2792             /* Inner loop uses 393 flops */
2793         }
2794
2795         /* End of innermost loop */
2796
2797         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2798                                               f+i_coord_offset,fshift+i_shift_offset);
2799
2800         /* Increment number of inner iterations */
2801         inneriter                  += j_index_end - j_index_start;
2802
2803         /* Outer loop uses 24 flops */
2804     }
2805
2806     /* Increment number of outer iterations */
2807     outeriter        += nri;
2808
2809     /* Update outer/inner flops */
2810
2811     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*393);
2812 }