Introduce gmxpre.h for truly global definitions
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse4_1_single.h"
50 #include "kernelutil_x86_sse4_1_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_single
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwioffset3;
92     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
100     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
101     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
102     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
105     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
108     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
109     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
110     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
111     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
112     real             *charge;
113     int              nvdwtype;
114     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115     int              *vdwtype;
116     real             *vdwparam;
117     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
118     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
119     __m128i          ewitab;
120     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
121     real             *ewtab;
122     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
123     real             rswitch_scalar,d_scalar;
124     __m128           dummy_mask,cutoff_mask;
125     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
126     __m128           one     = _mm_set1_ps(1.0);
127     __m128           two     = _mm_set1_ps(2.0);
128     x                = xx[0];
129     f                = ff[0];
130
131     nri              = nlist->nri;
132     iinr             = nlist->iinr;
133     jindex           = nlist->jindex;
134     jjnr             = nlist->jjnr;
135     shiftidx         = nlist->shift;
136     gid              = nlist->gid;
137     shiftvec         = fr->shift_vec[0];
138     fshift           = fr->fshift[0];
139     facel            = _mm_set1_ps(fr->epsfac);
140     charge           = mdatoms->chargeA;
141     nvdwtype         = fr->ntype;
142     vdwparam         = fr->nbfp;
143     vdwtype          = mdatoms->typeA;
144
145     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
146     ewtab            = fr->ic->tabq_coul_FDV0;
147     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
148     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
149
150     /* Setup water-specific parameters */
151     inr              = nlist->iinr[0];
152     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
153     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
154     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
155     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
156
157     jq1              = _mm_set1_ps(charge[inr+1]);
158     jq2              = _mm_set1_ps(charge[inr+2]);
159     jq3              = _mm_set1_ps(charge[inr+3]);
160     vdwjidx0A        = 2*vdwtype[inr+0];
161     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
162     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
163     qq11             = _mm_mul_ps(iq1,jq1);
164     qq12             = _mm_mul_ps(iq1,jq2);
165     qq13             = _mm_mul_ps(iq1,jq3);
166     qq21             = _mm_mul_ps(iq2,jq1);
167     qq22             = _mm_mul_ps(iq2,jq2);
168     qq23             = _mm_mul_ps(iq2,jq3);
169     qq31             = _mm_mul_ps(iq3,jq1);
170     qq32             = _mm_mul_ps(iq3,jq2);
171     qq33             = _mm_mul_ps(iq3,jq3);
172
173     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
174     rcutoff_scalar   = fr->rcoulomb;
175     rcutoff          = _mm_set1_ps(rcutoff_scalar);
176     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
177
178     rswitch_scalar   = fr->rcoulomb_switch;
179     rswitch          = _mm_set1_ps(rswitch_scalar);
180     /* Setup switch parameters */
181     d_scalar         = rcutoff_scalar-rswitch_scalar;
182     d                = _mm_set1_ps(d_scalar);
183     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
184     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
185     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
186     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
187     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
188     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
189
190     /* Avoid stupid compiler warnings */
191     jnrA = jnrB = jnrC = jnrD = 0;
192     j_coord_offsetA = 0;
193     j_coord_offsetB = 0;
194     j_coord_offsetC = 0;
195     j_coord_offsetD = 0;
196
197     outeriter        = 0;
198     inneriter        = 0;
199
200     for(iidx=0;iidx<4*DIM;iidx++)
201     {
202         scratch[iidx] = 0.0;
203     }
204
205     /* Start outer loop over neighborlists */
206     for(iidx=0; iidx<nri; iidx++)
207     {
208         /* Load shift vector for this list */
209         i_shift_offset   = DIM*shiftidx[iidx];
210
211         /* Load limits for loop over neighbors */
212         j_index_start    = jindex[iidx];
213         j_index_end      = jindex[iidx+1];
214
215         /* Get outer coordinate index */
216         inr              = iinr[iidx];
217         i_coord_offset   = DIM*inr;
218
219         /* Load i particle coords and add shift vector */
220         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
221                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
222
223         fix0             = _mm_setzero_ps();
224         fiy0             = _mm_setzero_ps();
225         fiz0             = _mm_setzero_ps();
226         fix1             = _mm_setzero_ps();
227         fiy1             = _mm_setzero_ps();
228         fiz1             = _mm_setzero_ps();
229         fix2             = _mm_setzero_ps();
230         fiy2             = _mm_setzero_ps();
231         fiz2             = _mm_setzero_ps();
232         fix3             = _mm_setzero_ps();
233         fiy3             = _mm_setzero_ps();
234         fiz3             = _mm_setzero_ps();
235
236         /* Reset potential sums */
237         velecsum         = _mm_setzero_ps();
238         vvdwsum          = _mm_setzero_ps();
239
240         /* Start inner kernel loop */
241         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
242         {
243
244             /* Get j neighbor index, and coordinate index */
245             jnrA             = jjnr[jidx];
246             jnrB             = jjnr[jidx+1];
247             jnrC             = jjnr[jidx+2];
248             jnrD             = jjnr[jidx+3];
249             j_coord_offsetA  = DIM*jnrA;
250             j_coord_offsetB  = DIM*jnrB;
251             j_coord_offsetC  = DIM*jnrC;
252             j_coord_offsetD  = DIM*jnrD;
253
254             /* load j atom coordinates */
255             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
256                                               x+j_coord_offsetC,x+j_coord_offsetD,
257                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
258                                               &jy2,&jz2,&jx3,&jy3,&jz3);
259
260             /* Calculate displacement vector */
261             dx00             = _mm_sub_ps(ix0,jx0);
262             dy00             = _mm_sub_ps(iy0,jy0);
263             dz00             = _mm_sub_ps(iz0,jz0);
264             dx11             = _mm_sub_ps(ix1,jx1);
265             dy11             = _mm_sub_ps(iy1,jy1);
266             dz11             = _mm_sub_ps(iz1,jz1);
267             dx12             = _mm_sub_ps(ix1,jx2);
268             dy12             = _mm_sub_ps(iy1,jy2);
269             dz12             = _mm_sub_ps(iz1,jz2);
270             dx13             = _mm_sub_ps(ix1,jx3);
271             dy13             = _mm_sub_ps(iy1,jy3);
272             dz13             = _mm_sub_ps(iz1,jz3);
273             dx21             = _mm_sub_ps(ix2,jx1);
274             dy21             = _mm_sub_ps(iy2,jy1);
275             dz21             = _mm_sub_ps(iz2,jz1);
276             dx22             = _mm_sub_ps(ix2,jx2);
277             dy22             = _mm_sub_ps(iy2,jy2);
278             dz22             = _mm_sub_ps(iz2,jz2);
279             dx23             = _mm_sub_ps(ix2,jx3);
280             dy23             = _mm_sub_ps(iy2,jy3);
281             dz23             = _mm_sub_ps(iz2,jz3);
282             dx31             = _mm_sub_ps(ix3,jx1);
283             dy31             = _mm_sub_ps(iy3,jy1);
284             dz31             = _mm_sub_ps(iz3,jz1);
285             dx32             = _mm_sub_ps(ix3,jx2);
286             dy32             = _mm_sub_ps(iy3,jy2);
287             dz32             = _mm_sub_ps(iz3,jz2);
288             dx33             = _mm_sub_ps(ix3,jx3);
289             dy33             = _mm_sub_ps(iy3,jy3);
290             dz33             = _mm_sub_ps(iz3,jz3);
291
292             /* Calculate squared distance and things based on it */
293             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
294             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
295             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
296             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
297             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
298             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
299             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
300             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
301             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
302             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
303
304             rinv00           = gmx_mm_invsqrt_ps(rsq00);
305             rinv11           = gmx_mm_invsqrt_ps(rsq11);
306             rinv12           = gmx_mm_invsqrt_ps(rsq12);
307             rinv13           = gmx_mm_invsqrt_ps(rsq13);
308             rinv21           = gmx_mm_invsqrt_ps(rsq21);
309             rinv22           = gmx_mm_invsqrt_ps(rsq22);
310             rinv23           = gmx_mm_invsqrt_ps(rsq23);
311             rinv31           = gmx_mm_invsqrt_ps(rsq31);
312             rinv32           = gmx_mm_invsqrt_ps(rsq32);
313             rinv33           = gmx_mm_invsqrt_ps(rsq33);
314
315             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
316             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
317             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
318             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
319             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
320             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
321             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
322             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
323             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
324             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
325
326             fjx0             = _mm_setzero_ps();
327             fjy0             = _mm_setzero_ps();
328             fjz0             = _mm_setzero_ps();
329             fjx1             = _mm_setzero_ps();
330             fjy1             = _mm_setzero_ps();
331             fjz1             = _mm_setzero_ps();
332             fjx2             = _mm_setzero_ps();
333             fjy2             = _mm_setzero_ps();
334             fjz2             = _mm_setzero_ps();
335             fjx3             = _mm_setzero_ps();
336             fjy3             = _mm_setzero_ps();
337             fjz3             = _mm_setzero_ps();
338
339             /**************************
340              * CALCULATE INTERACTIONS *
341              **************************/
342
343             if (gmx_mm_any_lt(rsq00,rcutoff2))
344             {
345
346             r00              = _mm_mul_ps(rsq00,rinv00);
347
348             /* LENNARD-JONES DISPERSION/REPULSION */
349
350             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
351             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
352             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
353             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
354             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
355
356             d                = _mm_sub_ps(r00,rswitch);
357             d                = _mm_max_ps(d,_mm_setzero_ps());
358             d2               = _mm_mul_ps(d,d);
359             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
360
361             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
362
363             /* Evaluate switch function */
364             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
365             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
366             vvdw             = _mm_mul_ps(vvdw,sw);
367             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
368
369             /* Update potential sum for this i atom from the interaction with this j atom. */
370             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
371             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
372
373             fscal            = fvdw;
374
375             fscal            = _mm_and_ps(fscal,cutoff_mask);
376
377             /* Calculate temporary vectorial force */
378             tx               = _mm_mul_ps(fscal,dx00);
379             ty               = _mm_mul_ps(fscal,dy00);
380             tz               = _mm_mul_ps(fscal,dz00);
381
382             /* Update vectorial force */
383             fix0             = _mm_add_ps(fix0,tx);
384             fiy0             = _mm_add_ps(fiy0,ty);
385             fiz0             = _mm_add_ps(fiz0,tz);
386
387             fjx0             = _mm_add_ps(fjx0,tx);
388             fjy0             = _mm_add_ps(fjy0,ty);
389             fjz0             = _mm_add_ps(fjz0,tz);
390
391             }
392
393             /**************************
394              * CALCULATE INTERACTIONS *
395              **************************/
396
397             if (gmx_mm_any_lt(rsq11,rcutoff2))
398             {
399
400             r11              = _mm_mul_ps(rsq11,rinv11);
401
402             /* EWALD ELECTROSTATICS */
403
404             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
405             ewrt             = _mm_mul_ps(r11,ewtabscale);
406             ewitab           = _mm_cvttps_epi32(ewrt);
407             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
408             ewitab           = _mm_slli_epi32(ewitab,2);
409             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
410             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
411             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
412             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
413             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
414             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
415             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
416             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
417             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
418
419             d                = _mm_sub_ps(r11,rswitch);
420             d                = _mm_max_ps(d,_mm_setzero_ps());
421             d2               = _mm_mul_ps(d,d);
422             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
423
424             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
425
426             /* Evaluate switch function */
427             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
428             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
429             velec            = _mm_mul_ps(velec,sw);
430             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
431
432             /* Update potential sum for this i atom from the interaction with this j atom. */
433             velec            = _mm_and_ps(velec,cutoff_mask);
434             velecsum         = _mm_add_ps(velecsum,velec);
435
436             fscal            = felec;
437
438             fscal            = _mm_and_ps(fscal,cutoff_mask);
439
440             /* Calculate temporary vectorial force */
441             tx               = _mm_mul_ps(fscal,dx11);
442             ty               = _mm_mul_ps(fscal,dy11);
443             tz               = _mm_mul_ps(fscal,dz11);
444
445             /* Update vectorial force */
446             fix1             = _mm_add_ps(fix1,tx);
447             fiy1             = _mm_add_ps(fiy1,ty);
448             fiz1             = _mm_add_ps(fiz1,tz);
449
450             fjx1             = _mm_add_ps(fjx1,tx);
451             fjy1             = _mm_add_ps(fjy1,ty);
452             fjz1             = _mm_add_ps(fjz1,tz);
453
454             }
455
456             /**************************
457              * CALCULATE INTERACTIONS *
458              **************************/
459
460             if (gmx_mm_any_lt(rsq12,rcutoff2))
461             {
462
463             r12              = _mm_mul_ps(rsq12,rinv12);
464
465             /* EWALD ELECTROSTATICS */
466
467             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
468             ewrt             = _mm_mul_ps(r12,ewtabscale);
469             ewitab           = _mm_cvttps_epi32(ewrt);
470             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
471             ewitab           = _mm_slli_epi32(ewitab,2);
472             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
473             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
474             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
475             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
476             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
477             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
478             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
479             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
480             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
481
482             d                = _mm_sub_ps(r12,rswitch);
483             d                = _mm_max_ps(d,_mm_setzero_ps());
484             d2               = _mm_mul_ps(d,d);
485             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
486
487             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
488
489             /* Evaluate switch function */
490             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
491             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
492             velec            = _mm_mul_ps(velec,sw);
493             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
494
495             /* Update potential sum for this i atom from the interaction with this j atom. */
496             velec            = _mm_and_ps(velec,cutoff_mask);
497             velecsum         = _mm_add_ps(velecsum,velec);
498
499             fscal            = felec;
500
501             fscal            = _mm_and_ps(fscal,cutoff_mask);
502
503             /* Calculate temporary vectorial force */
504             tx               = _mm_mul_ps(fscal,dx12);
505             ty               = _mm_mul_ps(fscal,dy12);
506             tz               = _mm_mul_ps(fscal,dz12);
507
508             /* Update vectorial force */
509             fix1             = _mm_add_ps(fix1,tx);
510             fiy1             = _mm_add_ps(fiy1,ty);
511             fiz1             = _mm_add_ps(fiz1,tz);
512
513             fjx2             = _mm_add_ps(fjx2,tx);
514             fjy2             = _mm_add_ps(fjy2,ty);
515             fjz2             = _mm_add_ps(fjz2,tz);
516
517             }
518
519             /**************************
520              * CALCULATE INTERACTIONS *
521              **************************/
522
523             if (gmx_mm_any_lt(rsq13,rcutoff2))
524             {
525
526             r13              = _mm_mul_ps(rsq13,rinv13);
527
528             /* EWALD ELECTROSTATICS */
529
530             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
531             ewrt             = _mm_mul_ps(r13,ewtabscale);
532             ewitab           = _mm_cvttps_epi32(ewrt);
533             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
534             ewitab           = _mm_slli_epi32(ewitab,2);
535             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
536             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
537             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
538             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
539             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
540             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
541             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
542             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
543             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
544
545             d                = _mm_sub_ps(r13,rswitch);
546             d                = _mm_max_ps(d,_mm_setzero_ps());
547             d2               = _mm_mul_ps(d,d);
548             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
549
550             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
551
552             /* Evaluate switch function */
553             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
554             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
555             velec            = _mm_mul_ps(velec,sw);
556             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
557
558             /* Update potential sum for this i atom from the interaction with this j atom. */
559             velec            = _mm_and_ps(velec,cutoff_mask);
560             velecsum         = _mm_add_ps(velecsum,velec);
561
562             fscal            = felec;
563
564             fscal            = _mm_and_ps(fscal,cutoff_mask);
565
566             /* Calculate temporary vectorial force */
567             tx               = _mm_mul_ps(fscal,dx13);
568             ty               = _mm_mul_ps(fscal,dy13);
569             tz               = _mm_mul_ps(fscal,dz13);
570
571             /* Update vectorial force */
572             fix1             = _mm_add_ps(fix1,tx);
573             fiy1             = _mm_add_ps(fiy1,ty);
574             fiz1             = _mm_add_ps(fiz1,tz);
575
576             fjx3             = _mm_add_ps(fjx3,tx);
577             fjy3             = _mm_add_ps(fjy3,ty);
578             fjz3             = _mm_add_ps(fjz3,tz);
579
580             }
581
582             /**************************
583              * CALCULATE INTERACTIONS *
584              **************************/
585
586             if (gmx_mm_any_lt(rsq21,rcutoff2))
587             {
588
589             r21              = _mm_mul_ps(rsq21,rinv21);
590
591             /* EWALD ELECTROSTATICS */
592
593             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
594             ewrt             = _mm_mul_ps(r21,ewtabscale);
595             ewitab           = _mm_cvttps_epi32(ewrt);
596             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
597             ewitab           = _mm_slli_epi32(ewitab,2);
598             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
599             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
600             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
601             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
602             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
603             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
604             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
605             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
606             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
607
608             d                = _mm_sub_ps(r21,rswitch);
609             d                = _mm_max_ps(d,_mm_setzero_ps());
610             d2               = _mm_mul_ps(d,d);
611             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
612
613             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
614
615             /* Evaluate switch function */
616             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
617             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
618             velec            = _mm_mul_ps(velec,sw);
619             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
620
621             /* Update potential sum for this i atom from the interaction with this j atom. */
622             velec            = _mm_and_ps(velec,cutoff_mask);
623             velecsum         = _mm_add_ps(velecsum,velec);
624
625             fscal            = felec;
626
627             fscal            = _mm_and_ps(fscal,cutoff_mask);
628
629             /* Calculate temporary vectorial force */
630             tx               = _mm_mul_ps(fscal,dx21);
631             ty               = _mm_mul_ps(fscal,dy21);
632             tz               = _mm_mul_ps(fscal,dz21);
633
634             /* Update vectorial force */
635             fix2             = _mm_add_ps(fix2,tx);
636             fiy2             = _mm_add_ps(fiy2,ty);
637             fiz2             = _mm_add_ps(fiz2,tz);
638
639             fjx1             = _mm_add_ps(fjx1,tx);
640             fjy1             = _mm_add_ps(fjy1,ty);
641             fjz1             = _mm_add_ps(fjz1,tz);
642
643             }
644
645             /**************************
646              * CALCULATE INTERACTIONS *
647              **************************/
648
649             if (gmx_mm_any_lt(rsq22,rcutoff2))
650             {
651
652             r22              = _mm_mul_ps(rsq22,rinv22);
653
654             /* EWALD ELECTROSTATICS */
655
656             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
657             ewrt             = _mm_mul_ps(r22,ewtabscale);
658             ewitab           = _mm_cvttps_epi32(ewrt);
659             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
660             ewitab           = _mm_slli_epi32(ewitab,2);
661             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
662             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
663             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
664             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
665             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
666             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
667             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
668             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
669             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
670
671             d                = _mm_sub_ps(r22,rswitch);
672             d                = _mm_max_ps(d,_mm_setzero_ps());
673             d2               = _mm_mul_ps(d,d);
674             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
675
676             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
677
678             /* Evaluate switch function */
679             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
680             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
681             velec            = _mm_mul_ps(velec,sw);
682             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
683
684             /* Update potential sum for this i atom from the interaction with this j atom. */
685             velec            = _mm_and_ps(velec,cutoff_mask);
686             velecsum         = _mm_add_ps(velecsum,velec);
687
688             fscal            = felec;
689
690             fscal            = _mm_and_ps(fscal,cutoff_mask);
691
692             /* Calculate temporary vectorial force */
693             tx               = _mm_mul_ps(fscal,dx22);
694             ty               = _mm_mul_ps(fscal,dy22);
695             tz               = _mm_mul_ps(fscal,dz22);
696
697             /* Update vectorial force */
698             fix2             = _mm_add_ps(fix2,tx);
699             fiy2             = _mm_add_ps(fiy2,ty);
700             fiz2             = _mm_add_ps(fiz2,tz);
701
702             fjx2             = _mm_add_ps(fjx2,tx);
703             fjy2             = _mm_add_ps(fjy2,ty);
704             fjz2             = _mm_add_ps(fjz2,tz);
705
706             }
707
708             /**************************
709              * CALCULATE INTERACTIONS *
710              **************************/
711
712             if (gmx_mm_any_lt(rsq23,rcutoff2))
713             {
714
715             r23              = _mm_mul_ps(rsq23,rinv23);
716
717             /* EWALD ELECTROSTATICS */
718
719             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
720             ewrt             = _mm_mul_ps(r23,ewtabscale);
721             ewitab           = _mm_cvttps_epi32(ewrt);
722             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
723             ewitab           = _mm_slli_epi32(ewitab,2);
724             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
725             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
726             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
727             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
728             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
729             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
730             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
731             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
732             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
733
734             d                = _mm_sub_ps(r23,rswitch);
735             d                = _mm_max_ps(d,_mm_setzero_ps());
736             d2               = _mm_mul_ps(d,d);
737             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
738
739             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
740
741             /* Evaluate switch function */
742             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
743             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
744             velec            = _mm_mul_ps(velec,sw);
745             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
746
747             /* Update potential sum for this i atom from the interaction with this j atom. */
748             velec            = _mm_and_ps(velec,cutoff_mask);
749             velecsum         = _mm_add_ps(velecsum,velec);
750
751             fscal            = felec;
752
753             fscal            = _mm_and_ps(fscal,cutoff_mask);
754
755             /* Calculate temporary vectorial force */
756             tx               = _mm_mul_ps(fscal,dx23);
757             ty               = _mm_mul_ps(fscal,dy23);
758             tz               = _mm_mul_ps(fscal,dz23);
759
760             /* Update vectorial force */
761             fix2             = _mm_add_ps(fix2,tx);
762             fiy2             = _mm_add_ps(fiy2,ty);
763             fiz2             = _mm_add_ps(fiz2,tz);
764
765             fjx3             = _mm_add_ps(fjx3,tx);
766             fjy3             = _mm_add_ps(fjy3,ty);
767             fjz3             = _mm_add_ps(fjz3,tz);
768
769             }
770
771             /**************************
772              * CALCULATE INTERACTIONS *
773              **************************/
774
775             if (gmx_mm_any_lt(rsq31,rcutoff2))
776             {
777
778             r31              = _mm_mul_ps(rsq31,rinv31);
779
780             /* EWALD ELECTROSTATICS */
781
782             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
783             ewrt             = _mm_mul_ps(r31,ewtabscale);
784             ewitab           = _mm_cvttps_epi32(ewrt);
785             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
786             ewitab           = _mm_slli_epi32(ewitab,2);
787             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
788             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
789             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
790             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
791             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
792             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
793             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
794             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
795             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
796
797             d                = _mm_sub_ps(r31,rswitch);
798             d                = _mm_max_ps(d,_mm_setzero_ps());
799             d2               = _mm_mul_ps(d,d);
800             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
801
802             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
803
804             /* Evaluate switch function */
805             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
806             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
807             velec            = _mm_mul_ps(velec,sw);
808             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
809
810             /* Update potential sum for this i atom from the interaction with this j atom. */
811             velec            = _mm_and_ps(velec,cutoff_mask);
812             velecsum         = _mm_add_ps(velecsum,velec);
813
814             fscal            = felec;
815
816             fscal            = _mm_and_ps(fscal,cutoff_mask);
817
818             /* Calculate temporary vectorial force */
819             tx               = _mm_mul_ps(fscal,dx31);
820             ty               = _mm_mul_ps(fscal,dy31);
821             tz               = _mm_mul_ps(fscal,dz31);
822
823             /* Update vectorial force */
824             fix3             = _mm_add_ps(fix3,tx);
825             fiy3             = _mm_add_ps(fiy3,ty);
826             fiz3             = _mm_add_ps(fiz3,tz);
827
828             fjx1             = _mm_add_ps(fjx1,tx);
829             fjy1             = _mm_add_ps(fjy1,ty);
830             fjz1             = _mm_add_ps(fjz1,tz);
831
832             }
833
834             /**************************
835              * CALCULATE INTERACTIONS *
836              **************************/
837
838             if (gmx_mm_any_lt(rsq32,rcutoff2))
839             {
840
841             r32              = _mm_mul_ps(rsq32,rinv32);
842
843             /* EWALD ELECTROSTATICS */
844
845             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
846             ewrt             = _mm_mul_ps(r32,ewtabscale);
847             ewitab           = _mm_cvttps_epi32(ewrt);
848             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
849             ewitab           = _mm_slli_epi32(ewitab,2);
850             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
851             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
852             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
853             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
854             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
855             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
856             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
857             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
858             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
859
860             d                = _mm_sub_ps(r32,rswitch);
861             d                = _mm_max_ps(d,_mm_setzero_ps());
862             d2               = _mm_mul_ps(d,d);
863             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
864
865             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
866
867             /* Evaluate switch function */
868             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
869             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
870             velec            = _mm_mul_ps(velec,sw);
871             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
872
873             /* Update potential sum for this i atom from the interaction with this j atom. */
874             velec            = _mm_and_ps(velec,cutoff_mask);
875             velecsum         = _mm_add_ps(velecsum,velec);
876
877             fscal            = felec;
878
879             fscal            = _mm_and_ps(fscal,cutoff_mask);
880
881             /* Calculate temporary vectorial force */
882             tx               = _mm_mul_ps(fscal,dx32);
883             ty               = _mm_mul_ps(fscal,dy32);
884             tz               = _mm_mul_ps(fscal,dz32);
885
886             /* Update vectorial force */
887             fix3             = _mm_add_ps(fix3,tx);
888             fiy3             = _mm_add_ps(fiy3,ty);
889             fiz3             = _mm_add_ps(fiz3,tz);
890
891             fjx2             = _mm_add_ps(fjx2,tx);
892             fjy2             = _mm_add_ps(fjy2,ty);
893             fjz2             = _mm_add_ps(fjz2,tz);
894
895             }
896
897             /**************************
898              * CALCULATE INTERACTIONS *
899              **************************/
900
901             if (gmx_mm_any_lt(rsq33,rcutoff2))
902             {
903
904             r33              = _mm_mul_ps(rsq33,rinv33);
905
906             /* EWALD ELECTROSTATICS */
907
908             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
909             ewrt             = _mm_mul_ps(r33,ewtabscale);
910             ewitab           = _mm_cvttps_epi32(ewrt);
911             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
912             ewitab           = _mm_slli_epi32(ewitab,2);
913             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
914             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
915             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
916             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
917             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
918             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
919             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
920             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
921             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
922
923             d                = _mm_sub_ps(r33,rswitch);
924             d                = _mm_max_ps(d,_mm_setzero_ps());
925             d2               = _mm_mul_ps(d,d);
926             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
927
928             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
929
930             /* Evaluate switch function */
931             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
932             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
933             velec            = _mm_mul_ps(velec,sw);
934             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
935
936             /* Update potential sum for this i atom from the interaction with this j atom. */
937             velec            = _mm_and_ps(velec,cutoff_mask);
938             velecsum         = _mm_add_ps(velecsum,velec);
939
940             fscal            = felec;
941
942             fscal            = _mm_and_ps(fscal,cutoff_mask);
943
944             /* Calculate temporary vectorial force */
945             tx               = _mm_mul_ps(fscal,dx33);
946             ty               = _mm_mul_ps(fscal,dy33);
947             tz               = _mm_mul_ps(fscal,dz33);
948
949             /* Update vectorial force */
950             fix3             = _mm_add_ps(fix3,tx);
951             fiy3             = _mm_add_ps(fiy3,ty);
952             fiz3             = _mm_add_ps(fiz3,tz);
953
954             fjx3             = _mm_add_ps(fjx3,tx);
955             fjy3             = _mm_add_ps(fjy3,ty);
956             fjz3             = _mm_add_ps(fjz3,tz);
957
958             }
959
960             fjptrA             = f+j_coord_offsetA;
961             fjptrB             = f+j_coord_offsetB;
962             fjptrC             = f+j_coord_offsetC;
963             fjptrD             = f+j_coord_offsetD;
964
965             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
966                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
967                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
968
969             /* Inner loop uses 647 flops */
970         }
971
972         if(jidx<j_index_end)
973         {
974
975             /* Get j neighbor index, and coordinate index */
976             jnrlistA         = jjnr[jidx];
977             jnrlistB         = jjnr[jidx+1];
978             jnrlistC         = jjnr[jidx+2];
979             jnrlistD         = jjnr[jidx+3];
980             /* Sign of each element will be negative for non-real atoms.
981              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
982              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
983              */
984             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
985             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
986             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
987             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
988             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
989             j_coord_offsetA  = DIM*jnrA;
990             j_coord_offsetB  = DIM*jnrB;
991             j_coord_offsetC  = DIM*jnrC;
992             j_coord_offsetD  = DIM*jnrD;
993
994             /* load j atom coordinates */
995             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
996                                               x+j_coord_offsetC,x+j_coord_offsetD,
997                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
998                                               &jy2,&jz2,&jx3,&jy3,&jz3);
999
1000             /* Calculate displacement vector */
1001             dx00             = _mm_sub_ps(ix0,jx0);
1002             dy00             = _mm_sub_ps(iy0,jy0);
1003             dz00             = _mm_sub_ps(iz0,jz0);
1004             dx11             = _mm_sub_ps(ix1,jx1);
1005             dy11             = _mm_sub_ps(iy1,jy1);
1006             dz11             = _mm_sub_ps(iz1,jz1);
1007             dx12             = _mm_sub_ps(ix1,jx2);
1008             dy12             = _mm_sub_ps(iy1,jy2);
1009             dz12             = _mm_sub_ps(iz1,jz2);
1010             dx13             = _mm_sub_ps(ix1,jx3);
1011             dy13             = _mm_sub_ps(iy1,jy3);
1012             dz13             = _mm_sub_ps(iz1,jz3);
1013             dx21             = _mm_sub_ps(ix2,jx1);
1014             dy21             = _mm_sub_ps(iy2,jy1);
1015             dz21             = _mm_sub_ps(iz2,jz1);
1016             dx22             = _mm_sub_ps(ix2,jx2);
1017             dy22             = _mm_sub_ps(iy2,jy2);
1018             dz22             = _mm_sub_ps(iz2,jz2);
1019             dx23             = _mm_sub_ps(ix2,jx3);
1020             dy23             = _mm_sub_ps(iy2,jy3);
1021             dz23             = _mm_sub_ps(iz2,jz3);
1022             dx31             = _mm_sub_ps(ix3,jx1);
1023             dy31             = _mm_sub_ps(iy3,jy1);
1024             dz31             = _mm_sub_ps(iz3,jz1);
1025             dx32             = _mm_sub_ps(ix3,jx2);
1026             dy32             = _mm_sub_ps(iy3,jy2);
1027             dz32             = _mm_sub_ps(iz3,jz2);
1028             dx33             = _mm_sub_ps(ix3,jx3);
1029             dy33             = _mm_sub_ps(iy3,jy3);
1030             dz33             = _mm_sub_ps(iz3,jz3);
1031
1032             /* Calculate squared distance and things based on it */
1033             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1034             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1035             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1036             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1037             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1038             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1039             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1040             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1041             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1042             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1043
1044             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1045             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1046             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1047             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1048             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1049             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1050             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1051             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1052             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1053             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1054
1055             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1056             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1057             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1058             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1059             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1060             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1061             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1062             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1063             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1064             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1065
1066             fjx0             = _mm_setzero_ps();
1067             fjy0             = _mm_setzero_ps();
1068             fjz0             = _mm_setzero_ps();
1069             fjx1             = _mm_setzero_ps();
1070             fjy1             = _mm_setzero_ps();
1071             fjz1             = _mm_setzero_ps();
1072             fjx2             = _mm_setzero_ps();
1073             fjy2             = _mm_setzero_ps();
1074             fjz2             = _mm_setzero_ps();
1075             fjx3             = _mm_setzero_ps();
1076             fjy3             = _mm_setzero_ps();
1077             fjz3             = _mm_setzero_ps();
1078
1079             /**************************
1080              * CALCULATE INTERACTIONS *
1081              **************************/
1082
1083             if (gmx_mm_any_lt(rsq00,rcutoff2))
1084             {
1085
1086             r00              = _mm_mul_ps(rsq00,rinv00);
1087             r00              = _mm_andnot_ps(dummy_mask,r00);
1088
1089             /* LENNARD-JONES DISPERSION/REPULSION */
1090
1091             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1092             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1093             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1094             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1095             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1096
1097             d                = _mm_sub_ps(r00,rswitch);
1098             d                = _mm_max_ps(d,_mm_setzero_ps());
1099             d2               = _mm_mul_ps(d,d);
1100             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1101
1102             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1103
1104             /* Evaluate switch function */
1105             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1106             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1107             vvdw             = _mm_mul_ps(vvdw,sw);
1108             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1109
1110             /* Update potential sum for this i atom from the interaction with this j atom. */
1111             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
1112             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
1113             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
1114
1115             fscal            = fvdw;
1116
1117             fscal            = _mm_and_ps(fscal,cutoff_mask);
1118
1119             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1120
1121             /* Calculate temporary vectorial force */
1122             tx               = _mm_mul_ps(fscal,dx00);
1123             ty               = _mm_mul_ps(fscal,dy00);
1124             tz               = _mm_mul_ps(fscal,dz00);
1125
1126             /* Update vectorial force */
1127             fix0             = _mm_add_ps(fix0,tx);
1128             fiy0             = _mm_add_ps(fiy0,ty);
1129             fiz0             = _mm_add_ps(fiz0,tz);
1130
1131             fjx0             = _mm_add_ps(fjx0,tx);
1132             fjy0             = _mm_add_ps(fjy0,ty);
1133             fjz0             = _mm_add_ps(fjz0,tz);
1134
1135             }
1136
1137             /**************************
1138              * CALCULATE INTERACTIONS *
1139              **************************/
1140
1141             if (gmx_mm_any_lt(rsq11,rcutoff2))
1142             {
1143
1144             r11              = _mm_mul_ps(rsq11,rinv11);
1145             r11              = _mm_andnot_ps(dummy_mask,r11);
1146
1147             /* EWALD ELECTROSTATICS */
1148
1149             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1150             ewrt             = _mm_mul_ps(r11,ewtabscale);
1151             ewitab           = _mm_cvttps_epi32(ewrt);
1152             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1153             ewitab           = _mm_slli_epi32(ewitab,2);
1154             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1155             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1156             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1157             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1158             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1159             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1160             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1161             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1162             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1163
1164             d                = _mm_sub_ps(r11,rswitch);
1165             d                = _mm_max_ps(d,_mm_setzero_ps());
1166             d2               = _mm_mul_ps(d,d);
1167             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1168
1169             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1170
1171             /* Evaluate switch function */
1172             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1173             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1174             velec            = _mm_mul_ps(velec,sw);
1175             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1176
1177             /* Update potential sum for this i atom from the interaction with this j atom. */
1178             velec            = _mm_and_ps(velec,cutoff_mask);
1179             velec            = _mm_andnot_ps(dummy_mask,velec);
1180             velecsum         = _mm_add_ps(velecsum,velec);
1181
1182             fscal            = felec;
1183
1184             fscal            = _mm_and_ps(fscal,cutoff_mask);
1185
1186             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1187
1188             /* Calculate temporary vectorial force */
1189             tx               = _mm_mul_ps(fscal,dx11);
1190             ty               = _mm_mul_ps(fscal,dy11);
1191             tz               = _mm_mul_ps(fscal,dz11);
1192
1193             /* Update vectorial force */
1194             fix1             = _mm_add_ps(fix1,tx);
1195             fiy1             = _mm_add_ps(fiy1,ty);
1196             fiz1             = _mm_add_ps(fiz1,tz);
1197
1198             fjx1             = _mm_add_ps(fjx1,tx);
1199             fjy1             = _mm_add_ps(fjy1,ty);
1200             fjz1             = _mm_add_ps(fjz1,tz);
1201
1202             }
1203
1204             /**************************
1205              * CALCULATE INTERACTIONS *
1206              **************************/
1207
1208             if (gmx_mm_any_lt(rsq12,rcutoff2))
1209             {
1210
1211             r12              = _mm_mul_ps(rsq12,rinv12);
1212             r12              = _mm_andnot_ps(dummy_mask,r12);
1213
1214             /* EWALD ELECTROSTATICS */
1215
1216             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1217             ewrt             = _mm_mul_ps(r12,ewtabscale);
1218             ewitab           = _mm_cvttps_epi32(ewrt);
1219             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1220             ewitab           = _mm_slli_epi32(ewitab,2);
1221             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1222             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1223             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1224             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1225             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1226             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1227             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1228             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1229             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1230
1231             d                = _mm_sub_ps(r12,rswitch);
1232             d                = _mm_max_ps(d,_mm_setzero_ps());
1233             d2               = _mm_mul_ps(d,d);
1234             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1235
1236             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1237
1238             /* Evaluate switch function */
1239             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1240             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1241             velec            = _mm_mul_ps(velec,sw);
1242             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1243
1244             /* Update potential sum for this i atom from the interaction with this j atom. */
1245             velec            = _mm_and_ps(velec,cutoff_mask);
1246             velec            = _mm_andnot_ps(dummy_mask,velec);
1247             velecsum         = _mm_add_ps(velecsum,velec);
1248
1249             fscal            = felec;
1250
1251             fscal            = _mm_and_ps(fscal,cutoff_mask);
1252
1253             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1254
1255             /* Calculate temporary vectorial force */
1256             tx               = _mm_mul_ps(fscal,dx12);
1257             ty               = _mm_mul_ps(fscal,dy12);
1258             tz               = _mm_mul_ps(fscal,dz12);
1259
1260             /* Update vectorial force */
1261             fix1             = _mm_add_ps(fix1,tx);
1262             fiy1             = _mm_add_ps(fiy1,ty);
1263             fiz1             = _mm_add_ps(fiz1,tz);
1264
1265             fjx2             = _mm_add_ps(fjx2,tx);
1266             fjy2             = _mm_add_ps(fjy2,ty);
1267             fjz2             = _mm_add_ps(fjz2,tz);
1268
1269             }
1270
1271             /**************************
1272              * CALCULATE INTERACTIONS *
1273              **************************/
1274
1275             if (gmx_mm_any_lt(rsq13,rcutoff2))
1276             {
1277
1278             r13              = _mm_mul_ps(rsq13,rinv13);
1279             r13              = _mm_andnot_ps(dummy_mask,r13);
1280
1281             /* EWALD ELECTROSTATICS */
1282
1283             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1284             ewrt             = _mm_mul_ps(r13,ewtabscale);
1285             ewitab           = _mm_cvttps_epi32(ewrt);
1286             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1287             ewitab           = _mm_slli_epi32(ewitab,2);
1288             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1289             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1290             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1291             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1292             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1293             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1294             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1295             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1296             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1297
1298             d                = _mm_sub_ps(r13,rswitch);
1299             d                = _mm_max_ps(d,_mm_setzero_ps());
1300             d2               = _mm_mul_ps(d,d);
1301             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1302
1303             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1304
1305             /* Evaluate switch function */
1306             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1307             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1308             velec            = _mm_mul_ps(velec,sw);
1309             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1310
1311             /* Update potential sum for this i atom from the interaction with this j atom. */
1312             velec            = _mm_and_ps(velec,cutoff_mask);
1313             velec            = _mm_andnot_ps(dummy_mask,velec);
1314             velecsum         = _mm_add_ps(velecsum,velec);
1315
1316             fscal            = felec;
1317
1318             fscal            = _mm_and_ps(fscal,cutoff_mask);
1319
1320             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1321
1322             /* Calculate temporary vectorial force */
1323             tx               = _mm_mul_ps(fscal,dx13);
1324             ty               = _mm_mul_ps(fscal,dy13);
1325             tz               = _mm_mul_ps(fscal,dz13);
1326
1327             /* Update vectorial force */
1328             fix1             = _mm_add_ps(fix1,tx);
1329             fiy1             = _mm_add_ps(fiy1,ty);
1330             fiz1             = _mm_add_ps(fiz1,tz);
1331
1332             fjx3             = _mm_add_ps(fjx3,tx);
1333             fjy3             = _mm_add_ps(fjy3,ty);
1334             fjz3             = _mm_add_ps(fjz3,tz);
1335
1336             }
1337
1338             /**************************
1339              * CALCULATE INTERACTIONS *
1340              **************************/
1341
1342             if (gmx_mm_any_lt(rsq21,rcutoff2))
1343             {
1344
1345             r21              = _mm_mul_ps(rsq21,rinv21);
1346             r21              = _mm_andnot_ps(dummy_mask,r21);
1347
1348             /* EWALD ELECTROSTATICS */
1349
1350             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1351             ewrt             = _mm_mul_ps(r21,ewtabscale);
1352             ewitab           = _mm_cvttps_epi32(ewrt);
1353             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1354             ewitab           = _mm_slli_epi32(ewitab,2);
1355             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1356             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1357             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1358             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1359             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1360             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1361             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1362             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1363             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1364
1365             d                = _mm_sub_ps(r21,rswitch);
1366             d                = _mm_max_ps(d,_mm_setzero_ps());
1367             d2               = _mm_mul_ps(d,d);
1368             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1369
1370             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1371
1372             /* Evaluate switch function */
1373             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1374             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1375             velec            = _mm_mul_ps(velec,sw);
1376             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1377
1378             /* Update potential sum for this i atom from the interaction with this j atom. */
1379             velec            = _mm_and_ps(velec,cutoff_mask);
1380             velec            = _mm_andnot_ps(dummy_mask,velec);
1381             velecsum         = _mm_add_ps(velecsum,velec);
1382
1383             fscal            = felec;
1384
1385             fscal            = _mm_and_ps(fscal,cutoff_mask);
1386
1387             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1388
1389             /* Calculate temporary vectorial force */
1390             tx               = _mm_mul_ps(fscal,dx21);
1391             ty               = _mm_mul_ps(fscal,dy21);
1392             tz               = _mm_mul_ps(fscal,dz21);
1393
1394             /* Update vectorial force */
1395             fix2             = _mm_add_ps(fix2,tx);
1396             fiy2             = _mm_add_ps(fiy2,ty);
1397             fiz2             = _mm_add_ps(fiz2,tz);
1398
1399             fjx1             = _mm_add_ps(fjx1,tx);
1400             fjy1             = _mm_add_ps(fjy1,ty);
1401             fjz1             = _mm_add_ps(fjz1,tz);
1402
1403             }
1404
1405             /**************************
1406              * CALCULATE INTERACTIONS *
1407              **************************/
1408
1409             if (gmx_mm_any_lt(rsq22,rcutoff2))
1410             {
1411
1412             r22              = _mm_mul_ps(rsq22,rinv22);
1413             r22              = _mm_andnot_ps(dummy_mask,r22);
1414
1415             /* EWALD ELECTROSTATICS */
1416
1417             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1418             ewrt             = _mm_mul_ps(r22,ewtabscale);
1419             ewitab           = _mm_cvttps_epi32(ewrt);
1420             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1421             ewitab           = _mm_slli_epi32(ewitab,2);
1422             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1423             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1424             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1425             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1426             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1427             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1428             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1429             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1430             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1431
1432             d                = _mm_sub_ps(r22,rswitch);
1433             d                = _mm_max_ps(d,_mm_setzero_ps());
1434             d2               = _mm_mul_ps(d,d);
1435             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1436
1437             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1438
1439             /* Evaluate switch function */
1440             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1441             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1442             velec            = _mm_mul_ps(velec,sw);
1443             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1444
1445             /* Update potential sum for this i atom from the interaction with this j atom. */
1446             velec            = _mm_and_ps(velec,cutoff_mask);
1447             velec            = _mm_andnot_ps(dummy_mask,velec);
1448             velecsum         = _mm_add_ps(velecsum,velec);
1449
1450             fscal            = felec;
1451
1452             fscal            = _mm_and_ps(fscal,cutoff_mask);
1453
1454             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1455
1456             /* Calculate temporary vectorial force */
1457             tx               = _mm_mul_ps(fscal,dx22);
1458             ty               = _mm_mul_ps(fscal,dy22);
1459             tz               = _mm_mul_ps(fscal,dz22);
1460
1461             /* Update vectorial force */
1462             fix2             = _mm_add_ps(fix2,tx);
1463             fiy2             = _mm_add_ps(fiy2,ty);
1464             fiz2             = _mm_add_ps(fiz2,tz);
1465
1466             fjx2             = _mm_add_ps(fjx2,tx);
1467             fjy2             = _mm_add_ps(fjy2,ty);
1468             fjz2             = _mm_add_ps(fjz2,tz);
1469
1470             }
1471
1472             /**************************
1473              * CALCULATE INTERACTIONS *
1474              **************************/
1475
1476             if (gmx_mm_any_lt(rsq23,rcutoff2))
1477             {
1478
1479             r23              = _mm_mul_ps(rsq23,rinv23);
1480             r23              = _mm_andnot_ps(dummy_mask,r23);
1481
1482             /* EWALD ELECTROSTATICS */
1483
1484             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1485             ewrt             = _mm_mul_ps(r23,ewtabscale);
1486             ewitab           = _mm_cvttps_epi32(ewrt);
1487             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1488             ewitab           = _mm_slli_epi32(ewitab,2);
1489             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1490             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1491             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1492             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1493             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1494             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1495             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1496             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1497             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1498
1499             d                = _mm_sub_ps(r23,rswitch);
1500             d                = _mm_max_ps(d,_mm_setzero_ps());
1501             d2               = _mm_mul_ps(d,d);
1502             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1503
1504             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1505
1506             /* Evaluate switch function */
1507             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1508             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1509             velec            = _mm_mul_ps(velec,sw);
1510             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1511
1512             /* Update potential sum for this i atom from the interaction with this j atom. */
1513             velec            = _mm_and_ps(velec,cutoff_mask);
1514             velec            = _mm_andnot_ps(dummy_mask,velec);
1515             velecsum         = _mm_add_ps(velecsum,velec);
1516
1517             fscal            = felec;
1518
1519             fscal            = _mm_and_ps(fscal,cutoff_mask);
1520
1521             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1522
1523             /* Calculate temporary vectorial force */
1524             tx               = _mm_mul_ps(fscal,dx23);
1525             ty               = _mm_mul_ps(fscal,dy23);
1526             tz               = _mm_mul_ps(fscal,dz23);
1527
1528             /* Update vectorial force */
1529             fix2             = _mm_add_ps(fix2,tx);
1530             fiy2             = _mm_add_ps(fiy2,ty);
1531             fiz2             = _mm_add_ps(fiz2,tz);
1532
1533             fjx3             = _mm_add_ps(fjx3,tx);
1534             fjy3             = _mm_add_ps(fjy3,ty);
1535             fjz3             = _mm_add_ps(fjz3,tz);
1536
1537             }
1538
1539             /**************************
1540              * CALCULATE INTERACTIONS *
1541              **************************/
1542
1543             if (gmx_mm_any_lt(rsq31,rcutoff2))
1544             {
1545
1546             r31              = _mm_mul_ps(rsq31,rinv31);
1547             r31              = _mm_andnot_ps(dummy_mask,r31);
1548
1549             /* EWALD ELECTROSTATICS */
1550
1551             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1552             ewrt             = _mm_mul_ps(r31,ewtabscale);
1553             ewitab           = _mm_cvttps_epi32(ewrt);
1554             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1555             ewitab           = _mm_slli_epi32(ewitab,2);
1556             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1557             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1558             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1559             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1560             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1561             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1562             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1563             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1564             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1565
1566             d                = _mm_sub_ps(r31,rswitch);
1567             d                = _mm_max_ps(d,_mm_setzero_ps());
1568             d2               = _mm_mul_ps(d,d);
1569             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1570
1571             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1572
1573             /* Evaluate switch function */
1574             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1575             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1576             velec            = _mm_mul_ps(velec,sw);
1577             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1578
1579             /* Update potential sum for this i atom from the interaction with this j atom. */
1580             velec            = _mm_and_ps(velec,cutoff_mask);
1581             velec            = _mm_andnot_ps(dummy_mask,velec);
1582             velecsum         = _mm_add_ps(velecsum,velec);
1583
1584             fscal            = felec;
1585
1586             fscal            = _mm_and_ps(fscal,cutoff_mask);
1587
1588             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1589
1590             /* Calculate temporary vectorial force */
1591             tx               = _mm_mul_ps(fscal,dx31);
1592             ty               = _mm_mul_ps(fscal,dy31);
1593             tz               = _mm_mul_ps(fscal,dz31);
1594
1595             /* Update vectorial force */
1596             fix3             = _mm_add_ps(fix3,tx);
1597             fiy3             = _mm_add_ps(fiy3,ty);
1598             fiz3             = _mm_add_ps(fiz3,tz);
1599
1600             fjx1             = _mm_add_ps(fjx1,tx);
1601             fjy1             = _mm_add_ps(fjy1,ty);
1602             fjz1             = _mm_add_ps(fjz1,tz);
1603
1604             }
1605
1606             /**************************
1607              * CALCULATE INTERACTIONS *
1608              **************************/
1609
1610             if (gmx_mm_any_lt(rsq32,rcutoff2))
1611             {
1612
1613             r32              = _mm_mul_ps(rsq32,rinv32);
1614             r32              = _mm_andnot_ps(dummy_mask,r32);
1615
1616             /* EWALD ELECTROSTATICS */
1617
1618             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1619             ewrt             = _mm_mul_ps(r32,ewtabscale);
1620             ewitab           = _mm_cvttps_epi32(ewrt);
1621             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1622             ewitab           = _mm_slli_epi32(ewitab,2);
1623             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1624             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1625             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1626             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1627             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1628             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1629             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1630             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1631             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1632
1633             d                = _mm_sub_ps(r32,rswitch);
1634             d                = _mm_max_ps(d,_mm_setzero_ps());
1635             d2               = _mm_mul_ps(d,d);
1636             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1637
1638             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1639
1640             /* Evaluate switch function */
1641             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1642             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1643             velec            = _mm_mul_ps(velec,sw);
1644             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1645
1646             /* Update potential sum for this i atom from the interaction with this j atom. */
1647             velec            = _mm_and_ps(velec,cutoff_mask);
1648             velec            = _mm_andnot_ps(dummy_mask,velec);
1649             velecsum         = _mm_add_ps(velecsum,velec);
1650
1651             fscal            = felec;
1652
1653             fscal            = _mm_and_ps(fscal,cutoff_mask);
1654
1655             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1656
1657             /* Calculate temporary vectorial force */
1658             tx               = _mm_mul_ps(fscal,dx32);
1659             ty               = _mm_mul_ps(fscal,dy32);
1660             tz               = _mm_mul_ps(fscal,dz32);
1661
1662             /* Update vectorial force */
1663             fix3             = _mm_add_ps(fix3,tx);
1664             fiy3             = _mm_add_ps(fiy3,ty);
1665             fiz3             = _mm_add_ps(fiz3,tz);
1666
1667             fjx2             = _mm_add_ps(fjx2,tx);
1668             fjy2             = _mm_add_ps(fjy2,ty);
1669             fjz2             = _mm_add_ps(fjz2,tz);
1670
1671             }
1672
1673             /**************************
1674              * CALCULATE INTERACTIONS *
1675              **************************/
1676
1677             if (gmx_mm_any_lt(rsq33,rcutoff2))
1678             {
1679
1680             r33              = _mm_mul_ps(rsq33,rinv33);
1681             r33              = _mm_andnot_ps(dummy_mask,r33);
1682
1683             /* EWALD ELECTROSTATICS */
1684
1685             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1686             ewrt             = _mm_mul_ps(r33,ewtabscale);
1687             ewitab           = _mm_cvttps_epi32(ewrt);
1688             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1689             ewitab           = _mm_slli_epi32(ewitab,2);
1690             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1691             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1692             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1693             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1694             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1695             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1696             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1697             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1698             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1699
1700             d                = _mm_sub_ps(r33,rswitch);
1701             d                = _mm_max_ps(d,_mm_setzero_ps());
1702             d2               = _mm_mul_ps(d,d);
1703             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1704
1705             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1706
1707             /* Evaluate switch function */
1708             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1709             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1710             velec            = _mm_mul_ps(velec,sw);
1711             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1712
1713             /* Update potential sum for this i atom from the interaction with this j atom. */
1714             velec            = _mm_and_ps(velec,cutoff_mask);
1715             velec            = _mm_andnot_ps(dummy_mask,velec);
1716             velecsum         = _mm_add_ps(velecsum,velec);
1717
1718             fscal            = felec;
1719
1720             fscal            = _mm_and_ps(fscal,cutoff_mask);
1721
1722             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1723
1724             /* Calculate temporary vectorial force */
1725             tx               = _mm_mul_ps(fscal,dx33);
1726             ty               = _mm_mul_ps(fscal,dy33);
1727             tz               = _mm_mul_ps(fscal,dz33);
1728
1729             /* Update vectorial force */
1730             fix3             = _mm_add_ps(fix3,tx);
1731             fiy3             = _mm_add_ps(fiy3,ty);
1732             fiz3             = _mm_add_ps(fiz3,tz);
1733
1734             fjx3             = _mm_add_ps(fjx3,tx);
1735             fjy3             = _mm_add_ps(fjy3,ty);
1736             fjz3             = _mm_add_ps(fjz3,tz);
1737
1738             }
1739
1740             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1741             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1742             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1743             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1744
1745             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1746                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1747                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1748
1749             /* Inner loop uses 657 flops */
1750         }
1751
1752         /* End of innermost loop */
1753
1754         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1755                                               f+i_coord_offset,fshift+i_shift_offset);
1756
1757         ggid                        = gid[iidx];
1758         /* Update potential energies */
1759         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1760         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1761
1762         /* Increment number of inner iterations */
1763         inneriter                  += j_index_end - j_index_start;
1764
1765         /* Outer loop uses 26 flops */
1766     }
1767
1768     /* Increment number of outer iterations */
1769     outeriter        += nri;
1770
1771     /* Update outer/inner flops */
1772
1773     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*657);
1774 }
1775 /*
1776  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_single
1777  * Electrostatics interaction: Ewald
1778  * VdW interaction:            LennardJones
1779  * Geometry:                   Water4-Water4
1780  * Calculate force/pot:        Force
1781  */
1782 void
1783 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_single
1784                     (t_nblist                    * gmx_restrict       nlist,
1785                      rvec                        * gmx_restrict          xx,
1786                      rvec                        * gmx_restrict          ff,
1787                      t_forcerec                  * gmx_restrict          fr,
1788                      t_mdatoms                   * gmx_restrict     mdatoms,
1789                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1790                      t_nrnb                      * gmx_restrict        nrnb)
1791 {
1792     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1793      * just 0 for non-waters.
1794      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1795      * jnr indices corresponding to data put in the four positions in the SIMD register.
1796      */
1797     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1798     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1799     int              jnrA,jnrB,jnrC,jnrD;
1800     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1801     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1802     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1803     real             rcutoff_scalar;
1804     real             *shiftvec,*fshift,*x,*f;
1805     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1806     real             scratch[4*DIM];
1807     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1808     int              vdwioffset0;
1809     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1810     int              vdwioffset1;
1811     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1812     int              vdwioffset2;
1813     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1814     int              vdwioffset3;
1815     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1816     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1817     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1818     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1819     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1820     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1821     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1822     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1823     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1824     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1825     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1826     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1827     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1828     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1829     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1830     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1831     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1832     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1833     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1834     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1835     real             *charge;
1836     int              nvdwtype;
1837     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1838     int              *vdwtype;
1839     real             *vdwparam;
1840     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1841     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1842     __m128i          ewitab;
1843     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1844     real             *ewtab;
1845     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1846     real             rswitch_scalar,d_scalar;
1847     __m128           dummy_mask,cutoff_mask;
1848     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1849     __m128           one     = _mm_set1_ps(1.0);
1850     __m128           two     = _mm_set1_ps(2.0);
1851     x                = xx[0];
1852     f                = ff[0];
1853
1854     nri              = nlist->nri;
1855     iinr             = nlist->iinr;
1856     jindex           = nlist->jindex;
1857     jjnr             = nlist->jjnr;
1858     shiftidx         = nlist->shift;
1859     gid              = nlist->gid;
1860     shiftvec         = fr->shift_vec[0];
1861     fshift           = fr->fshift[0];
1862     facel            = _mm_set1_ps(fr->epsfac);
1863     charge           = mdatoms->chargeA;
1864     nvdwtype         = fr->ntype;
1865     vdwparam         = fr->nbfp;
1866     vdwtype          = mdatoms->typeA;
1867
1868     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1869     ewtab            = fr->ic->tabq_coul_FDV0;
1870     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1871     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1872
1873     /* Setup water-specific parameters */
1874     inr              = nlist->iinr[0];
1875     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1876     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1877     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1878     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1879
1880     jq1              = _mm_set1_ps(charge[inr+1]);
1881     jq2              = _mm_set1_ps(charge[inr+2]);
1882     jq3              = _mm_set1_ps(charge[inr+3]);
1883     vdwjidx0A        = 2*vdwtype[inr+0];
1884     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1885     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1886     qq11             = _mm_mul_ps(iq1,jq1);
1887     qq12             = _mm_mul_ps(iq1,jq2);
1888     qq13             = _mm_mul_ps(iq1,jq3);
1889     qq21             = _mm_mul_ps(iq2,jq1);
1890     qq22             = _mm_mul_ps(iq2,jq2);
1891     qq23             = _mm_mul_ps(iq2,jq3);
1892     qq31             = _mm_mul_ps(iq3,jq1);
1893     qq32             = _mm_mul_ps(iq3,jq2);
1894     qq33             = _mm_mul_ps(iq3,jq3);
1895
1896     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1897     rcutoff_scalar   = fr->rcoulomb;
1898     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1899     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1900
1901     rswitch_scalar   = fr->rcoulomb_switch;
1902     rswitch          = _mm_set1_ps(rswitch_scalar);
1903     /* Setup switch parameters */
1904     d_scalar         = rcutoff_scalar-rswitch_scalar;
1905     d                = _mm_set1_ps(d_scalar);
1906     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1907     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1908     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1909     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1910     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1911     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1912
1913     /* Avoid stupid compiler warnings */
1914     jnrA = jnrB = jnrC = jnrD = 0;
1915     j_coord_offsetA = 0;
1916     j_coord_offsetB = 0;
1917     j_coord_offsetC = 0;
1918     j_coord_offsetD = 0;
1919
1920     outeriter        = 0;
1921     inneriter        = 0;
1922
1923     for(iidx=0;iidx<4*DIM;iidx++)
1924     {
1925         scratch[iidx] = 0.0;
1926     }
1927
1928     /* Start outer loop over neighborlists */
1929     for(iidx=0; iidx<nri; iidx++)
1930     {
1931         /* Load shift vector for this list */
1932         i_shift_offset   = DIM*shiftidx[iidx];
1933
1934         /* Load limits for loop over neighbors */
1935         j_index_start    = jindex[iidx];
1936         j_index_end      = jindex[iidx+1];
1937
1938         /* Get outer coordinate index */
1939         inr              = iinr[iidx];
1940         i_coord_offset   = DIM*inr;
1941
1942         /* Load i particle coords and add shift vector */
1943         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1944                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1945
1946         fix0             = _mm_setzero_ps();
1947         fiy0             = _mm_setzero_ps();
1948         fiz0             = _mm_setzero_ps();
1949         fix1             = _mm_setzero_ps();
1950         fiy1             = _mm_setzero_ps();
1951         fiz1             = _mm_setzero_ps();
1952         fix2             = _mm_setzero_ps();
1953         fiy2             = _mm_setzero_ps();
1954         fiz2             = _mm_setzero_ps();
1955         fix3             = _mm_setzero_ps();
1956         fiy3             = _mm_setzero_ps();
1957         fiz3             = _mm_setzero_ps();
1958
1959         /* Start inner kernel loop */
1960         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1961         {
1962
1963             /* Get j neighbor index, and coordinate index */
1964             jnrA             = jjnr[jidx];
1965             jnrB             = jjnr[jidx+1];
1966             jnrC             = jjnr[jidx+2];
1967             jnrD             = jjnr[jidx+3];
1968             j_coord_offsetA  = DIM*jnrA;
1969             j_coord_offsetB  = DIM*jnrB;
1970             j_coord_offsetC  = DIM*jnrC;
1971             j_coord_offsetD  = DIM*jnrD;
1972
1973             /* load j atom coordinates */
1974             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1975                                               x+j_coord_offsetC,x+j_coord_offsetD,
1976                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1977                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1978
1979             /* Calculate displacement vector */
1980             dx00             = _mm_sub_ps(ix0,jx0);
1981             dy00             = _mm_sub_ps(iy0,jy0);
1982             dz00             = _mm_sub_ps(iz0,jz0);
1983             dx11             = _mm_sub_ps(ix1,jx1);
1984             dy11             = _mm_sub_ps(iy1,jy1);
1985             dz11             = _mm_sub_ps(iz1,jz1);
1986             dx12             = _mm_sub_ps(ix1,jx2);
1987             dy12             = _mm_sub_ps(iy1,jy2);
1988             dz12             = _mm_sub_ps(iz1,jz2);
1989             dx13             = _mm_sub_ps(ix1,jx3);
1990             dy13             = _mm_sub_ps(iy1,jy3);
1991             dz13             = _mm_sub_ps(iz1,jz3);
1992             dx21             = _mm_sub_ps(ix2,jx1);
1993             dy21             = _mm_sub_ps(iy2,jy1);
1994             dz21             = _mm_sub_ps(iz2,jz1);
1995             dx22             = _mm_sub_ps(ix2,jx2);
1996             dy22             = _mm_sub_ps(iy2,jy2);
1997             dz22             = _mm_sub_ps(iz2,jz2);
1998             dx23             = _mm_sub_ps(ix2,jx3);
1999             dy23             = _mm_sub_ps(iy2,jy3);
2000             dz23             = _mm_sub_ps(iz2,jz3);
2001             dx31             = _mm_sub_ps(ix3,jx1);
2002             dy31             = _mm_sub_ps(iy3,jy1);
2003             dz31             = _mm_sub_ps(iz3,jz1);
2004             dx32             = _mm_sub_ps(ix3,jx2);
2005             dy32             = _mm_sub_ps(iy3,jy2);
2006             dz32             = _mm_sub_ps(iz3,jz2);
2007             dx33             = _mm_sub_ps(ix3,jx3);
2008             dy33             = _mm_sub_ps(iy3,jy3);
2009             dz33             = _mm_sub_ps(iz3,jz3);
2010
2011             /* Calculate squared distance and things based on it */
2012             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2013             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2014             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2015             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2016             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2017             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2018             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2019             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2020             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2021             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2022
2023             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2024             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2025             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2026             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2027             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2028             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2029             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2030             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2031             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2032             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2033
2034             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2035             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2036             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2037             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2038             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2039             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2040             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2041             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2042             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2043             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2044
2045             fjx0             = _mm_setzero_ps();
2046             fjy0             = _mm_setzero_ps();
2047             fjz0             = _mm_setzero_ps();
2048             fjx1             = _mm_setzero_ps();
2049             fjy1             = _mm_setzero_ps();
2050             fjz1             = _mm_setzero_ps();
2051             fjx2             = _mm_setzero_ps();
2052             fjy2             = _mm_setzero_ps();
2053             fjz2             = _mm_setzero_ps();
2054             fjx3             = _mm_setzero_ps();
2055             fjy3             = _mm_setzero_ps();
2056             fjz3             = _mm_setzero_ps();
2057
2058             /**************************
2059              * CALCULATE INTERACTIONS *
2060              **************************/
2061
2062             if (gmx_mm_any_lt(rsq00,rcutoff2))
2063             {
2064
2065             r00              = _mm_mul_ps(rsq00,rinv00);
2066
2067             /* LENNARD-JONES DISPERSION/REPULSION */
2068
2069             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2070             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2071             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2072             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2073             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2074
2075             d                = _mm_sub_ps(r00,rswitch);
2076             d                = _mm_max_ps(d,_mm_setzero_ps());
2077             d2               = _mm_mul_ps(d,d);
2078             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2079
2080             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2081
2082             /* Evaluate switch function */
2083             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2084             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2085             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2086
2087             fscal            = fvdw;
2088
2089             fscal            = _mm_and_ps(fscal,cutoff_mask);
2090
2091             /* Calculate temporary vectorial force */
2092             tx               = _mm_mul_ps(fscal,dx00);
2093             ty               = _mm_mul_ps(fscal,dy00);
2094             tz               = _mm_mul_ps(fscal,dz00);
2095
2096             /* Update vectorial force */
2097             fix0             = _mm_add_ps(fix0,tx);
2098             fiy0             = _mm_add_ps(fiy0,ty);
2099             fiz0             = _mm_add_ps(fiz0,tz);
2100
2101             fjx0             = _mm_add_ps(fjx0,tx);
2102             fjy0             = _mm_add_ps(fjy0,ty);
2103             fjz0             = _mm_add_ps(fjz0,tz);
2104
2105             }
2106
2107             /**************************
2108              * CALCULATE INTERACTIONS *
2109              **************************/
2110
2111             if (gmx_mm_any_lt(rsq11,rcutoff2))
2112             {
2113
2114             r11              = _mm_mul_ps(rsq11,rinv11);
2115
2116             /* EWALD ELECTROSTATICS */
2117
2118             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2119             ewrt             = _mm_mul_ps(r11,ewtabscale);
2120             ewitab           = _mm_cvttps_epi32(ewrt);
2121             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2122             ewitab           = _mm_slli_epi32(ewitab,2);
2123             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2124             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2125             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2126             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2127             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2128             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2129             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2130             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2131             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2132
2133             d                = _mm_sub_ps(r11,rswitch);
2134             d                = _mm_max_ps(d,_mm_setzero_ps());
2135             d2               = _mm_mul_ps(d,d);
2136             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2137
2138             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2139
2140             /* Evaluate switch function */
2141             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2142             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2143             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2144
2145             fscal            = felec;
2146
2147             fscal            = _mm_and_ps(fscal,cutoff_mask);
2148
2149             /* Calculate temporary vectorial force */
2150             tx               = _mm_mul_ps(fscal,dx11);
2151             ty               = _mm_mul_ps(fscal,dy11);
2152             tz               = _mm_mul_ps(fscal,dz11);
2153
2154             /* Update vectorial force */
2155             fix1             = _mm_add_ps(fix1,tx);
2156             fiy1             = _mm_add_ps(fiy1,ty);
2157             fiz1             = _mm_add_ps(fiz1,tz);
2158
2159             fjx1             = _mm_add_ps(fjx1,tx);
2160             fjy1             = _mm_add_ps(fjy1,ty);
2161             fjz1             = _mm_add_ps(fjz1,tz);
2162
2163             }
2164
2165             /**************************
2166              * CALCULATE INTERACTIONS *
2167              **************************/
2168
2169             if (gmx_mm_any_lt(rsq12,rcutoff2))
2170             {
2171
2172             r12              = _mm_mul_ps(rsq12,rinv12);
2173
2174             /* EWALD ELECTROSTATICS */
2175
2176             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2177             ewrt             = _mm_mul_ps(r12,ewtabscale);
2178             ewitab           = _mm_cvttps_epi32(ewrt);
2179             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2180             ewitab           = _mm_slli_epi32(ewitab,2);
2181             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2182             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2183             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2184             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2185             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2186             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2187             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2188             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2189             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2190
2191             d                = _mm_sub_ps(r12,rswitch);
2192             d                = _mm_max_ps(d,_mm_setzero_ps());
2193             d2               = _mm_mul_ps(d,d);
2194             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2195
2196             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2197
2198             /* Evaluate switch function */
2199             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2200             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2201             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2202
2203             fscal            = felec;
2204
2205             fscal            = _mm_and_ps(fscal,cutoff_mask);
2206
2207             /* Calculate temporary vectorial force */
2208             tx               = _mm_mul_ps(fscal,dx12);
2209             ty               = _mm_mul_ps(fscal,dy12);
2210             tz               = _mm_mul_ps(fscal,dz12);
2211
2212             /* Update vectorial force */
2213             fix1             = _mm_add_ps(fix1,tx);
2214             fiy1             = _mm_add_ps(fiy1,ty);
2215             fiz1             = _mm_add_ps(fiz1,tz);
2216
2217             fjx2             = _mm_add_ps(fjx2,tx);
2218             fjy2             = _mm_add_ps(fjy2,ty);
2219             fjz2             = _mm_add_ps(fjz2,tz);
2220
2221             }
2222
2223             /**************************
2224              * CALCULATE INTERACTIONS *
2225              **************************/
2226
2227             if (gmx_mm_any_lt(rsq13,rcutoff2))
2228             {
2229
2230             r13              = _mm_mul_ps(rsq13,rinv13);
2231
2232             /* EWALD ELECTROSTATICS */
2233
2234             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2235             ewrt             = _mm_mul_ps(r13,ewtabscale);
2236             ewitab           = _mm_cvttps_epi32(ewrt);
2237             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2238             ewitab           = _mm_slli_epi32(ewitab,2);
2239             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2240             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2241             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2242             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2243             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2244             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2245             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2246             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2247             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2248
2249             d                = _mm_sub_ps(r13,rswitch);
2250             d                = _mm_max_ps(d,_mm_setzero_ps());
2251             d2               = _mm_mul_ps(d,d);
2252             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2253
2254             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2255
2256             /* Evaluate switch function */
2257             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2258             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2259             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2260
2261             fscal            = felec;
2262
2263             fscal            = _mm_and_ps(fscal,cutoff_mask);
2264
2265             /* Calculate temporary vectorial force */
2266             tx               = _mm_mul_ps(fscal,dx13);
2267             ty               = _mm_mul_ps(fscal,dy13);
2268             tz               = _mm_mul_ps(fscal,dz13);
2269
2270             /* Update vectorial force */
2271             fix1             = _mm_add_ps(fix1,tx);
2272             fiy1             = _mm_add_ps(fiy1,ty);
2273             fiz1             = _mm_add_ps(fiz1,tz);
2274
2275             fjx3             = _mm_add_ps(fjx3,tx);
2276             fjy3             = _mm_add_ps(fjy3,ty);
2277             fjz3             = _mm_add_ps(fjz3,tz);
2278
2279             }
2280
2281             /**************************
2282              * CALCULATE INTERACTIONS *
2283              **************************/
2284
2285             if (gmx_mm_any_lt(rsq21,rcutoff2))
2286             {
2287
2288             r21              = _mm_mul_ps(rsq21,rinv21);
2289
2290             /* EWALD ELECTROSTATICS */
2291
2292             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2293             ewrt             = _mm_mul_ps(r21,ewtabscale);
2294             ewitab           = _mm_cvttps_epi32(ewrt);
2295             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2296             ewitab           = _mm_slli_epi32(ewitab,2);
2297             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2298             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2299             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2300             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2301             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2302             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2303             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2304             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2305             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2306
2307             d                = _mm_sub_ps(r21,rswitch);
2308             d                = _mm_max_ps(d,_mm_setzero_ps());
2309             d2               = _mm_mul_ps(d,d);
2310             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2311
2312             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2313
2314             /* Evaluate switch function */
2315             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2316             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2317             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2318
2319             fscal            = felec;
2320
2321             fscal            = _mm_and_ps(fscal,cutoff_mask);
2322
2323             /* Calculate temporary vectorial force */
2324             tx               = _mm_mul_ps(fscal,dx21);
2325             ty               = _mm_mul_ps(fscal,dy21);
2326             tz               = _mm_mul_ps(fscal,dz21);
2327
2328             /* Update vectorial force */
2329             fix2             = _mm_add_ps(fix2,tx);
2330             fiy2             = _mm_add_ps(fiy2,ty);
2331             fiz2             = _mm_add_ps(fiz2,tz);
2332
2333             fjx1             = _mm_add_ps(fjx1,tx);
2334             fjy1             = _mm_add_ps(fjy1,ty);
2335             fjz1             = _mm_add_ps(fjz1,tz);
2336
2337             }
2338
2339             /**************************
2340              * CALCULATE INTERACTIONS *
2341              **************************/
2342
2343             if (gmx_mm_any_lt(rsq22,rcutoff2))
2344             {
2345
2346             r22              = _mm_mul_ps(rsq22,rinv22);
2347
2348             /* EWALD ELECTROSTATICS */
2349
2350             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2351             ewrt             = _mm_mul_ps(r22,ewtabscale);
2352             ewitab           = _mm_cvttps_epi32(ewrt);
2353             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2354             ewitab           = _mm_slli_epi32(ewitab,2);
2355             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2356             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2357             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2358             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2359             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2360             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2361             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2362             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2363             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2364
2365             d                = _mm_sub_ps(r22,rswitch);
2366             d                = _mm_max_ps(d,_mm_setzero_ps());
2367             d2               = _mm_mul_ps(d,d);
2368             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2369
2370             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2371
2372             /* Evaluate switch function */
2373             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2374             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2375             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2376
2377             fscal            = felec;
2378
2379             fscal            = _mm_and_ps(fscal,cutoff_mask);
2380
2381             /* Calculate temporary vectorial force */
2382             tx               = _mm_mul_ps(fscal,dx22);
2383             ty               = _mm_mul_ps(fscal,dy22);
2384             tz               = _mm_mul_ps(fscal,dz22);
2385
2386             /* Update vectorial force */
2387             fix2             = _mm_add_ps(fix2,tx);
2388             fiy2             = _mm_add_ps(fiy2,ty);
2389             fiz2             = _mm_add_ps(fiz2,tz);
2390
2391             fjx2             = _mm_add_ps(fjx2,tx);
2392             fjy2             = _mm_add_ps(fjy2,ty);
2393             fjz2             = _mm_add_ps(fjz2,tz);
2394
2395             }
2396
2397             /**************************
2398              * CALCULATE INTERACTIONS *
2399              **************************/
2400
2401             if (gmx_mm_any_lt(rsq23,rcutoff2))
2402             {
2403
2404             r23              = _mm_mul_ps(rsq23,rinv23);
2405
2406             /* EWALD ELECTROSTATICS */
2407
2408             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2409             ewrt             = _mm_mul_ps(r23,ewtabscale);
2410             ewitab           = _mm_cvttps_epi32(ewrt);
2411             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2412             ewitab           = _mm_slli_epi32(ewitab,2);
2413             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2414             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2415             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2416             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2417             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2418             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2419             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2420             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2421             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2422
2423             d                = _mm_sub_ps(r23,rswitch);
2424             d                = _mm_max_ps(d,_mm_setzero_ps());
2425             d2               = _mm_mul_ps(d,d);
2426             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2427
2428             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2429
2430             /* Evaluate switch function */
2431             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2432             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2433             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2434
2435             fscal            = felec;
2436
2437             fscal            = _mm_and_ps(fscal,cutoff_mask);
2438
2439             /* Calculate temporary vectorial force */
2440             tx               = _mm_mul_ps(fscal,dx23);
2441             ty               = _mm_mul_ps(fscal,dy23);
2442             tz               = _mm_mul_ps(fscal,dz23);
2443
2444             /* Update vectorial force */
2445             fix2             = _mm_add_ps(fix2,tx);
2446             fiy2             = _mm_add_ps(fiy2,ty);
2447             fiz2             = _mm_add_ps(fiz2,tz);
2448
2449             fjx3             = _mm_add_ps(fjx3,tx);
2450             fjy3             = _mm_add_ps(fjy3,ty);
2451             fjz3             = _mm_add_ps(fjz3,tz);
2452
2453             }
2454
2455             /**************************
2456              * CALCULATE INTERACTIONS *
2457              **************************/
2458
2459             if (gmx_mm_any_lt(rsq31,rcutoff2))
2460             {
2461
2462             r31              = _mm_mul_ps(rsq31,rinv31);
2463
2464             /* EWALD ELECTROSTATICS */
2465
2466             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2467             ewrt             = _mm_mul_ps(r31,ewtabscale);
2468             ewitab           = _mm_cvttps_epi32(ewrt);
2469             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2470             ewitab           = _mm_slli_epi32(ewitab,2);
2471             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2472             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2473             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2474             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2475             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2476             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2477             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2478             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2479             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2480
2481             d                = _mm_sub_ps(r31,rswitch);
2482             d                = _mm_max_ps(d,_mm_setzero_ps());
2483             d2               = _mm_mul_ps(d,d);
2484             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2485
2486             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2487
2488             /* Evaluate switch function */
2489             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2490             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2491             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2492
2493             fscal            = felec;
2494
2495             fscal            = _mm_and_ps(fscal,cutoff_mask);
2496
2497             /* Calculate temporary vectorial force */
2498             tx               = _mm_mul_ps(fscal,dx31);
2499             ty               = _mm_mul_ps(fscal,dy31);
2500             tz               = _mm_mul_ps(fscal,dz31);
2501
2502             /* Update vectorial force */
2503             fix3             = _mm_add_ps(fix3,tx);
2504             fiy3             = _mm_add_ps(fiy3,ty);
2505             fiz3             = _mm_add_ps(fiz3,tz);
2506
2507             fjx1             = _mm_add_ps(fjx1,tx);
2508             fjy1             = _mm_add_ps(fjy1,ty);
2509             fjz1             = _mm_add_ps(fjz1,tz);
2510
2511             }
2512
2513             /**************************
2514              * CALCULATE INTERACTIONS *
2515              **************************/
2516
2517             if (gmx_mm_any_lt(rsq32,rcutoff2))
2518             {
2519
2520             r32              = _mm_mul_ps(rsq32,rinv32);
2521
2522             /* EWALD ELECTROSTATICS */
2523
2524             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2525             ewrt             = _mm_mul_ps(r32,ewtabscale);
2526             ewitab           = _mm_cvttps_epi32(ewrt);
2527             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2528             ewitab           = _mm_slli_epi32(ewitab,2);
2529             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2530             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2531             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2532             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2533             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2534             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2535             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2536             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2537             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2538
2539             d                = _mm_sub_ps(r32,rswitch);
2540             d                = _mm_max_ps(d,_mm_setzero_ps());
2541             d2               = _mm_mul_ps(d,d);
2542             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2543
2544             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2545
2546             /* Evaluate switch function */
2547             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2548             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2549             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2550
2551             fscal            = felec;
2552
2553             fscal            = _mm_and_ps(fscal,cutoff_mask);
2554
2555             /* Calculate temporary vectorial force */
2556             tx               = _mm_mul_ps(fscal,dx32);
2557             ty               = _mm_mul_ps(fscal,dy32);
2558             tz               = _mm_mul_ps(fscal,dz32);
2559
2560             /* Update vectorial force */
2561             fix3             = _mm_add_ps(fix3,tx);
2562             fiy3             = _mm_add_ps(fiy3,ty);
2563             fiz3             = _mm_add_ps(fiz3,tz);
2564
2565             fjx2             = _mm_add_ps(fjx2,tx);
2566             fjy2             = _mm_add_ps(fjy2,ty);
2567             fjz2             = _mm_add_ps(fjz2,tz);
2568
2569             }
2570
2571             /**************************
2572              * CALCULATE INTERACTIONS *
2573              **************************/
2574
2575             if (gmx_mm_any_lt(rsq33,rcutoff2))
2576             {
2577
2578             r33              = _mm_mul_ps(rsq33,rinv33);
2579
2580             /* EWALD ELECTROSTATICS */
2581
2582             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2583             ewrt             = _mm_mul_ps(r33,ewtabscale);
2584             ewitab           = _mm_cvttps_epi32(ewrt);
2585             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2586             ewitab           = _mm_slli_epi32(ewitab,2);
2587             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2588             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2589             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2590             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2591             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2592             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2593             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2594             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2595             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2596
2597             d                = _mm_sub_ps(r33,rswitch);
2598             d                = _mm_max_ps(d,_mm_setzero_ps());
2599             d2               = _mm_mul_ps(d,d);
2600             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2601
2602             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2603
2604             /* Evaluate switch function */
2605             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2606             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2607             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2608
2609             fscal            = felec;
2610
2611             fscal            = _mm_and_ps(fscal,cutoff_mask);
2612
2613             /* Calculate temporary vectorial force */
2614             tx               = _mm_mul_ps(fscal,dx33);
2615             ty               = _mm_mul_ps(fscal,dy33);
2616             tz               = _mm_mul_ps(fscal,dz33);
2617
2618             /* Update vectorial force */
2619             fix3             = _mm_add_ps(fix3,tx);
2620             fiy3             = _mm_add_ps(fiy3,ty);
2621             fiz3             = _mm_add_ps(fiz3,tz);
2622
2623             fjx3             = _mm_add_ps(fjx3,tx);
2624             fjy3             = _mm_add_ps(fjy3,ty);
2625             fjz3             = _mm_add_ps(fjz3,tz);
2626
2627             }
2628
2629             fjptrA             = f+j_coord_offsetA;
2630             fjptrB             = f+j_coord_offsetB;
2631             fjptrC             = f+j_coord_offsetC;
2632             fjptrD             = f+j_coord_offsetD;
2633
2634             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2635                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2636                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2637
2638             /* Inner loop uses 617 flops */
2639         }
2640
2641         if(jidx<j_index_end)
2642         {
2643
2644             /* Get j neighbor index, and coordinate index */
2645             jnrlistA         = jjnr[jidx];
2646             jnrlistB         = jjnr[jidx+1];
2647             jnrlistC         = jjnr[jidx+2];
2648             jnrlistD         = jjnr[jidx+3];
2649             /* Sign of each element will be negative for non-real atoms.
2650              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2651              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2652              */
2653             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2654             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2655             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2656             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2657             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2658             j_coord_offsetA  = DIM*jnrA;
2659             j_coord_offsetB  = DIM*jnrB;
2660             j_coord_offsetC  = DIM*jnrC;
2661             j_coord_offsetD  = DIM*jnrD;
2662
2663             /* load j atom coordinates */
2664             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2665                                               x+j_coord_offsetC,x+j_coord_offsetD,
2666                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2667                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2668
2669             /* Calculate displacement vector */
2670             dx00             = _mm_sub_ps(ix0,jx0);
2671             dy00             = _mm_sub_ps(iy0,jy0);
2672             dz00             = _mm_sub_ps(iz0,jz0);
2673             dx11             = _mm_sub_ps(ix1,jx1);
2674             dy11             = _mm_sub_ps(iy1,jy1);
2675             dz11             = _mm_sub_ps(iz1,jz1);
2676             dx12             = _mm_sub_ps(ix1,jx2);
2677             dy12             = _mm_sub_ps(iy1,jy2);
2678             dz12             = _mm_sub_ps(iz1,jz2);
2679             dx13             = _mm_sub_ps(ix1,jx3);
2680             dy13             = _mm_sub_ps(iy1,jy3);
2681             dz13             = _mm_sub_ps(iz1,jz3);
2682             dx21             = _mm_sub_ps(ix2,jx1);
2683             dy21             = _mm_sub_ps(iy2,jy1);
2684             dz21             = _mm_sub_ps(iz2,jz1);
2685             dx22             = _mm_sub_ps(ix2,jx2);
2686             dy22             = _mm_sub_ps(iy2,jy2);
2687             dz22             = _mm_sub_ps(iz2,jz2);
2688             dx23             = _mm_sub_ps(ix2,jx3);
2689             dy23             = _mm_sub_ps(iy2,jy3);
2690             dz23             = _mm_sub_ps(iz2,jz3);
2691             dx31             = _mm_sub_ps(ix3,jx1);
2692             dy31             = _mm_sub_ps(iy3,jy1);
2693             dz31             = _mm_sub_ps(iz3,jz1);
2694             dx32             = _mm_sub_ps(ix3,jx2);
2695             dy32             = _mm_sub_ps(iy3,jy2);
2696             dz32             = _mm_sub_ps(iz3,jz2);
2697             dx33             = _mm_sub_ps(ix3,jx3);
2698             dy33             = _mm_sub_ps(iy3,jy3);
2699             dz33             = _mm_sub_ps(iz3,jz3);
2700
2701             /* Calculate squared distance and things based on it */
2702             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2703             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2704             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2705             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2706             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2707             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2708             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2709             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2710             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2711             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2712
2713             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2714             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2715             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2716             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2717             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2718             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2719             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2720             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2721             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2722             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2723
2724             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2725             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2726             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2727             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2728             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2729             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2730             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2731             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2732             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2733             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2734
2735             fjx0             = _mm_setzero_ps();
2736             fjy0             = _mm_setzero_ps();
2737             fjz0             = _mm_setzero_ps();
2738             fjx1             = _mm_setzero_ps();
2739             fjy1             = _mm_setzero_ps();
2740             fjz1             = _mm_setzero_ps();
2741             fjx2             = _mm_setzero_ps();
2742             fjy2             = _mm_setzero_ps();
2743             fjz2             = _mm_setzero_ps();
2744             fjx3             = _mm_setzero_ps();
2745             fjy3             = _mm_setzero_ps();
2746             fjz3             = _mm_setzero_ps();
2747
2748             /**************************
2749              * CALCULATE INTERACTIONS *
2750              **************************/
2751
2752             if (gmx_mm_any_lt(rsq00,rcutoff2))
2753             {
2754
2755             r00              = _mm_mul_ps(rsq00,rinv00);
2756             r00              = _mm_andnot_ps(dummy_mask,r00);
2757
2758             /* LENNARD-JONES DISPERSION/REPULSION */
2759
2760             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2761             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2762             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2763             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2764             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2765
2766             d                = _mm_sub_ps(r00,rswitch);
2767             d                = _mm_max_ps(d,_mm_setzero_ps());
2768             d2               = _mm_mul_ps(d,d);
2769             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2770
2771             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2772
2773             /* Evaluate switch function */
2774             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2775             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2776             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2777
2778             fscal            = fvdw;
2779
2780             fscal            = _mm_and_ps(fscal,cutoff_mask);
2781
2782             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2783
2784             /* Calculate temporary vectorial force */
2785             tx               = _mm_mul_ps(fscal,dx00);
2786             ty               = _mm_mul_ps(fscal,dy00);
2787             tz               = _mm_mul_ps(fscal,dz00);
2788
2789             /* Update vectorial force */
2790             fix0             = _mm_add_ps(fix0,tx);
2791             fiy0             = _mm_add_ps(fiy0,ty);
2792             fiz0             = _mm_add_ps(fiz0,tz);
2793
2794             fjx0             = _mm_add_ps(fjx0,tx);
2795             fjy0             = _mm_add_ps(fjy0,ty);
2796             fjz0             = _mm_add_ps(fjz0,tz);
2797
2798             }
2799
2800             /**************************
2801              * CALCULATE INTERACTIONS *
2802              **************************/
2803
2804             if (gmx_mm_any_lt(rsq11,rcutoff2))
2805             {
2806
2807             r11              = _mm_mul_ps(rsq11,rinv11);
2808             r11              = _mm_andnot_ps(dummy_mask,r11);
2809
2810             /* EWALD ELECTROSTATICS */
2811
2812             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2813             ewrt             = _mm_mul_ps(r11,ewtabscale);
2814             ewitab           = _mm_cvttps_epi32(ewrt);
2815             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2816             ewitab           = _mm_slli_epi32(ewitab,2);
2817             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2818             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2819             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2820             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2821             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2822             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2823             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2824             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2825             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2826
2827             d                = _mm_sub_ps(r11,rswitch);
2828             d                = _mm_max_ps(d,_mm_setzero_ps());
2829             d2               = _mm_mul_ps(d,d);
2830             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2831
2832             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2833
2834             /* Evaluate switch function */
2835             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2836             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2837             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2838
2839             fscal            = felec;
2840
2841             fscal            = _mm_and_ps(fscal,cutoff_mask);
2842
2843             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2844
2845             /* Calculate temporary vectorial force */
2846             tx               = _mm_mul_ps(fscal,dx11);
2847             ty               = _mm_mul_ps(fscal,dy11);
2848             tz               = _mm_mul_ps(fscal,dz11);
2849
2850             /* Update vectorial force */
2851             fix1             = _mm_add_ps(fix1,tx);
2852             fiy1             = _mm_add_ps(fiy1,ty);
2853             fiz1             = _mm_add_ps(fiz1,tz);
2854
2855             fjx1             = _mm_add_ps(fjx1,tx);
2856             fjy1             = _mm_add_ps(fjy1,ty);
2857             fjz1             = _mm_add_ps(fjz1,tz);
2858
2859             }
2860
2861             /**************************
2862              * CALCULATE INTERACTIONS *
2863              **************************/
2864
2865             if (gmx_mm_any_lt(rsq12,rcutoff2))
2866             {
2867
2868             r12              = _mm_mul_ps(rsq12,rinv12);
2869             r12              = _mm_andnot_ps(dummy_mask,r12);
2870
2871             /* EWALD ELECTROSTATICS */
2872
2873             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2874             ewrt             = _mm_mul_ps(r12,ewtabscale);
2875             ewitab           = _mm_cvttps_epi32(ewrt);
2876             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2877             ewitab           = _mm_slli_epi32(ewitab,2);
2878             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2879             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2880             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2881             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2882             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2883             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2884             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2885             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2886             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2887
2888             d                = _mm_sub_ps(r12,rswitch);
2889             d                = _mm_max_ps(d,_mm_setzero_ps());
2890             d2               = _mm_mul_ps(d,d);
2891             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2892
2893             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2894
2895             /* Evaluate switch function */
2896             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2897             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2898             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2899
2900             fscal            = felec;
2901
2902             fscal            = _mm_and_ps(fscal,cutoff_mask);
2903
2904             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2905
2906             /* Calculate temporary vectorial force */
2907             tx               = _mm_mul_ps(fscal,dx12);
2908             ty               = _mm_mul_ps(fscal,dy12);
2909             tz               = _mm_mul_ps(fscal,dz12);
2910
2911             /* Update vectorial force */
2912             fix1             = _mm_add_ps(fix1,tx);
2913             fiy1             = _mm_add_ps(fiy1,ty);
2914             fiz1             = _mm_add_ps(fiz1,tz);
2915
2916             fjx2             = _mm_add_ps(fjx2,tx);
2917             fjy2             = _mm_add_ps(fjy2,ty);
2918             fjz2             = _mm_add_ps(fjz2,tz);
2919
2920             }
2921
2922             /**************************
2923              * CALCULATE INTERACTIONS *
2924              **************************/
2925
2926             if (gmx_mm_any_lt(rsq13,rcutoff2))
2927             {
2928
2929             r13              = _mm_mul_ps(rsq13,rinv13);
2930             r13              = _mm_andnot_ps(dummy_mask,r13);
2931
2932             /* EWALD ELECTROSTATICS */
2933
2934             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2935             ewrt             = _mm_mul_ps(r13,ewtabscale);
2936             ewitab           = _mm_cvttps_epi32(ewrt);
2937             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2938             ewitab           = _mm_slli_epi32(ewitab,2);
2939             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2940             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2941             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2942             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2943             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2944             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2945             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2946             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2947             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2948
2949             d                = _mm_sub_ps(r13,rswitch);
2950             d                = _mm_max_ps(d,_mm_setzero_ps());
2951             d2               = _mm_mul_ps(d,d);
2952             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2953
2954             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2955
2956             /* Evaluate switch function */
2957             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2958             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2959             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2960
2961             fscal            = felec;
2962
2963             fscal            = _mm_and_ps(fscal,cutoff_mask);
2964
2965             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2966
2967             /* Calculate temporary vectorial force */
2968             tx               = _mm_mul_ps(fscal,dx13);
2969             ty               = _mm_mul_ps(fscal,dy13);
2970             tz               = _mm_mul_ps(fscal,dz13);
2971
2972             /* Update vectorial force */
2973             fix1             = _mm_add_ps(fix1,tx);
2974             fiy1             = _mm_add_ps(fiy1,ty);
2975             fiz1             = _mm_add_ps(fiz1,tz);
2976
2977             fjx3             = _mm_add_ps(fjx3,tx);
2978             fjy3             = _mm_add_ps(fjy3,ty);
2979             fjz3             = _mm_add_ps(fjz3,tz);
2980
2981             }
2982
2983             /**************************
2984              * CALCULATE INTERACTIONS *
2985              **************************/
2986
2987             if (gmx_mm_any_lt(rsq21,rcutoff2))
2988             {
2989
2990             r21              = _mm_mul_ps(rsq21,rinv21);
2991             r21              = _mm_andnot_ps(dummy_mask,r21);
2992
2993             /* EWALD ELECTROSTATICS */
2994
2995             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2996             ewrt             = _mm_mul_ps(r21,ewtabscale);
2997             ewitab           = _mm_cvttps_epi32(ewrt);
2998             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2999             ewitab           = _mm_slli_epi32(ewitab,2);
3000             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3001             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3002             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3003             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3004             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3005             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3006             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3007             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
3008             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
3009
3010             d                = _mm_sub_ps(r21,rswitch);
3011             d                = _mm_max_ps(d,_mm_setzero_ps());
3012             d2               = _mm_mul_ps(d,d);
3013             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3014
3015             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3016
3017             /* Evaluate switch function */
3018             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3019             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3020             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
3021
3022             fscal            = felec;
3023
3024             fscal            = _mm_and_ps(fscal,cutoff_mask);
3025
3026             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3027
3028             /* Calculate temporary vectorial force */
3029             tx               = _mm_mul_ps(fscal,dx21);
3030             ty               = _mm_mul_ps(fscal,dy21);
3031             tz               = _mm_mul_ps(fscal,dz21);
3032
3033             /* Update vectorial force */
3034             fix2             = _mm_add_ps(fix2,tx);
3035             fiy2             = _mm_add_ps(fiy2,ty);
3036             fiz2             = _mm_add_ps(fiz2,tz);
3037
3038             fjx1             = _mm_add_ps(fjx1,tx);
3039             fjy1             = _mm_add_ps(fjy1,ty);
3040             fjz1             = _mm_add_ps(fjz1,tz);
3041
3042             }
3043
3044             /**************************
3045              * CALCULATE INTERACTIONS *
3046              **************************/
3047
3048             if (gmx_mm_any_lt(rsq22,rcutoff2))
3049             {
3050
3051             r22              = _mm_mul_ps(rsq22,rinv22);
3052             r22              = _mm_andnot_ps(dummy_mask,r22);
3053
3054             /* EWALD ELECTROSTATICS */
3055
3056             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3057             ewrt             = _mm_mul_ps(r22,ewtabscale);
3058             ewitab           = _mm_cvttps_epi32(ewrt);
3059             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3060             ewitab           = _mm_slli_epi32(ewitab,2);
3061             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3062             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3063             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3064             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3065             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3066             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3067             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3068             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3069             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3070
3071             d                = _mm_sub_ps(r22,rswitch);
3072             d                = _mm_max_ps(d,_mm_setzero_ps());
3073             d2               = _mm_mul_ps(d,d);
3074             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3075
3076             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3077
3078             /* Evaluate switch function */
3079             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3080             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3081             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
3082
3083             fscal            = felec;
3084
3085             fscal            = _mm_and_ps(fscal,cutoff_mask);
3086
3087             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3088
3089             /* Calculate temporary vectorial force */
3090             tx               = _mm_mul_ps(fscal,dx22);
3091             ty               = _mm_mul_ps(fscal,dy22);
3092             tz               = _mm_mul_ps(fscal,dz22);
3093
3094             /* Update vectorial force */
3095             fix2             = _mm_add_ps(fix2,tx);
3096             fiy2             = _mm_add_ps(fiy2,ty);
3097             fiz2             = _mm_add_ps(fiz2,tz);
3098
3099             fjx2             = _mm_add_ps(fjx2,tx);
3100             fjy2             = _mm_add_ps(fjy2,ty);
3101             fjz2             = _mm_add_ps(fjz2,tz);
3102
3103             }
3104
3105             /**************************
3106              * CALCULATE INTERACTIONS *
3107              **************************/
3108
3109             if (gmx_mm_any_lt(rsq23,rcutoff2))
3110             {
3111
3112             r23              = _mm_mul_ps(rsq23,rinv23);
3113             r23              = _mm_andnot_ps(dummy_mask,r23);
3114
3115             /* EWALD ELECTROSTATICS */
3116
3117             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3118             ewrt             = _mm_mul_ps(r23,ewtabscale);
3119             ewitab           = _mm_cvttps_epi32(ewrt);
3120             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3121             ewitab           = _mm_slli_epi32(ewitab,2);
3122             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3123             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3124             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3125             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3126             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3127             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3128             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3129             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
3130             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
3131
3132             d                = _mm_sub_ps(r23,rswitch);
3133             d                = _mm_max_ps(d,_mm_setzero_ps());
3134             d2               = _mm_mul_ps(d,d);
3135             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3136
3137             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3138
3139             /* Evaluate switch function */
3140             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3141             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
3142             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
3143
3144             fscal            = felec;
3145
3146             fscal            = _mm_and_ps(fscal,cutoff_mask);
3147
3148             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3149
3150             /* Calculate temporary vectorial force */
3151             tx               = _mm_mul_ps(fscal,dx23);
3152             ty               = _mm_mul_ps(fscal,dy23);
3153             tz               = _mm_mul_ps(fscal,dz23);
3154
3155             /* Update vectorial force */
3156             fix2             = _mm_add_ps(fix2,tx);
3157             fiy2             = _mm_add_ps(fiy2,ty);
3158             fiz2             = _mm_add_ps(fiz2,tz);
3159
3160             fjx3             = _mm_add_ps(fjx3,tx);
3161             fjy3             = _mm_add_ps(fjy3,ty);
3162             fjz3             = _mm_add_ps(fjz3,tz);
3163
3164             }
3165
3166             /**************************
3167              * CALCULATE INTERACTIONS *
3168              **************************/
3169
3170             if (gmx_mm_any_lt(rsq31,rcutoff2))
3171             {
3172
3173             r31              = _mm_mul_ps(rsq31,rinv31);
3174             r31              = _mm_andnot_ps(dummy_mask,r31);
3175
3176             /* EWALD ELECTROSTATICS */
3177
3178             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3179             ewrt             = _mm_mul_ps(r31,ewtabscale);
3180             ewitab           = _mm_cvttps_epi32(ewrt);
3181             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3182             ewitab           = _mm_slli_epi32(ewitab,2);
3183             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3184             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3185             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3186             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3187             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3188             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3189             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3190             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
3191             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
3192
3193             d                = _mm_sub_ps(r31,rswitch);
3194             d                = _mm_max_ps(d,_mm_setzero_ps());
3195             d2               = _mm_mul_ps(d,d);
3196             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3197
3198             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3199
3200             /* Evaluate switch function */
3201             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3202             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
3203             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
3204
3205             fscal            = felec;
3206
3207             fscal            = _mm_and_ps(fscal,cutoff_mask);
3208
3209             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3210
3211             /* Calculate temporary vectorial force */
3212             tx               = _mm_mul_ps(fscal,dx31);
3213             ty               = _mm_mul_ps(fscal,dy31);
3214             tz               = _mm_mul_ps(fscal,dz31);
3215
3216             /* Update vectorial force */
3217             fix3             = _mm_add_ps(fix3,tx);
3218             fiy3             = _mm_add_ps(fiy3,ty);
3219             fiz3             = _mm_add_ps(fiz3,tz);
3220
3221             fjx1             = _mm_add_ps(fjx1,tx);
3222             fjy1             = _mm_add_ps(fjy1,ty);
3223             fjz1             = _mm_add_ps(fjz1,tz);
3224
3225             }
3226
3227             /**************************
3228              * CALCULATE INTERACTIONS *
3229              **************************/
3230
3231             if (gmx_mm_any_lt(rsq32,rcutoff2))
3232             {
3233
3234             r32              = _mm_mul_ps(rsq32,rinv32);
3235             r32              = _mm_andnot_ps(dummy_mask,r32);
3236
3237             /* EWALD ELECTROSTATICS */
3238
3239             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3240             ewrt             = _mm_mul_ps(r32,ewtabscale);
3241             ewitab           = _mm_cvttps_epi32(ewrt);
3242             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3243             ewitab           = _mm_slli_epi32(ewitab,2);
3244             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3245             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3246             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3247             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3248             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3249             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3250             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3251             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
3252             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
3253
3254             d                = _mm_sub_ps(r32,rswitch);
3255             d                = _mm_max_ps(d,_mm_setzero_ps());
3256             d2               = _mm_mul_ps(d,d);
3257             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3258
3259             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3260
3261             /* Evaluate switch function */
3262             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3263             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
3264             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
3265
3266             fscal            = felec;
3267
3268             fscal            = _mm_and_ps(fscal,cutoff_mask);
3269
3270             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3271
3272             /* Calculate temporary vectorial force */
3273             tx               = _mm_mul_ps(fscal,dx32);
3274             ty               = _mm_mul_ps(fscal,dy32);
3275             tz               = _mm_mul_ps(fscal,dz32);
3276
3277             /* Update vectorial force */
3278             fix3             = _mm_add_ps(fix3,tx);
3279             fiy3             = _mm_add_ps(fiy3,ty);
3280             fiz3             = _mm_add_ps(fiz3,tz);
3281
3282             fjx2             = _mm_add_ps(fjx2,tx);
3283             fjy2             = _mm_add_ps(fjy2,ty);
3284             fjz2             = _mm_add_ps(fjz2,tz);
3285
3286             }
3287
3288             /**************************
3289              * CALCULATE INTERACTIONS *
3290              **************************/
3291
3292             if (gmx_mm_any_lt(rsq33,rcutoff2))
3293             {
3294
3295             r33              = _mm_mul_ps(rsq33,rinv33);
3296             r33              = _mm_andnot_ps(dummy_mask,r33);
3297
3298             /* EWALD ELECTROSTATICS */
3299
3300             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3301             ewrt             = _mm_mul_ps(r33,ewtabscale);
3302             ewitab           = _mm_cvttps_epi32(ewrt);
3303             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3304             ewitab           = _mm_slli_epi32(ewitab,2);
3305             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3306             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3307             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3308             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3309             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3310             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3311             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3312             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
3313             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
3314
3315             d                = _mm_sub_ps(r33,rswitch);
3316             d                = _mm_max_ps(d,_mm_setzero_ps());
3317             d2               = _mm_mul_ps(d,d);
3318             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3319
3320             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3321
3322             /* Evaluate switch function */
3323             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3324             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
3325             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
3326
3327             fscal            = felec;
3328
3329             fscal            = _mm_and_ps(fscal,cutoff_mask);
3330
3331             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3332
3333             /* Calculate temporary vectorial force */
3334             tx               = _mm_mul_ps(fscal,dx33);
3335             ty               = _mm_mul_ps(fscal,dy33);
3336             tz               = _mm_mul_ps(fscal,dz33);
3337
3338             /* Update vectorial force */
3339             fix3             = _mm_add_ps(fix3,tx);
3340             fiy3             = _mm_add_ps(fiy3,ty);
3341             fiz3             = _mm_add_ps(fiz3,tz);
3342
3343             fjx3             = _mm_add_ps(fjx3,tx);
3344             fjy3             = _mm_add_ps(fjy3,ty);
3345             fjz3             = _mm_add_ps(fjz3,tz);
3346
3347             }
3348
3349             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3350             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3351             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3352             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3353
3354             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3355                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3356                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3357
3358             /* Inner loop uses 627 flops */
3359         }
3360
3361         /* End of innermost loop */
3362
3363         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3364                                               f+i_coord_offset,fshift+i_shift_offset);
3365
3366         /* Increment number of inner iterations */
3367         inneriter                  += j_index_end - j_index_start;
3368
3369         /* Outer loop uses 24 flops */
3370     }
3371
3372     /* Increment number of outer iterations */
3373     outeriter        += nri;
3374
3375     /* Update outer/inner flops */
3376
3377     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*627);
3378 }