02928c06985f5216144cfdcd99798555e89e14ad
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwioffset3;
90     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
110     real             *charge;
111     int              nvdwtype;
112     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113     int              *vdwtype;
114     real             *vdwparam;
115     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
116     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
117     __m128i          ewitab;
118     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
119     real             *ewtab;
120     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
121     real             rswitch_scalar,d_scalar;
122     __m128           dummy_mask,cutoff_mask;
123     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
124     __m128           one     = _mm_set1_ps(1.0);
125     __m128           two     = _mm_set1_ps(2.0);
126     x                = xx[0];
127     f                = ff[0];
128
129     nri              = nlist->nri;
130     iinr             = nlist->iinr;
131     jindex           = nlist->jindex;
132     jjnr             = nlist->jjnr;
133     shiftidx         = nlist->shift;
134     gid              = nlist->gid;
135     shiftvec         = fr->shift_vec[0];
136     fshift           = fr->fshift[0];
137     facel            = _mm_set1_ps(fr->epsfac);
138     charge           = mdatoms->chargeA;
139     nvdwtype         = fr->ntype;
140     vdwparam         = fr->nbfp;
141     vdwtype          = mdatoms->typeA;
142
143     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
144     ewtab            = fr->ic->tabq_coul_FDV0;
145     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
146     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
147
148     /* Setup water-specific parameters */
149     inr              = nlist->iinr[0];
150     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
151     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
152     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
153     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
154
155     jq1              = _mm_set1_ps(charge[inr+1]);
156     jq2              = _mm_set1_ps(charge[inr+2]);
157     jq3              = _mm_set1_ps(charge[inr+3]);
158     vdwjidx0A        = 2*vdwtype[inr+0];
159     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
160     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
161     qq11             = _mm_mul_ps(iq1,jq1);
162     qq12             = _mm_mul_ps(iq1,jq2);
163     qq13             = _mm_mul_ps(iq1,jq3);
164     qq21             = _mm_mul_ps(iq2,jq1);
165     qq22             = _mm_mul_ps(iq2,jq2);
166     qq23             = _mm_mul_ps(iq2,jq3);
167     qq31             = _mm_mul_ps(iq3,jq1);
168     qq32             = _mm_mul_ps(iq3,jq2);
169     qq33             = _mm_mul_ps(iq3,jq3);
170
171     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
172     rcutoff_scalar   = fr->rcoulomb;
173     rcutoff          = _mm_set1_ps(rcutoff_scalar);
174     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
175
176     rswitch_scalar   = fr->rcoulomb_switch;
177     rswitch          = _mm_set1_ps(rswitch_scalar);
178     /* Setup switch parameters */
179     d_scalar         = rcutoff_scalar-rswitch_scalar;
180     d                = _mm_set1_ps(d_scalar);
181     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
182     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
183     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
184     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
185     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
186     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
187
188     /* Avoid stupid compiler warnings */
189     jnrA = jnrB = jnrC = jnrD = 0;
190     j_coord_offsetA = 0;
191     j_coord_offsetB = 0;
192     j_coord_offsetC = 0;
193     j_coord_offsetD = 0;
194
195     outeriter        = 0;
196     inneriter        = 0;
197
198     for(iidx=0;iidx<4*DIM;iidx++)
199     {
200         scratch[iidx] = 0.0;
201     }
202
203     /* Start outer loop over neighborlists */
204     for(iidx=0; iidx<nri; iidx++)
205     {
206         /* Load shift vector for this list */
207         i_shift_offset   = DIM*shiftidx[iidx];
208
209         /* Load limits for loop over neighbors */
210         j_index_start    = jindex[iidx];
211         j_index_end      = jindex[iidx+1];
212
213         /* Get outer coordinate index */
214         inr              = iinr[iidx];
215         i_coord_offset   = DIM*inr;
216
217         /* Load i particle coords and add shift vector */
218         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
219                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
220
221         fix0             = _mm_setzero_ps();
222         fiy0             = _mm_setzero_ps();
223         fiz0             = _mm_setzero_ps();
224         fix1             = _mm_setzero_ps();
225         fiy1             = _mm_setzero_ps();
226         fiz1             = _mm_setzero_ps();
227         fix2             = _mm_setzero_ps();
228         fiy2             = _mm_setzero_ps();
229         fiz2             = _mm_setzero_ps();
230         fix3             = _mm_setzero_ps();
231         fiy3             = _mm_setzero_ps();
232         fiz3             = _mm_setzero_ps();
233
234         /* Reset potential sums */
235         velecsum         = _mm_setzero_ps();
236         vvdwsum          = _mm_setzero_ps();
237
238         /* Start inner kernel loop */
239         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
240         {
241
242             /* Get j neighbor index, and coordinate index */
243             jnrA             = jjnr[jidx];
244             jnrB             = jjnr[jidx+1];
245             jnrC             = jjnr[jidx+2];
246             jnrD             = jjnr[jidx+3];
247             j_coord_offsetA  = DIM*jnrA;
248             j_coord_offsetB  = DIM*jnrB;
249             j_coord_offsetC  = DIM*jnrC;
250             j_coord_offsetD  = DIM*jnrD;
251
252             /* load j atom coordinates */
253             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
254                                               x+j_coord_offsetC,x+j_coord_offsetD,
255                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
256                                               &jy2,&jz2,&jx3,&jy3,&jz3);
257
258             /* Calculate displacement vector */
259             dx00             = _mm_sub_ps(ix0,jx0);
260             dy00             = _mm_sub_ps(iy0,jy0);
261             dz00             = _mm_sub_ps(iz0,jz0);
262             dx11             = _mm_sub_ps(ix1,jx1);
263             dy11             = _mm_sub_ps(iy1,jy1);
264             dz11             = _mm_sub_ps(iz1,jz1);
265             dx12             = _mm_sub_ps(ix1,jx2);
266             dy12             = _mm_sub_ps(iy1,jy2);
267             dz12             = _mm_sub_ps(iz1,jz2);
268             dx13             = _mm_sub_ps(ix1,jx3);
269             dy13             = _mm_sub_ps(iy1,jy3);
270             dz13             = _mm_sub_ps(iz1,jz3);
271             dx21             = _mm_sub_ps(ix2,jx1);
272             dy21             = _mm_sub_ps(iy2,jy1);
273             dz21             = _mm_sub_ps(iz2,jz1);
274             dx22             = _mm_sub_ps(ix2,jx2);
275             dy22             = _mm_sub_ps(iy2,jy2);
276             dz22             = _mm_sub_ps(iz2,jz2);
277             dx23             = _mm_sub_ps(ix2,jx3);
278             dy23             = _mm_sub_ps(iy2,jy3);
279             dz23             = _mm_sub_ps(iz2,jz3);
280             dx31             = _mm_sub_ps(ix3,jx1);
281             dy31             = _mm_sub_ps(iy3,jy1);
282             dz31             = _mm_sub_ps(iz3,jz1);
283             dx32             = _mm_sub_ps(ix3,jx2);
284             dy32             = _mm_sub_ps(iy3,jy2);
285             dz32             = _mm_sub_ps(iz3,jz2);
286             dx33             = _mm_sub_ps(ix3,jx3);
287             dy33             = _mm_sub_ps(iy3,jy3);
288             dz33             = _mm_sub_ps(iz3,jz3);
289
290             /* Calculate squared distance and things based on it */
291             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
292             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
293             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
294             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
295             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
296             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
297             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
298             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
299             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
300             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
301
302             rinv00           = gmx_mm_invsqrt_ps(rsq00);
303             rinv11           = gmx_mm_invsqrt_ps(rsq11);
304             rinv12           = gmx_mm_invsqrt_ps(rsq12);
305             rinv13           = gmx_mm_invsqrt_ps(rsq13);
306             rinv21           = gmx_mm_invsqrt_ps(rsq21);
307             rinv22           = gmx_mm_invsqrt_ps(rsq22);
308             rinv23           = gmx_mm_invsqrt_ps(rsq23);
309             rinv31           = gmx_mm_invsqrt_ps(rsq31);
310             rinv32           = gmx_mm_invsqrt_ps(rsq32);
311             rinv33           = gmx_mm_invsqrt_ps(rsq33);
312
313             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
314             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
315             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
316             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
317             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
318             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
319             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
320             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
321             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
322             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
323
324             fjx0             = _mm_setzero_ps();
325             fjy0             = _mm_setzero_ps();
326             fjz0             = _mm_setzero_ps();
327             fjx1             = _mm_setzero_ps();
328             fjy1             = _mm_setzero_ps();
329             fjz1             = _mm_setzero_ps();
330             fjx2             = _mm_setzero_ps();
331             fjy2             = _mm_setzero_ps();
332             fjz2             = _mm_setzero_ps();
333             fjx3             = _mm_setzero_ps();
334             fjy3             = _mm_setzero_ps();
335             fjz3             = _mm_setzero_ps();
336
337             /**************************
338              * CALCULATE INTERACTIONS *
339              **************************/
340
341             if (gmx_mm_any_lt(rsq00,rcutoff2))
342             {
343
344             r00              = _mm_mul_ps(rsq00,rinv00);
345
346             /* LENNARD-JONES DISPERSION/REPULSION */
347
348             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
349             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
350             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
351             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
352             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
353
354             d                = _mm_sub_ps(r00,rswitch);
355             d                = _mm_max_ps(d,_mm_setzero_ps());
356             d2               = _mm_mul_ps(d,d);
357             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
358
359             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
360
361             /* Evaluate switch function */
362             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
363             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
364             vvdw             = _mm_mul_ps(vvdw,sw);
365             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
366
367             /* Update potential sum for this i atom from the interaction with this j atom. */
368             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
369             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
370
371             fscal            = fvdw;
372
373             fscal            = _mm_and_ps(fscal,cutoff_mask);
374
375             /* Calculate temporary vectorial force */
376             tx               = _mm_mul_ps(fscal,dx00);
377             ty               = _mm_mul_ps(fscal,dy00);
378             tz               = _mm_mul_ps(fscal,dz00);
379
380             /* Update vectorial force */
381             fix0             = _mm_add_ps(fix0,tx);
382             fiy0             = _mm_add_ps(fiy0,ty);
383             fiz0             = _mm_add_ps(fiz0,tz);
384
385             fjx0             = _mm_add_ps(fjx0,tx);
386             fjy0             = _mm_add_ps(fjy0,ty);
387             fjz0             = _mm_add_ps(fjz0,tz);
388
389             }
390
391             /**************************
392              * CALCULATE INTERACTIONS *
393              **************************/
394
395             if (gmx_mm_any_lt(rsq11,rcutoff2))
396             {
397
398             r11              = _mm_mul_ps(rsq11,rinv11);
399
400             /* EWALD ELECTROSTATICS */
401
402             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
403             ewrt             = _mm_mul_ps(r11,ewtabscale);
404             ewitab           = _mm_cvttps_epi32(ewrt);
405             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
406             ewitab           = _mm_slli_epi32(ewitab,2);
407             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
408             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
409             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
410             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
411             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
412             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
413             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
414             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
415             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
416
417             d                = _mm_sub_ps(r11,rswitch);
418             d                = _mm_max_ps(d,_mm_setzero_ps());
419             d2               = _mm_mul_ps(d,d);
420             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
421
422             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
423
424             /* Evaluate switch function */
425             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
426             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
427             velec            = _mm_mul_ps(velec,sw);
428             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
429
430             /* Update potential sum for this i atom from the interaction with this j atom. */
431             velec            = _mm_and_ps(velec,cutoff_mask);
432             velecsum         = _mm_add_ps(velecsum,velec);
433
434             fscal            = felec;
435
436             fscal            = _mm_and_ps(fscal,cutoff_mask);
437
438             /* Calculate temporary vectorial force */
439             tx               = _mm_mul_ps(fscal,dx11);
440             ty               = _mm_mul_ps(fscal,dy11);
441             tz               = _mm_mul_ps(fscal,dz11);
442
443             /* Update vectorial force */
444             fix1             = _mm_add_ps(fix1,tx);
445             fiy1             = _mm_add_ps(fiy1,ty);
446             fiz1             = _mm_add_ps(fiz1,tz);
447
448             fjx1             = _mm_add_ps(fjx1,tx);
449             fjy1             = _mm_add_ps(fjy1,ty);
450             fjz1             = _mm_add_ps(fjz1,tz);
451
452             }
453
454             /**************************
455              * CALCULATE INTERACTIONS *
456              **************************/
457
458             if (gmx_mm_any_lt(rsq12,rcutoff2))
459             {
460
461             r12              = _mm_mul_ps(rsq12,rinv12);
462
463             /* EWALD ELECTROSTATICS */
464
465             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
466             ewrt             = _mm_mul_ps(r12,ewtabscale);
467             ewitab           = _mm_cvttps_epi32(ewrt);
468             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
469             ewitab           = _mm_slli_epi32(ewitab,2);
470             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
471             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
472             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
473             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
474             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
475             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
476             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
477             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
478             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
479
480             d                = _mm_sub_ps(r12,rswitch);
481             d                = _mm_max_ps(d,_mm_setzero_ps());
482             d2               = _mm_mul_ps(d,d);
483             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
484
485             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
486
487             /* Evaluate switch function */
488             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
489             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
490             velec            = _mm_mul_ps(velec,sw);
491             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
492
493             /* Update potential sum for this i atom from the interaction with this j atom. */
494             velec            = _mm_and_ps(velec,cutoff_mask);
495             velecsum         = _mm_add_ps(velecsum,velec);
496
497             fscal            = felec;
498
499             fscal            = _mm_and_ps(fscal,cutoff_mask);
500
501             /* Calculate temporary vectorial force */
502             tx               = _mm_mul_ps(fscal,dx12);
503             ty               = _mm_mul_ps(fscal,dy12);
504             tz               = _mm_mul_ps(fscal,dz12);
505
506             /* Update vectorial force */
507             fix1             = _mm_add_ps(fix1,tx);
508             fiy1             = _mm_add_ps(fiy1,ty);
509             fiz1             = _mm_add_ps(fiz1,tz);
510
511             fjx2             = _mm_add_ps(fjx2,tx);
512             fjy2             = _mm_add_ps(fjy2,ty);
513             fjz2             = _mm_add_ps(fjz2,tz);
514
515             }
516
517             /**************************
518              * CALCULATE INTERACTIONS *
519              **************************/
520
521             if (gmx_mm_any_lt(rsq13,rcutoff2))
522             {
523
524             r13              = _mm_mul_ps(rsq13,rinv13);
525
526             /* EWALD ELECTROSTATICS */
527
528             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
529             ewrt             = _mm_mul_ps(r13,ewtabscale);
530             ewitab           = _mm_cvttps_epi32(ewrt);
531             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
532             ewitab           = _mm_slli_epi32(ewitab,2);
533             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
534             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
535             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
536             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
537             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
538             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
539             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
540             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
541             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
542
543             d                = _mm_sub_ps(r13,rswitch);
544             d                = _mm_max_ps(d,_mm_setzero_ps());
545             d2               = _mm_mul_ps(d,d);
546             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
547
548             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
549
550             /* Evaluate switch function */
551             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
552             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
553             velec            = _mm_mul_ps(velec,sw);
554             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
555
556             /* Update potential sum for this i atom from the interaction with this j atom. */
557             velec            = _mm_and_ps(velec,cutoff_mask);
558             velecsum         = _mm_add_ps(velecsum,velec);
559
560             fscal            = felec;
561
562             fscal            = _mm_and_ps(fscal,cutoff_mask);
563
564             /* Calculate temporary vectorial force */
565             tx               = _mm_mul_ps(fscal,dx13);
566             ty               = _mm_mul_ps(fscal,dy13);
567             tz               = _mm_mul_ps(fscal,dz13);
568
569             /* Update vectorial force */
570             fix1             = _mm_add_ps(fix1,tx);
571             fiy1             = _mm_add_ps(fiy1,ty);
572             fiz1             = _mm_add_ps(fiz1,tz);
573
574             fjx3             = _mm_add_ps(fjx3,tx);
575             fjy3             = _mm_add_ps(fjy3,ty);
576             fjz3             = _mm_add_ps(fjz3,tz);
577
578             }
579
580             /**************************
581              * CALCULATE INTERACTIONS *
582              **************************/
583
584             if (gmx_mm_any_lt(rsq21,rcutoff2))
585             {
586
587             r21              = _mm_mul_ps(rsq21,rinv21);
588
589             /* EWALD ELECTROSTATICS */
590
591             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
592             ewrt             = _mm_mul_ps(r21,ewtabscale);
593             ewitab           = _mm_cvttps_epi32(ewrt);
594             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
595             ewitab           = _mm_slli_epi32(ewitab,2);
596             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
597             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
598             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
599             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
600             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
601             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
602             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
603             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
604             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
605
606             d                = _mm_sub_ps(r21,rswitch);
607             d                = _mm_max_ps(d,_mm_setzero_ps());
608             d2               = _mm_mul_ps(d,d);
609             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
610
611             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
612
613             /* Evaluate switch function */
614             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
615             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
616             velec            = _mm_mul_ps(velec,sw);
617             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
618
619             /* Update potential sum for this i atom from the interaction with this j atom. */
620             velec            = _mm_and_ps(velec,cutoff_mask);
621             velecsum         = _mm_add_ps(velecsum,velec);
622
623             fscal            = felec;
624
625             fscal            = _mm_and_ps(fscal,cutoff_mask);
626
627             /* Calculate temporary vectorial force */
628             tx               = _mm_mul_ps(fscal,dx21);
629             ty               = _mm_mul_ps(fscal,dy21);
630             tz               = _mm_mul_ps(fscal,dz21);
631
632             /* Update vectorial force */
633             fix2             = _mm_add_ps(fix2,tx);
634             fiy2             = _mm_add_ps(fiy2,ty);
635             fiz2             = _mm_add_ps(fiz2,tz);
636
637             fjx1             = _mm_add_ps(fjx1,tx);
638             fjy1             = _mm_add_ps(fjy1,ty);
639             fjz1             = _mm_add_ps(fjz1,tz);
640
641             }
642
643             /**************************
644              * CALCULATE INTERACTIONS *
645              **************************/
646
647             if (gmx_mm_any_lt(rsq22,rcutoff2))
648             {
649
650             r22              = _mm_mul_ps(rsq22,rinv22);
651
652             /* EWALD ELECTROSTATICS */
653
654             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
655             ewrt             = _mm_mul_ps(r22,ewtabscale);
656             ewitab           = _mm_cvttps_epi32(ewrt);
657             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
658             ewitab           = _mm_slli_epi32(ewitab,2);
659             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
660             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
661             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
662             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
663             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
664             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
665             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
666             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
667             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
668
669             d                = _mm_sub_ps(r22,rswitch);
670             d                = _mm_max_ps(d,_mm_setzero_ps());
671             d2               = _mm_mul_ps(d,d);
672             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
673
674             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
675
676             /* Evaluate switch function */
677             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
678             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
679             velec            = _mm_mul_ps(velec,sw);
680             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
681
682             /* Update potential sum for this i atom from the interaction with this j atom. */
683             velec            = _mm_and_ps(velec,cutoff_mask);
684             velecsum         = _mm_add_ps(velecsum,velec);
685
686             fscal            = felec;
687
688             fscal            = _mm_and_ps(fscal,cutoff_mask);
689
690             /* Calculate temporary vectorial force */
691             tx               = _mm_mul_ps(fscal,dx22);
692             ty               = _mm_mul_ps(fscal,dy22);
693             tz               = _mm_mul_ps(fscal,dz22);
694
695             /* Update vectorial force */
696             fix2             = _mm_add_ps(fix2,tx);
697             fiy2             = _mm_add_ps(fiy2,ty);
698             fiz2             = _mm_add_ps(fiz2,tz);
699
700             fjx2             = _mm_add_ps(fjx2,tx);
701             fjy2             = _mm_add_ps(fjy2,ty);
702             fjz2             = _mm_add_ps(fjz2,tz);
703
704             }
705
706             /**************************
707              * CALCULATE INTERACTIONS *
708              **************************/
709
710             if (gmx_mm_any_lt(rsq23,rcutoff2))
711             {
712
713             r23              = _mm_mul_ps(rsq23,rinv23);
714
715             /* EWALD ELECTROSTATICS */
716
717             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
718             ewrt             = _mm_mul_ps(r23,ewtabscale);
719             ewitab           = _mm_cvttps_epi32(ewrt);
720             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
721             ewitab           = _mm_slli_epi32(ewitab,2);
722             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
723             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
724             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
725             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
726             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
727             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
728             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
729             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
730             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
731
732             d                = _mm_sub_ps(r23,rswitch);
733             d                = _mm_max_ps(d,_mm_setzero_ps());
734             d2               = _mm_mul_ps(d,d);
735             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
736
737             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
738
739             /* Evaluate switch function */
740             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
741             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
742             velec            = _mm_mul_ps(velec,sw);
743             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
744
745             /* Update potential sum for this i atom from the interaction with this j atom. */
746             velec            = _mm_and_ps(velec,cutoff_mask);
747             velecsum         = _mm_add_ps(velecsum,velec);
748
749             fscal            = felec;
750
751             fscal            = _mm_and_ps(fscal,cutoff_mask);
752
753             /* Calculate temporary vectorial force */
754             tx               = _mm_mul_ps(fscal,dx23);
755             ty               = _mm_mul_ps(fscal,dy23);
756             tz               = _mm_mul_ps(fscal,dz23);
757
758             /* Update vectorial force */
759             fix2             = _mm_add_ps(fix2,tx);
760             fiy2             = _mm_add_ps(fiy2,ty);
761             fiz2             = _mm_add_ps(fiz2,tz);
762
763             fjx3             = _mm_add_ps(fjx3,tx);
764             fjy3             = _mm_add_ps(fjy3,ty);
765             fjz3             = _mm_add_ps(fjz3,tz);
766
767             }
768
769             /**************************
770              * CALCULATE INTERACTIONS *
771              **************************/
772
773             if (gmx_mm_any_lt(rsq31,rcutoff2))
774             {
775
776             r31              = _mm_mul_ps(rsq31,rinv31);
777
778             /* EWALD ELECTROSTATICS */
779
780             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
781             ewrt             = _mm_mul_ps(r31,ewtabscale);
782             ewitab           = _mm_cvttps_epi32(ewrt);
783             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
784             ewitab           = _mm_slli_epi32(ewitab,2);
785             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
786             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
787             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
788             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
789             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
790             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
791             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
792             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
793             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
794
795             d                = _mm_sub_ps(r31,rswitch);
796             d                = _mm_max_ps(d,_mm_setzero_ps());
797             d2               = _mm_mul_ps(d,d);
798             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
799
800             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
801
802             /* Evaluate switch function */
803             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
804             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
805             velec            = _mm_mul_ps(velec,sw);
806             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
807
808             /* Update potential sum for this i atom from the interaction with this j atom. */
809             velec            = _mm_and_ps(velec,cutoff_mask);
810             velecsum         = _mm_add_ps(velecsum,velec);
811
812             fscal            = felec;
813
814             fscal            = _mm_and_ps(fscal,cutoff_mask);
815
816             /* Calculate temporary vectorial force */
817             tx               = _mm_mul_ps(fscal,dx31);
818             ty               = _mm_mul_ps(fscal,dy31);
819             tz               = _mm_mul_ps(fscal,dz31);
820
821             /* Update vectorial force */
822             fix3             = _mm_add_ps(fix3,tx);
823             fiy3             = _mm_add_ps(fiy3,ty);
824             fiz3             = _mm_add_ps(fiz3,tz);
825
826             fjx1             = _mm_add_ps(fjx1,tx);
827             fjy1             = _mm_add_ps(fjy1,ty);
828             fjz1             = _mm_add_ps(fjz1,tz);
829
830             }
831
832             /**************************
833              * CALCULATE INTERACTIONS *
834              **************************/
835
836             if (gmx_mm_any_lt(rsq32,rcutoff2))
837             {
838
839             r32              = _mm_mul_ps(rsq32,rinv32);
840
841             /* EWALD ELECTROSTATICS */
842
843             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
844             ewrt             = _mm_mul_ps(r32,ewtabscale);
845             ewitab           = _mm_cvttps_epi32(ewrt);
846             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
847             ewitab           = _mm_slli_epi32(ewitab,2);
848             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
849             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
850             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
851             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
852             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
853             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
854             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
855             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
856             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
857
858             d                = _mm_sub_ps(r32,rswitch);
859             d                = _mm_max_ps(d,_mm_setzero_ps());
860             d2               = _mm_mul_ps(d,d);
861             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
862
863             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
864
865             /* Evaluate switch function */
866             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
867             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
868             velec            = _mm_mul_ps(velec,sw);
869             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
870
871             /* Update potential sum for this i atom from the interaction with this j atom. */
872             velec            = _mm_and_ps(velec,cutoff_mask);
873             velecsum         = _mm_add_ps(velecsum,velec);
874
875             fscal            = felec;
876
877             fscal            = _mm_and_ps(fscal,cutoff_mask);
878
879             /* Calculate temporary vectorial force */
880             tx               = _mm_mul_ps(fscal,dx32);
881             ty               = _mm_mul_ps(fscal,dy32);
882             tz               = _mm_mul_ps(fscal,dz32);
883
884             /* Update vectorial force */
885             fix3             = _mm_add_ps(fix3,tx);
886             fiy3             = _mm_add_ps(fiy3,ty);
887             fiz3             = _mm_add_ps(fiz3,tz);
888
889             fjx2             = _mm_add_ps(fjx2,tx);
890             fjy2             = _mm_add_ps(fjy2,ty);
891             fjz2             = _mm_add_ps(fjz2,tz);
892
893             }
894
895             /**************************
896              * CALCULATE INTERACTIONS *
897              **************************/
898
899             if (gmx_mm_any_lt(rsq33,rcutoff2))
900             {
901
902             r33              = _mm_mul_ps(rsq33,rinv33);
903
904             /* EWALD ELECTROSTATICS */
905
906             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
907             ewrt             = _mm_mul_ps(r33,ewtabscale);
908             ewitab           = _mm_cvttps_epi32(ewrt);
909             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
910             ewitab           = _mm_slli_epi32(ewitab,2);
911             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
912             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
913             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
914             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
915             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
916             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
917             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
918             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
919             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
920
921             d                = _mm_sub_ps(r33,rswitch);
922             d                = _mm_max_ps(d,_mm_setzero_ps());
923             d2               = _mm_mul_ps(d,d);
924             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
925
926             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
927
928             /* Evaluate switch function */
929             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
930             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
931             velec            = _mm_mul_ps(velec,sw);
932             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
933
934             /* Update potential sum for this i atom from the interaction with this j atom. */
935             velec            = _mm_and_ps(velec,cutoff_mask);
936             velecsum         = _mm_add_ps(velecsum,velec);
937
938             fscal            = felec;
939
940             fscal            = _mm_and_ps(fscal,cutoff_mask);
941
942             /* Calculate temporary vectorial force */
943             tx               = _mm_mul_ps(fscal,dx33);
944             ty               = _mm_mul_ps(fscal,dy33);
945             tz               = _mm_mul_ps(fscal,dz33);
946
947             /* Update vectorial force */
948             fix3             = _mm_add_ps(fix3,tx);
949             fiy3             = _mm_add_ps(fiy3,ty);
950             fiz3             = _mm_add_ps(fiz3,tz);
951
952             fjx3             = _mm_add_ps(fjx3,tx);
953             fjy3             = _mm_add_ps(fjy3,ty);
954             fjz3             = _mm_add_ps(fjz3,tz);
955
956             }
957
958             fjptrA             = f+j_coord_offsetA;
959             fjptrB             = f+j_coord_offsetB;
960             fjptrC             = f+j_coord_offsetC;
961             fjptrD             = f+j_coord_offsetD;
962
963             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
964                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
965                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
966
967             /* Inner loop uses 647 flops */
968         }
969
970         if(jidx<j_index_end)
971         {
972
973             /* Get j neighbor index, and coordinate index */
974             jnrlistA         = jjnr[jidx];
975             jnrlistB         = jjnr[jidx+1];
976             jnrlistC         = jjnr[jidx+2];
977             jnrlistD         = jjnr[jidx+3];
978             /* Sign of each element will be negative for non-real atoms.
979              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
980              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
981              */
982             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
983             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
984             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
985             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
986             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
987             j_coord_offsetA  = DIM*jnrA;
988             j_coord_offsetB  = DIM*jnrB;
989             j_coord_offsetC  = DIM*jnrC;
990             j_coord_offsetD  = DIM*jnrD;
991
992             /* load j atom coordinates */
993             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
994                                               x+j_coord_offsetC,x+j_coord_offsetD,
995                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
996                                               &jy2,&jz2,&jx3,&jy3,&jz3);
997
998             /* Calculate displacement vector */
999             dx00             = _mm_sub_ps(ix0,jx0);
1000             dy00             = _mm_sub_ps(iy0,jy0);
1001             dz00             = _mm_sub_ps(iz0,jz0);
1002             dx11             = _mm_sub_ps(ix1,jx1);
1003             dy11             = _mm_sub_ps(iy1,jy1);
1004             dz11             = _mm_sub_ps(iz1,jz1);
1005             dx12             = _mm_sub_ps(ix1,jx2);
1006             dy12             = _mm_sub_ps(iy1,jy2);
1007             dz12             = _mm_sub_ps(iz1,jz2);
1008             dx13             = _mm_sub_ps(ix1,jx3);
1009             dy13             = _mm_sub_ps(iy1,jy3);
1010             dz13             = _mm_sub_ps(iz1,jz3);
1011             dx21             = _mm_sub_ps(ix2,jx1);
1012             dy21             = _mm_sub_ps(iy2,jy1);
1013             dz21             = _mm_sub_ps(iz2,jz1);
1014             dx22             = _mm_sub_ps(ix2,jx2);
1015             dy22             = _mm_sub_ps(iy2,jy2);
1016             dz22             = _mm_sub_ps(iz2,jz2);
1017             dx23             = _mm_sub_ps(ix2,jx3);
1018             dy23             = _mm_sub_ps(iy2,jy3);
1019             dz23             = _mm_sub_ps(iz2,jz3);
1020             dx31             = _mm_sub_ps(ix3,jx1);
1021             dy31             = _mm_sub_ps(iy3,jy1);
1022             dz31             = _mm_sub_ps(iz3,jz1);
1023             dx32             = _mm_sub_ps(ix3,jx2);
1024             dy32             = _mm_sub_ps(iy3,jy2);
1025             dz32             = _mm_sub_ps(iz3,jz2);
1026             dx33             = _mm_sub_ps(ix3,jx3);
1027             dy33             = _mm_sub_ps(iy3,jy3);
1028             dz33             = _mm_sub_ps(iz3,jz3);
1029
1030             /* Calculate squared distance and things based on it */
1031             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1032             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1033             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1034             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1035             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1036             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1037             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1038             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1039             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1040             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1041
1042             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1043             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1044             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1045             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1046             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1047             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1048             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1049             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1050             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1051             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1052
1053             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1054             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1055             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1056             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1057             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1058             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1059             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1060             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1061             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1062             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1063
1064             fjx0             = _mm_setzero_ps();
1065             fjy0             = _mm_setzero_ps();
1066             fjz0             = _mm_setzero_ps();
1067             fjx1             = _mm_setzero_ps();
1068             fjy1             = _mm_setzero_ps();
1069             fjz1             = _mm_setzero_ps();
1070             fjx2             = _mm_setzero_ps();
1071             fjy2             = _mm_setzero_ps();
1072             fjz2             = _mm_setzero_ps();
1073             fjx3             = _mm_setzero_ps();
1074             fjy3             = _mm_setzero_ps();
1075             fjz3             = _mm_setzero_ps();
1076
1077             /**************************
1078              * CALCULATE INTERACTIONS *
1079              **************************/
1080
1081             if (gmx_mm_any_lt(rsq00,rcutoff2))
1082             {
1083
1084             r00              = _mm_mul_ps(rsq00,rinv00);
1085             r00              = _mm_andnot_ps(dummy_mask,r00);
1086
1087             /* LENNARD-JONES DISPERSION/REPULSION */
1088
1089             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1090             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1091             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1092             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1093             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1094
1095             d                = _mm_sub_ps(r00,rswitch);
1096             d                = _mm_max_ps(d,_mm_setzero_ps());
1097             d2               = _mm_mul_ps(d,d);
1098             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1099
1100             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1101
1102             /* Evaluate switch function */
1103             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1104             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1105             vvdw             = _mm_mul_ps(vvdw,sw);
1106             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1107
1108             /* Update potential sum for this i atom from the interaction with this j atom. */
1109             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
1110             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
1111             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
1112
1113             fscal            = fvdw;
1114
1115             fscal            = _mm_and_ps(fscal,cutoff_mask);
1116
1117             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1118
1119             /* Calculate temporary vectorial force */
1120             tx               = _mm_mul_ps(fscal,dx00);
1121             ty               = _mm_mul_ps(fscal,dy00);
1122             tz               = _mm_mul_ps(fscal,dz00);
1123
1124             /* Update vectorial force */
1125             fix0             = _mm_add_ps(fix0,tx);
1126             fiy0             = _mm_add_ps(fiy0,ty);
1127             fiz0             = _mm_add_ps(fiz0,tz);
1128
1129             fjx0             = _mm_add_ps(fjx0,tx);
1130             fjy0             = _mm_add_ps(fjy0,ty);
1131             fjz0             = _mm_add_ps(fjz0,tz);
1132
1133             }
1134
1135             /**************************
1136              * CALCULATE INTERACTIONS *
1137              **************************/
1138
1139             if (gmx_mm_any_lt(rsq11,rcutoff2))
1140             {
1141
1142             r11              = _mm_mul_ps(rsq11,rinv11);
1143             r11              = _mm_andnot_ps(dummy_mask,r11);
1144
1145             /* EWALD ELECTROSTATICS */
1146
1147             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1148             ewrt             = _mm_mul_ps(r11,ewtabscale);
1149             ewitab           = _mm_cvttps_epi32(ewrt);
1150             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1151             ewitab           = _mm_slli_epi32(ewitab,2);
1152             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1153             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1154             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1155             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1156             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1157             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1158             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1159             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1160             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1161
1162             d                = _mm_sub_ps(r11,rswitch);
1163             d                = _mm_max_ps(d,_mm_setzero_ps());
1164             d2               = _mm_mul_ps(d,d);
1165             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1166
1167             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1168
1169             /* Evaluate switch function */
1170             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1171             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1172             velec            = _mm_mul_ps(velec,sw);
1173             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1174
1175             /* Update potential sum for this i atom from the interaction with this j atom. */
1176             velec            = _mm_and_ps(velec,cutoff_mask);
1177             velec            = _mm_andnot_ps(dummy_mask,velec);
1178             velecsum         = _mm_add_ps(velecsum,velec);
1179
1180             fscal            = felec;
1181
1182             fscal            = _mm_and_ps(fscal,cutoff_mask);
1183
1184             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1185
1186             /* Calculate temporary vectorial force */
1187             tx               = _mm_mul_ps(fscal,dx11);
1188             ty               = _mm_mul_ps(fscal,dy11);
1189             tz               = _mm_mul_ps(fscal,dz11);
1190
1191             /* Update vectorial force */
1192             fix1             = _mm_add_ps(fix1,tx);
1193             fiy1             = _mm_add_ps(fiy1,ty);
1194             fiz1             = _mm_add_ps(fiz1,tz);
1195
1196             fjx1             = _mm_add_ps(fjx1,tx);
1197             fjy1             = _mm_add_ps(fjy1,ty);
1198             fjz1             = _mm_add_ps(fjz1,tz);
1199
1200             }
1201
1202             /**************************
1203              * CALCULATE INTERACTIONS *
1204              **************************/
1205
1206             if (gmx_mm_any_lt(rsq12,rcutoff2))
1207             {
1208
1209             r12              = _mm_mul_ps(rsq12,rinv12);
1210             r12              = _mm_andnot_ps(dummy_mask,r12);
1211
1212             /* EWALD ELECTROSTATICS */
1213
1214             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1215             ewrt             = _mm_mul_ps(r12,ewtabscale);
1216             ewitab           = _mm_cvttps_epi32(ewrt);
1217             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1218             ewitab           = _mm_slli_epi32(ewitab,2);
1219             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1220             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1221             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1222             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1223             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1224             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1225             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1226             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1227             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1228
1229             d                = _mm_sub_ps(r12,rswitch);
1230             d                = _mm_max_ps(d,_mm_setzero_ps());
1231             d2               = _mm_mul_ps(d,d);
1232             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1233
1234             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1235
1236             /* Evaluate switch function */
1237             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1238             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1239             velec            = _mm_mul_ps(velec,sw);
1240             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1241
1242             /* Update potential sum for this i atom from the interaction with this j atom. */
1243             velec            = _mm_and_ps(velec,cutoff_mask);
1244             velec            = _mm_andnot_ps(dummy_mask,velec);
1245             velecsum         = _mm_add_ps(velecsum,velec);
1246
1247             fscal            = felec;
1248
1249             fscal            = _mm_and_ps(fscal,cutoff_mask);
1250
1251             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1252
1253             /* Calculate temporary vectorial force */
1254             tx               = _mm_mul_ps(fscal,dx12);
1255             ty               = _mm_mul_ps(fscal,dy12);
1256             tz               = _mm_mul_ps(fscal,dz12);
1257
1258             /* Update vectorial force */
1259             fix1             = _mm_add_ps(fix1,tx);
1260             fiy1             = _mm_add_ps(fiy1,ty);
1261             fiz1             = _mm_add_ps(fiz1,tz);
1262
1263             fjx2             = _mm_add_ps(fjx2,tx);
1264             fjy2             = _mm_add_ps(fjy2,ty);
1265             fjz2             = _mm_add_ps(fjz2,tz);
1266
1267             }
1268
1269             /**************************
1270              * CALCULATE INTERACTIONS *
1271              **************************/
1272
1273             if (gmx_mm_any_lt(rsq13,rcutoff2))
1274             {
1275
1276             r13              = _mm_mul_ps(rsq13,rinv13);
1277             r13              = _mm_andnot_ps(dummy_mask,r13);
1278
1279             /* EWALD ELECTROSTATICS */
1280
1281             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1282             ewrt             = _mm_mul_ps(r13,ewtabscale);
1283             ewitab           = _mm_cvttps_epi32(ewrt);
1284             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1285             ewitab           = _mm_slli_epi32(ewitab,2);
1286             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1287             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1288             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1289             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1290             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1291             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1292             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1293             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1294             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1295
1296             d                = _mm_sub_ps(r13,rswitch);
1297             d                = _mm_max_ps(d,_mm_setzero_ps());
1298             d2               = _mm_mul_ps(d,d);
1299             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1300
1301             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1302
1303             /* Evaluate switch function */
1304             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1305             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1306             velec            = _mm_mul_ps(velec,sw);
1307             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1308
1309             /* Update potential sum for this i atom from the interaction with this j atom. */
1310             velec            = _mm_and_ps(velec,cutoff_mask);
1311             velec            = _mm_andnot_ps(dummy_mask,velec);
1312             velecsum         = _mm_add_ps(velecsum,velec);
1313
1314             fscal            = felec;
1315
1316             fscal            = _mm_and_ps(fscal,cutoff_mask);
1317
1318             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1319
1320             /* Calculate temporary vectorial force */
1321             tx               = _mm_mul_ps(fscal,dx13);
1322             ty               = _mm_mul_ps(fscal,dy13);
1323             tz               = _mm_mul_ps(fscal,dz13);
1324
1325             /* Update vectorial force */
1326             fix1             = _mm_add_ps(fix1,tx);
1327             fiy1             = _mm_add_ps(fiy1,ty);
1328             fiz1             = _mm_add_ps(fiz1,tz);
1329
1330             fjx3             = _mm_add_ps(fjx3,tx);
1331             fjy3             = _mm_add_ps(fjy3,ty);
1332             fjz3             = _mm_add_ps(fjz3,tz);
1333
1334             }
1335
1336             /**************************
1337              * CALCULATE INTERACTIONS *
1338              **************************/
1339
1340             if (gmx_mm_any_lt(rsq21,rcutoff2))
1341             {
1342
1343             r21              = _mm_mul_ps(rsq21,rinv21);
1344             r21              = _mm_andnot_ps(dummy_mask,r21);
1345
1346             /* EWALD ELECTROSTATICS */
1347
1348             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1349             ewrt             = _mm_mul_ps(r21,ewtabscale);
1350             ewitab           = _mm_cvttps_epi32(ewrt);
1351             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1352             ewitab           = _mm_slli_epi32(ewitab,2);
1353             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1354             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1355             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1356             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1357             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1358             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1359             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1360             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1361             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1362
1363             d                = _mm_sub_ps(r21,rswitch);
1364             d                = _mm_max_ps(d,_mm_setzero_ps());
1365             d2               = _mm_mul_ps(d,d);
1366             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1367
1368             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1369
1370             /* Evaluate switch function */
1371             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1372             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1373             velec            = _mm_mul_ps(velec,sw);
1374             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1375
1376             /* Update potential sum for this i atom from the interaction with this j atom. */
1377             velec            = _mm_and_ps(velec,cutoff_mask);
1378             velec            = _mm_andnot_ps(dummy_mask,velec);
1379             velecsum         = _mm_add_ps(velecsum,velec);
1380
1381             fscal            = felec;
1382
1383             fscal            = _mm_and_ps(fscal,cutoff_mask);
1384
1385             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1386
1387             /* Calculate temporary vectorial force */
1388             tx               = _mm_mul_ps(fscal,dx21);
1389             ty               = _mm_mul_ps(fscal,dy21);
1390             tz               = _mm_mul_ps(fscal,dz21);
1391
1392             /* Update vectorial force */
1393             fix2             = _mm_add_ps(fix2,tx);
1394             fiy2             = _mm_add_ps(fiy2,ty);
1395             fiz2             = _mm_add_ps(fiz2,tz);
1396
1397             fjx1             = _mm_add_ps(fjx1,tx);
1398             fjy1             = _mm_add_ps(fjy1,ty);
1399             fjz1             = _mm_add_ps(fjz1,tz);
1400
1401             }
1402
1403             /**************************
1404              * CALCULATE INTERACTIONS *
1405              **************************/
1406
1407             if (gmx_mm_any_lt(rsq22,rcutoff2))
1408             {
1409
1410             r22              = _mm_mul_ps(rsq22,rinv22);
1411             r22              = _mm_andnot_ps(dummy_mask,r22);
1412
1413             /* EWALD ELECTROSTATICS */
1414
1415             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1416             ewrt             = _mm_mul_ps(r22,ewtabscale);
1417             ewitab           = _mm_cvttps_epi32(ewrt);
1418             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1419             ewitab           = _mm_slli_epi32(ewitab,2);
1420             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1421             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1422             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1423             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1424             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1425             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1426             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1427             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1428             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1429
1430             d                = _mm_sub_ps(r22,rswitch);
1431             d                = _mm_max_ps(d,_mm_setzero_ps());
1432             d2               = _mm_mul_ps(d,d);
1433             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1434
1435             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1436
1437             /* Evaluate switch function */
1438             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1439             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1440             velec            = _mm_mul_ps(velec,sw);
1441             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1442
1443             /* Update potential sum for this i atom from the interaction with this j atom. */
1444             velec            = _mm_and_ps(velec,cutoff_mask);
1445             velec            = _mm_andnot_ps(dummy_mask,velec);
1446             velecsum         = _mm_add_ps(velecsum,velec);
1447
1448             fscal            = felec;
1449
1450             fscal            = _mm_and_ps(fscal,cutoff_mask);
1451
1452             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1453
1454             /* Calculate temporary vectorial force */
1455             tx               = _mm_mul_ps(fscal,dx22);
1456             ty               = _mm_mul_ps(fscal,dy22);
1457             tz               = _mm_mul_ps(fscal,dz22);
1458
1459             /* Update vectorial force */
1460             fix2             = _mm_add_ps(fix2,tx);
1461             fiy2             = _mm_add_ps(fiy2,ty);
1462             fiz2             = _mm_add_ps(fiz2,tz);
1463
1464             fjx2             = _mm_add_ps(fjx2,tx);
1465             fjy2             = _mm_add_ps(fjy2,ty);
1466             fjz2             = _mm_add_ps(fjz2,tz);
1467
1468             }
1469
1470             /**************************
1471              * CALCULATE INTERACTIONS *
1472              **************************/
1473
1474             if (gmx_mm_any_lt(rsq23,rcutoff2))
1475             {
1476
1477             r23              = _mm_mul_ps(rsq23,rinv23);
1478             r23              = _mm_andnot_ps(dummy_mask,r23);
1479
1480             /* EWALD ELECTROSTATICS */
1481
1482             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1483             ewrt             = _mm_mul_ps(r23,ewtabscale);
1484             ewitab           = _mm_cvttps_epi32(ewrt);
1485             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1486             ewitab           = _mm_slli_epi32(ewitab,2);
1487             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1488             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1489             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1490             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1491             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1492             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1493             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1494             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1495             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1496
1497             d                = _mm_sub_ps(r23,rswitch);
1498             d                = _mm_max_ps(d,_mm_setzero_ps());
1499             d2               = _mm_mul_ps(d,d);
1500             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1501
1502             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1503
1504             /* Evaluate switch function */
1505             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1506             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1507             velec            = _mm_mul_ps(velec,sw);
1508             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1509
1510             /* Update potential sum for this i atom from the interaction with this j atom. */
1511             velec            = _mm_and_ps(velec,cutoff_mask);
1512             velec            = _mm_andnot_ps(dummy_mask,velec);
1513             velecsum         = _mm_add_ps(velecsum,velec);
1514
1515             fscal            = felec;
1516
1517             fscal            = _mm_and_ps(fscal,cutoff_mask);
1518
1519             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1520
1521             /* Calculate temporary vectorial force */
1522             tx               = _mm_mul_ps(fscal,dx23);
1523             ty               = _mm_mul_ps(fscal,dy23);
1524             tz               = _mm_mul_ps(fscal,dz23);
1525
1526             /* Update vectorial force */
1527             fix2             = _mm_add_ps(fix2,tx);
1528             fiy2             = _mm_add_ps(fiy2,ty);
1529             fiz2             = _mm_add_ps(fiz2,tz);
1530
1531             fjx3             = _mm_add_ps(fjx3,tx);
1532             fjy3             = _mm_add_ps(fjy3,ty);
1533             fjz3             = _mm_add_ps(fjz3,tz);
1534
1535             }
1536
1537             /**************************
1538              * CALCULATE INTERACTIONS *
1539              **************************/
1540
1541             if (gmx_mm_any_lt(rsq31,rcutoff2))
1542             {
1543
1544             r31              = _mm_mul_ps(rsq31,rinv31);
1545             r31              = _mm_andnot_ps(dummy_mask,r31);
1546
1547             /* EWALD ELECTROSTATICS */
1548
1549             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1550             ewrt             = _mm_mul_ps(r31,ewtabscale);
1551             ewitab           = _mm_cvttps_epi32(ewrt);
1552             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1553             ewitab           = _mm_slli_epi32(ewitab,2);
1554             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1555             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1556             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1557             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1558             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1559             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1560             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1561             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1562             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1563
1564             d                = _mm_sub_ps(r31,rswitch);
1565             d                = _mm_max_ps(d,_mm_setzero_ps());
1566             d2               = _mm_mul_ps(d,d);
1567             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1568
1569             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1570
1571             /* Evaluate switch function */
1572             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1573             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1574             velec            = _mm_mul_ps(velec,sw);
1575             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1576
1577             /* Update potential sum for this i atom from the interaction with this j atom. */
1578             velec            = _mm_and_ps(velec,cutoff_mask);
1579             velec            = _mm_andnot_ps(dummy_mask,velec);
1580             velecsum         = _mm_add_ps(velecsum,velec);
1581
1582             fscal            = felec;
1583
1584             fscal            = _mm_and_ps(fscal,cutoff_mask);
1585
1586             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1587
1588             /* Calculate temporary vectorial force */
1589             tx               = _mm_mul_ps(fscal,dx31);
1590             ty               = _mm_mul_ps(fscal,dy31);
1591             tz               = _mm_mul_ps(fscal,dz31);
1592
1593             /* Update vectorial force */
1594             fix3             = _mm_add_ps(fix3,tx);
1595             fiy3             = _mm_add_ps(fiy3,ty);
1596             fiz3             = _mm_add_ps(fiz3,tz);
1597
1598             fjx1             = _mm_add_ps(fjx1,tx);
1599             fjy1             = _mm_add_ps(fjy1,ty);
1600             fjz1             = _mm_add_ps(fjz1,tz);
1601
1602             }
1603
1604             /**************************
1605              * CALCULATE INTERACTIONS *
1606              **************************/
1607
1608             if (gmx_mm_any_lt(rsq32,rcutoff2))
1609             {
1610
1611             r32              = _mm_mul_ps(rsq32,rinv32);
1612             r32              = _mm_andnot_ps(dummy_mask,r32);
1613
1614             /* EWALD ELECTROSTATICS */
1615
1616             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1617             ewrt             = _mm_mul_ps(r32,ewtabscale);
1618             ewitab           = _mm_cvttps_epi32(ewrt);
1619             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1620             ewitab           = _mm_slli_epi32(ewitab,2);
1621             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1622             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1623             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1624             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1625             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1626             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1627             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1628             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1629             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1630
1631             d                = _mm_sub_ps(r32,rswitch);
1632             d                = _mm_max_ps(d,_mm_setzero_ps());
1633             d2               = _mm_mul_ps(d,d);
1634             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1635
1636             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1637
1638             /* Evaluate switch function */
1639             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1640             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1641             velec            = _mm_mul_ps(velec,sw);
1642             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1643
1644             /* Update potential sum for this i atom from the interaction with this j atom. */
1645             velec            = _mm_and_ps(velec,cutoff_mask);
1646             velec            = _mm_andnot_ps(dummy_mask,velec);
1647             velecsum         = _mm_add_ps(velecsum,velec);
1648
1649             fscal            = felec;
1650
1651             fscal            = _mm_and_ps(fscal,cutoff_mask);
1652
1653             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1654
1655             /* Calculate temporary vectorial force */
1656             tx               = _mm_mul_ps(fscal,dx32);
1657             ty               = _mm_mul_ps(fscal,dy32);
1658             tz               = _mm_mul_ps(fscal,dz32);
1659
1660             /* Update vectorial force */
1661             fix3             = _mm_add_ps(fix3,tx);
1662             fiy3             = _mm_add_ps(fiy3,ty);
1663             fiz3             = _mm_add_ps(fiz3,tz);
1664
1665             fjx2             = _mm_add_ps(fjx2,tx);
1666             fjy2             = _mm_add_ps(fjy2,ty);
1667             fjz2             = _mm_add_ps(fjz2,tz);
1668
1669             }
1670
1671             /**************************
1672              * CALCULATE INTERACTIONS *
1673              **************************/
1674
1675             if (gmx_mm_any_lt(rsq33,rcutoff2))
1676             {
1677
1678             r33              = _mm_mul_ps(rsq33,rinv33);
1679             r33              = _mm_andnot_ps(dummy_mask,r33);
1680
1681             /* EWALD ELECTROSTATICS */
1682
1683             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1684             ewrt             = _mm_mul_ps(r33,ewtabscale);
1685             ewitab           = _mm_cvttps_epi32(ewrt);
1686             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1687             ewitab           = _mm_slli_epi32(ewitab,2);
1688             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1689             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1690             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1691             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1692             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1693             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1694             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1695             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1696             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1697
1698             d                = _mm_sub_ps(r33,rswitch);
1699             d                = _mm_max_ps(d,_mm_setzero_ps());
1700             d2               = _mm_mul_ps(d,d);
1701             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1702
1703             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1704
1705             /* Evaluate switch function */
1706             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1707             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1708             velec            = _mm_mul_ps(velec,sw);
1709             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1710
1711             /* Update potential sum for this i atom from the interaction with this j atom. */
1712             velec            = _mm_and_ps(velec,cutoff_mask);
1713             velec            = _mm_andnot_ps(dummy_mask,velec);
1714             velecsum         = _mm_add_ps(velecsum,velec);
1715
1716             fscal            = felec;
1717
1718             fscal            = _mm_and_ps(fscal,cutoff_mask);
1719
1720             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1721
1722             /* Calculate temporary vectorial force */
1723             tx               = _mm_mul_ps(fscal,dx33);
1724             ty               = _mm_mul_ps(fscal,dy33);
1725             tz               = _mm_mul_ps(fscal,dz33);
1726
1727             /* Update vectorial force */
1728             fix3             = _mm_add_ps(fix3,tx);
1729             fiy3             = _mm_add_ps(fiy3,ty);
1730             fiz3             = _mm_add_ps(fiz3,tz);
1731
1732             fjx3             = _mm_add_ps(fjx3,tx);
1733             fjy3             = _mm_add_ps(fjy3,ty);
1734             fjz3             = _mm_add_ps(fjz3,tz);
1735
1736             }
1737
1738             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1739             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1740             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1741             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1742
1743             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1744                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1745                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1746
1747             /* Inner loop uses 657 flops */
1748         }
1749
1750         /* End of innermost loop */
1751
1752         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1753                                               f+i_coord_offset,fshift+i_shift_offset);
1754
1755         ggid                        = gid[iidx];
1756         /* Update potential energies */
1757         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1758         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1759
1760         /* Increment number of inner iterations */
1761         inneriter                  += j_index_end - j_index_start;
1762
1763         /* Outer loop uses 26 flops */
1764     }
1765
1766     /* Increment number of outer iterations */
1767     outeriter        += nri;
1768
1769     /* Update outer/inner flops */
1770
1771     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*657);
1772 }
1773 /*
1774  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_single
1775  * Electrostatics interaction: Ewald
1776  * VdW interaction:            LennardJones
1777  * Geometry:                   Water4-Water4
1778  * Calculate force/pot:        Force
1779  */
1780 void
1781 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_single
1782                     (t_nblist                    * gmx_restrict       nlist,
1783                      rvec                        * gmx_restrict          xx,
1784                      rvec                        * gmx_restrict          ff,
1785                      t_forcerec                  * gmx_restrict          fr,
1786                      t_mdatoms                   * gmx_restrict     mdatoms,
1787                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1788                      t_nrnb                      * gmx_restrict        nrnb)
1789 {
1790     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1791      * just 0 for non-waters.
1792      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1793      * jnr indices corresponding to data put in the four positions in the SIMD register.
1794      */
1795     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1796     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1797     int              jnrA,jnrB,jnrC,jnrD;
1798     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1799     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1800     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1801     real             rcutoff_scalar;
1802     real             *shiftvec,*fshift,*x,*f;
1803     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1804     real             scratch[4*DIM];
1805     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1806     int              vdwioffset0;
1807     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1808     int              vdwioffset1;
1809     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1810     int              vdwioffset2;
1811     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1812     int              vdwioffset3;
1813     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1814     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1815     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1816     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1817     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1818     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1819     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1820     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1821     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1822     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1823     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1824     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1825     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1826     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1827     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1828     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1829     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1830     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1831     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1832     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1833     real             *charge;
1834     int              nvdwtype;
1835     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1836     int              *vdwtype;
1837     real             *vdwparam;
1838     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1839     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1840     __m128i          ewitab;
1841     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1842     real             *ewtab;
1843     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1844     real             rswitch_scalar,d_scalar;
1845     __m128           dummy_mask,cutoff_mask;
1846     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1847     __m128           one     = _mm_set1_ps(1.0);
1848     __m128           two     = _mm_set1_ps(2.0);
1849     x                = xx[0];
1850     f                = ff[0];
1851
1852     nri              = nlist->nri;
1853     iinr             = nlist->iinr;
1854     jindex           = nlist->jindex;
1855     jjnr             = nlist->jjnr;
1856     shiftidx         = nlist->shift;
1857     gid              = nlist->gid;
1858     shiftvec         = fr->shift_vec[0];
1859     fshift           = fr->fshift[0];
1860     facel            = _mm_set1_ps(fr->epsfac);
1861     charge           = mdatoms->chargeA;
1862     nvdwtype         = fr->ntype;
1863     vdwparam         = fr->nbfp;
1864     vdwtype          = mdatoms->typeA;
1865
1866     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1867     ewtab            = fr->ic->tabq_coul_FDV0;
1868     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1869     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1870
1871     /* Setup water-specific parameters */
1872     inr              = nlist->iinr[0];
1873     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1874     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1875     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1876     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1877
1878     jq1              = _mm_set1_ps(charge[inr+1]);
1879     jq2              = _mm_set1_ps(charge[inr+2]);
1880     jq3              = _mm_set1_ps(charge[inr+3]);
1881     vdwjidx0A        = 2*vdwtype[inr+0];
1882     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1883     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1884     qq11             = _mm_mul_ps(iq1,jq1);
1885     qq12             = _mm_mul_ps(iq1,jq2);
1886     qq13             = _mm_mul_ps(iq1,jq3);
1887     qq21             = _mm_mul_ps(iq2,jq1);
1888     qq22             = _mm_mul_ps(iq2,jq2);
1889     qq23             = _mm_mul_ps(iq2,jq3);
1890     qq31             = _mm_mul_ps(iq3,jq1);
1891     qq32             = _mm_mul_ps(iq3,jq2);
1892     qq33             = _mm_mul_ps(iq3,jq3);
1893
1894     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1895     rcutoff_scalar   = fr->rcoulomb;
1896     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1897     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1898
1899     rswitch_scalar   = fr->rcoulomb_switch;
1900     rswitch          = _mm_set1_ps(rswitch_scalar);
1901     /* Setup switch parameters */
1902     d_scalar         = rcutoff_scalar-rswitch_scalar;
1903     d                = _mm_set1_ps(d_scalar);
1904     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1905     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1906     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1907     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1908     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1909     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1910
1911     /* Avoid stupid compiler warnings */
1912     jnrA = jnrB = jnrC = jnrD = 0;
1913     j_coord_offsetA = 0;
1914     j_coord_offsetB = 0;
1915     j_coord_offsetC = 0;
1916     j_coord_offsetD = 0;
1917
1918     outeriter        = 0;
1919     inneriter        = 0;
1920
1921     for(iidx=0;iidx<4*DIM;iidx++)
1922     {
1923         scratch[iidx] = 0.0;
1924     }
1925
1926     /* Start outer loop over neighborlists */
1927     for(iidx=0; iidx<nri; iidx++)
1928     {
1929         /* Load shift vector for this list */
1930         i_shift_offset   = DIM*shiftidx[iidx];
1931
1932         /* Load limits for loop over neighbors */
1933         j_index_start    = jindex[iidx];
1934         j_index_end      = jindex[iidx+1];
1935
1936         /* Get outer coordinate index */
1937         inr              = iinr[iidx];
1938         i_coord_offset   = DIM*inr;
1939
1940         /* Load i particle coords and add shift vector */
1941         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1942                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1943
1944         fix0             = _mm_setzero_ps();
1945         fiy0             = _mm_setzero_ps();
1946         fiz0             = _mm_setzero_ps();
1947         fix1             = _mm_setzero_ps();
1948         fiy1             = _mm_setzero_ps();
1949         fiz1             = _mm_setzero_ps();
1950         fix2             = _mm_setzero_ps();
1951         fiy2             = _mm_setzero_ps();
1952         fiz2             = _mm_setzero_ps();
1953         fix3             = _mm_setzero_ps();
1954         fiy3             = _mm_setzero_ps();
1955         fiz3             = _mm_setzero_ps();
1956
1957         /* Start inner kernel loop */
1958         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1959         {
1960
1961             /* Get j neighbor index, and coordinate index */
1962             jnrA             = jjnr[jidx];
1963             jnrB             = jjnr[jidx+1];
1964             jnrC             = jjnr[jidx+2];
1965             jnrD             = jjnr[jidx+3];
1966             j_coord_offsetA  = DIM*jnrA;
1967             j_coord_offsetB  = DIM*jnrB;
1968             j_coord_offsetC  = DIM*jnrC;
1969             j_coord_offsetD  = DIM*jnrD;
1970
1971             /* load j atom coordinates */
1972             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1973                                               x+j_coord_offsetC,x+j_coord_offsetD,
1974                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1975                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1976
1977             /* Calculate displacement vector */
1978             dx00             = _mm_sub_ps(ix0,jx0);
1979             dy00             = _mm_sub_ps(iy0,jy0);
1980             dz00             = _mm_sub_ps(iz0,jz0);
1981             dx11             = _mm_sub_ps(ix1,jx1);
1982             dy11             = _mm_sub_ps(iy1,jy1);
1983             dz11             = _mm_sub_ps(iz1,jz1);
1984             dx12             = _mm_sub_ps(ix1,jx2);
1985             dy12             = _mm_sub_ps(iy1,jy2);
1986             dz12             = _mm_sub_ps(iz1,jz2);
1987             dx13             = _mm_sub_ps(ix1,jx3);
1988             dy13             = _mm_sub_ps(iy1,jy3);
1989             dz13             = _mm_sub_ps(iz1,jz3);
1990             dx21             = _mm_sub_ps(ix2,jx1);
1991             dy21             = _mm_sub_ps(iy2,jy1);
1992             dz21             = _mm_sub_ps(iz2,jz1);
1993             dx22             = _mm_sub_ps(ix2,jx2);
1994             dy22             = _mm_sub_ps(iy2,jy2);
1995             dz22             = _mm_sub_ps(iz2,jz2);
1996             dx23             = _mm_sub_ps(ix2,jx3);
1997             dy23             = _mm_sub_ps(iy2,jy3);
1998             dz23             = _mm_sub_ps(iz2,jz3);
1999             dx31             = _mm_sub_ps(ix3,jx1);
2000             dy31             = _mm_sub_ps(iy3,jy1);
2001             dz31             = _mm_sub_ps(iz3,jz1);
2002             dx32             = _mm_sub_ps(ix3,jx2);
2003             dy32             = _mm_sub_ps(iy3,jy2);
2004             dz32             = _mm_sub_ps(iz3,jz2);
2005             dx33             = _mm_sub_ps(ix3,jx3);
2006             dy33             = _mm_sub_ps(iy3,jy3);
2007             dz33             = _mm_sub_ps(iz3,jz3);
2008
2009             /* Calculate squared distance and things based on it */
2010             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2011             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2012             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2013             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2014             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2015             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2016             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2017             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2018             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2019             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2020
2021             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2022             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2023             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2024             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2025             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2026             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2027             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2028             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2029             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2030             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2031
2032             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2033             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2034             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2035             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2036             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2037             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2038             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2039             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2040             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2041             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2042
2043             fjx0             = _mm_setzero_ps();
2044             fjy0             = _mm_setzero_ps();
2045             fjz0             = _mm_setzero_ps();
2046             fjx1             = _mm_setzero_ps();
2047             fjy1             = _mm_setzero_ps();
2048             fjz1             = _mm_setzero_ps();
2049             fjx2             = _mm_setzero_ps();
2050             fjy2             = _mm_setzero_ps();
2051             fjz2             = _mm_setzero_ps();
2052             fjx3             = _mm_setzero_ps();
2053             fjy3             = _mm_setzero_ps();
2054             fjz3             = _mm_setzero_ps();
2055
2056             /**************************
2057              * CALCULATE INTERACTIONS *
2058              **************************/
2059
2060             if (gmx_mm_any_lt(rsq00,rcutoff2))
2061             {
2062
2063             r00              = _mm_mul_ps(rsq00,rinv00);
2064
2065             /* LENNARD-JONES DISPERSION/REPULSION */
2066
2067             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2068             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2069             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2070             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2071             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2072
2073             d                = _mm_sub_ps(r00,rswitch);
2074             d                = _mm_max_ps(d,_mm_setzero_ps());
2075             d2               = _mm_mul_ps(d,d);
2076             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2077
2078             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2079
2080             /* Evaluate switch function */
2081             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2082             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2083             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2084
2085             fscal            = fvdw;
2086
2087             fscal            = _mm_and_ps(fscal,cutoff_mask);
2088
2089             /* Calculate temporary vectorial force */
2090             tx               = _mm_mul_ps(fscal,dx00);
2091             ty               = _mm_mul_ps(fscal,dy00);
2092             tz               = _mm_mul_ps(fscal,dz00);
2093
2094             /* Update vectorial force */
2095             fix0             = _mm_add_ps(fix0,tx);
2096             fiy0             = _mm_add_ps(fiy0,ty);
2097             fiz0             = _mm_add_ps(fiz0,tz);
2098
2099             fjx0             = _mm_add_ps(fjx0,tx);
2100             fjy0             = _mm_add_ps(fjy0,ty);
2101             fjz0             = _mm_add_ps(fjz0,tz);
2102
2103             }
2104
2105             /**************************
2106              * CALCULATE INTERACTIONS *
2107              **************************/
2108
2109             if (gmx_mm_any_lt(rsq11,rcutoff2))
2110             {
2111
2112             r11              = _mm_mul_ps(rsq11,rinv11);
2113
2114             /* EWALD ELECTROSTATICS */
2115
2116             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2117             ewrt             = _mm_mul_ps(r11,ewtabscale);
2118             ewitab           = _mm_cvttps_epi32(ewrt);
2119             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2120             ewitab           = _mm_slli_epi32(ewitab,2);
2121             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2122             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2123             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2124             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2125             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2126             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2127             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2128             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2129             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2130
2131             d                = _mm_sub_ps(r11,rswitch);
2132             d                = _mm_max_ps(d,_mm_setzero_ps());
2133             d2               = _mm_mul_ps(d,d);
2134             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2135
2136             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2137
2138             /* Evaluate switch function */
2139             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2140             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2141             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2142
2143             fscal            = felec;
2144
2145             fscal            = _mm_and_ps(fscal,cutoff_mask);
2146
2147             /* Calculate temporary vectorial force */
2148             tx               = _mm_mul_ps(fscal,dx11);
2149             ty               = _mm_mul_ps(fscal,dy11);
2150             tz               = _mm_mul_ps(fscal,dz11);
2151
2152             /* Update vectorial force */
2153             fix1             = _mm_add_ps(fix1,tx);
2154             fiy1             = _mm_add_ps(fiy1,ty);
2155             fiz1             = _mm_add_ps(fiz1,tz);
2156
2157             fjx1             = _mm_add_ps(fjx1,tx);
2158             fjy1             = _mm_add_ps(fjy1,ty);
2159             fjz1             = _mm_add_ps(fjz1,tz);
2160
2161             }
2162
2163             /**************************
2164              * CALCULATE INTERACTIONS *
2165              **************************/
2166
2167             if (gmx_mm_any_lt(rsq12,rcutoff2))
2168             {
2169
2170             r12              = _mm_mul_ps(rsq12,rinv12);
2171
2172             /* EWALD ELECTROSTATICS */
2173
2174             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2175             ewrt             = _mm_mul_ps(r12,ewtabscale);
2176             ewitab           = _mm_cvttps_epi32(ewrt);
2177             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2178             ewitab           = _mm_slli_epi32(ewitab,2);
2179             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2180             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2181             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2182             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2183             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2184             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2185             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2186             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2187             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2188
2189             d                = _mm_sub_ps(r12,rswitch);
2190             d                = _mm_max_ps(d,_mm_setzero_ps());
2191             d2               = _mm_mul_ps(d,d);
2192             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2193
2194             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2195
2196             /* Evaluate switch function */
2197             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2198             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2199             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2200
2201             fscal            = felec;
2202
2203             fscal            = _mm_and_ps(fscal,cutoff_mask);
2204
2205             /* Calculate temporary vectorial force */
2206             tx               = _mm_mul_ps(fscal,dx12);
2207             ty               = _mm_mul_ps(fscal,dy12);
2208             tz               = _mm_mul_ps(fscal,dz12);
2209
2210             /* Update vectorial force */
2211             fix1             = _mm_add_ps(fix1,tx);
2212             fiy1             = _mm_add_ps(fiy1,ty);
2213             fiz1             = _mm_add_ps(fiz1,tz);
2214
2215             fjx2             = _mm_add_ps(fjx2,tx);
2216             fjy2             = _mm_add_ps(fjy2,ty);
2217             fjz2             = _mm_add_ps(fjz2,tz);
2218
2219             }
2220
2221             /**************************
2222              * CALCULATE INTERACTIONS *
2223              **************************/
2224
2225             if (gmx_mm_any_lt(rsq13,rcutoff2))
2226             {
2227
2228             r13              = _mm_mul_ps(rsq13,rinv13);
2229
2230             /* EWALD ELECTROSTATICS */
2231
2232             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2233             ewrt             = _mm_mul_ps(r13,ewtabscale);
2234             ewitab           = _mm_cvttps_epi32(ewrt);
2235             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2236             ewitab           = _mm_slli_epi32(ewitab,2);
2237             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2238             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2239             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2240             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2241             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2242             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2243             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2244             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2245             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2246
2247             d                = _mm_sub_ps(r13,rswitch);
2248             d                = _mm_max_ps(d,_mm_setzero_ps());
2249             d2               = _mm_mul_ps(d,d);
2250             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2251
2252             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2253
2254             /* Evaluate switch function */
2255             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2256             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2257             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2258
2259             fscal            = felec;
2260
2261             fscal            = _mm_and_ps(fscal,cutoff_mask);
2262
2263             /* Calculate temporary vectorial force */
2264             tx               = _mm_mul_ps(fscal,dx13);
2265             ty               = _mm_mul_ps(fscal,dy13);
2266             tz               = _mm_mul_ps(fscal,dz13);
2267
2268             /* Update vectorial force */
2269             fix1             = _mm_add_ps(fix1,tx);
2270             fiy1             = _mm_add_ps(fiy1,ty);
2271             fiz1             = _mm_add_ps(fiz1,tz);
2272
2273             fjx3             = _mm_add_ps(fjx3,tx);
2274             fjy3             = _mm_add_ps(fjy3,ty);
2275             fjz3             = _mm_add_ps(fjz3,tz);
2276
2277             }
2278
2279             /**************************
2280              * CALCULATE INTERACTIONS *
2281              **************************/
2282
2283             if (gmx_mm_any_lt(rsq21,rcutoff2))
2284             {
2285
2286             r21              = _mm_mul_ps(rsq21,rinv21);
2287
2288             /* EWALD ELECTROSTATICS */
2289
2290             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2291             ewrt             = _mm_mul_ps(r21,ewtabscale);
2292             ewitab           = _mm_cvttps_epi32(ewrt);
2293             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2294             ewitab           = _mm_slli_epi32(ewitab,2);
2295             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2296             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2297             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2298             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2299             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2300             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2301             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2302             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2303             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2304
2305             d                = _mm_sub_ps(r21,rswitch);
2306             d                = _mm_max_ps(d,_mm_setzero_ps());
2307             d2               = _mm_mul_ps(d,d);
2308             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2309
2310             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2311
2312             /* Evaluate switch function */
2313             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2314             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2315             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2316
2317             fscal            = felec;
2318
2319             fscal            = _mm_and_ps(fscal,cutoff_mask);
2320
2321             /* Calculate temporary vectorial force */
2322             tx               = _mm_mul_ps(fscal,dx21);
2323             ty               = _mm_mul_ps(fscal,dy21);
2324             tz               = _mm_mul_ps(fscal,dz21);
2325
2326             /* Update vectorial force */
2327             fix2             = _mm_add_ps(fix2,tx);
2328             fiy2             = _mm_add_ps(fiy2,ty);
2329             fiz2             = _mm_add_ps(fiz2,tz);
2330
2331             fjx1             = _mm_add_ps(fjx1,tx);
2332             fjy1             = _mm_add_ps(fjy1,ty);
2333             fjz1             = _mm_add_ps(fjz1,tz);
2334
2335             }
2336
2337             /**************************
2338              * CALCULATE INTERACTIONS *
2339              **************************/
2340
2341             if (gmx_mm_any_lt(rsq22,rcutoff2))
2342             {
2343
2344             r22              = _mm_mul_ps(rsq22,rinv22);
2345
2346             /* EWALD ELECTROSTATICS */
2347
2348             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2349             ewrt             = _mm_mul_ps(r22,ewtabscale);
2350             ewitab           = _mm_cvttps_epi32(ewrt);
2351             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2352             ewitab           = _mm_slli_epi32(ewitab,2);
2353             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2354             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2355             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2356             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2357             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2358             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2359             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2360             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2361             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2362
2363             d                = _mm_sub_ps(r22,rswitch);
2364             d                = _mm_max_ps(d,_mm_setzero_ps());
2365             d2               = _mm_mul_ps(d,d);
2366             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2367
2368             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2369
2370             /* Evaluate switch function */
2371             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2372             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2373             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2374
2375             fscal            = felec;
2376
2377             fscal            = _mm_and_ps(fscal,cutoff_mask);
2378
2379             /* Calculate temporary vectorial force */
2380             tx               = _mm_mul_ps(fscal,dx22);
2381             ty               = _mm_mul_ps(fscal,dy22);
2382             tz               = _mm_mul_ps(fscal,dz22);
2383
2384             /* Update vectorial force */
2385             fix2             = _mm_add_ps(fix2,tx);
2386             fiy2             = _mm_add_ps(fiy2,ty);
2387             fiz2             = _mm_add_ps(fiz2,tz);
2388
2389             fjx2             = _mm_add_ps(fjx2,tx);
2390             fjy2             = _mm_add_ps(fjy2,ty);
2391             fjz2             = _mm_add_ps(fjz2,tz);
2392
2393             }
2394
2395             /**************************
2396              * CALCULATE INTERACTIONS *
2397              **************************/
2398
2399             if (gmx_mm_any_lt(rsq23,rcutoff2))
2400             {
2401
2402             r23              = _mm_mul_ps(rsq23,rinv23);
2403
2404             /* EWALD ELECTROSTATICS */
2405
2406             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2407             ewrt             = _mm_mul_ps(r23,ewtabscale);
2408             ewitab           = _mm_cvttps_epi32(ewrt);
2409             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2410             ewitab           = _mm_slli_epi32(ewitab,2);
2411             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2412             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2413             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2414             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2415             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2416             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2417             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2418             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2419             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2420
2421             d                = _mm_sub_ps(r23,rswitch);
2422             d                = _mm_max_ps(d,_mm_setzero_ps());
2423             d2               = _mm_mul_ps(d,d);
2424             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2425
2426             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2427
2428             /* Evaluate switch function */
2429             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2430             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2431             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2432
2433             fscal            = felec;
2434
2435             fscal            = _mm_and_ps(fscal,cutoff_mask);
2436
2437             /* Calculate temporary vectorial force */
2438             tx               = _mm_mul_ps(fscal,dx23);
2439             ty               = _mm_mul_ps(fscal,dy23);
2440             tz               = _mm_mul_ps(fscal,dz23);
2441
2442             /* Update vectorial force */
2443             fix2             = _mm_add_ps(fix2,tx);
2444             fiy2             = _mm_add_ps(fiy2,ty);
2445             fiz2             = _mm_add_ps(fiz2,tz);
2446
2447             fjx3             = _mm_add_ps(fjx3,tx);
2448             fjy3             = _mm_add_ps(fjy3,ty);
2449             fjz3             = _mm_add_ps(fjz3,tz);
2450
2451             }
2452
2453             /**************************
2454              * CALCULATE INTERACTIONS *
2455              **************************/
2456
2457             if (gmx_mm_any_lt(rsq31,rcutoff2))
2458             {
2459
2460             r31              = _mm_mul_ps(rsq31,rinv31);
2461
2462             /* EWALD ELECTROSTATICS */
2463
2464             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2465             ewrt             = _mm_mul_ps(r31,ewtabscale);
2466             ewitab           = _mm_cvttps_epi32(ewrt);
2467             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2468             ewitab           = _mm_slli_epi32(ewitab,2);
2469             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2470             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2471             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2472             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2473             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2474             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2475             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2476             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2477             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2478
2479             d                = _mm_sub_ps(r31,rswitch);
2480             d                = _mm_max_ps(d,_mm_setzero_ps());
2481             d2               = _mm_mul_ps(d,d);
2482             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2483
2484             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2485
2486             /* Evaluate switch function */
2487             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2488             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2489             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2490
2491             fscal            = felec;
2492
2493             fscal            = _mm_and_ps(fscal,cutoff_mask);
2494
2495             /* Calculate temporary vectorial force */
2496             tx               = _mm_mul_ps(fscal,dx31);
2497             ty               = _mm_mul_ps(fscal,dy31);
2498             tz               = _mm_mul_ps(fscal,dz31);
2499
2500             /* Update vectorial force */
2501             fix3             = _mm_add_ps(fix3,tx);
2502             fiy3             = _mm_add_ps(fiy3,ty);
2503             fiz3             = _mm_add_ps(fiz3,tz);
2504
2505             fjx1             = _mm_add_ps(fjx1,tx);
2506             fjy1             = _mm_add_ps(fjy1,ty);
2507             fjz1             = _mm_add_ps(fjz1,tz);
2508
2509             }
2510
2511             /**************************
2512              * CALCULATE INTERACTIONS *
2513              **************************/
2514
2515             if (gmx_mm_any_lt(rsq32,rcutoff2))
2516             {
2517
2518             r32              = _mm_mul_ps(rsq32,rinv32);
2519
2520             /* EWALD ELECTROSTATICS */
2521
2522             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2523             ewrt             = _mm_mul_ps(r32,ewtabscale);
2524             ewitab           = _mm_cvttps_epi32(ewrt);
2525             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2526             ewitab           = _mm_slli_epi32(ewitab,2);
2527             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2528             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2529             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2530             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2531             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2532             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2533             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2534             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2535             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2536
2537             d                = _mm_sub_ps(r32,rswitch);
2538             d                = _mm_max_ps(d,_mm_setzero_ps());
2539             d2               = _mm_mul_ps(d,d);
2540             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2541
2542             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2543
2544             /* Evaluate switch function */
2545             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2546             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2547             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2548
2549             fscal            = felec;
2550
2551             fscal            = _mm_and_ps(fscal,cutoff_mask);
2552
2553             /* Calculate temporary vectorial force */
2554             tx               = _mm_mul_ps(fscal,dx32);
2555             ty               = _mm_mul_ps(fscal,dy32);
2556             tz               = _mm_mul_ps(fscal,dz32);
2557
2558             /* Update vectorial force */
2559             fix3             = _mm_add_ps(fix3,tx);
2560             fiy3             = _mm_add_ps(fiy3,ty);
2561             fiz3             = _mm_add_ps(fiz3,tz);
2562
2563             fjx2             = _mm_add_ps(fjx2,tx);
2564             fjy2             = _mm_add_ps(fjy2,ty);
2565             fjz2             = _mm_add_ps(fjz2,tz);
2566
2567             }
2568
2569             /**************************
2570              * CALCULATE INTERACTIONS *
2571              **************************/
2572
2573             if (gmx_mm_any_lt(rsq33,rcutoff2))
2574             {
2575
2576             r33              = _mm_mul_ps(rsq33,rinv33);
2577
2578             /* EWALD ELECTROSTATICS */
2579
2580             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2581             ewrt             = _mm_mul_ps(r33,ewtabscale);
2582             ewitab           = _mm_cvttps_epi32(ewrt);
2583             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2584             ewitab           = _mm_slli_epi32(ewitab,2);
2585             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2586             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2587             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2588             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2589             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2590             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2591             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2592             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2593             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2594
2595             d                = _mm_sub_ps(r33,rswitch);
2596             d                = _mm_max_ps(d,_mm_setzero_ps());
2597             d2               = _mm_mul_ps(d,d);
2598             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2599
2600             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2601
2602             /* Evaluate switch function */
2603             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2604             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2605             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2606
2607             fscal            = felec;
2608
2609             fscal            = _mm_and_ps(fscal,cutoff_mask);
2610
2611             /* Calculate temporary vectorial force */
2612             tx               = _mm_mul_ps(fscal,dx33);
2613             ty               = _mm_mul_ps(fscal,dy33);
2614             tz               = _mm_mul_ps(fscal,dz33);
2615
2616             /* Update vectorial force */
2617             fix3             = _mm_add_ps(fix3,tx);
2618             fiy3             = _mm_add_ps(fiy3,ty);
2619             fiz3             = _mm_add_ps(fiz3,tz);
2620
2621             fjx3             = _mm_add_ps(fjx3,tx);
2622             fjy3             = _mm_add_ps(fjy3,ty);
2623             fjz3             = _mm_add_ps(fjz3,tz);
2624
2625             }
2626
2627             fjptrA             = f+j_coord_offsetA;
2628             fjptrB             = f+j_coord_offsetB;
2629             fjptrC             = f+j_coord_offsetC;
2630             fjptrD             = f+j_coord_offsetD;
2631
2632             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2633                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2634                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2635
2636             /* Inner loop uses 617 flops */
2637         }
2638
2639         if(jidx<j_index_end)
2640         {
2641
2642             /* Get j neighbor index, and coordinate index */
2643             jnrlistA         = jjnr[jidx];
2644             jnrlistB         = jjnr[jidx+1];
2645             jnrlistC         = jjnr[jidx+2];
2646             jnrlistD         = jjnr[jidx+3];
2647             /* Sign of each element will be negative for non-real atoms.
2648              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2649              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2650              */
2651             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2652             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2653             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2654             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2655             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2656             j_coord_offsetA  = DIM*jnrA;
2657             j_coord_offsetB  = DIM*jnrB;
2658             j_coord_offsetC  = DIM*jnrC;
2659             j_coord_offsetD  = DIM*jnrD;
2660
2661             /* load j atom coordinates */
2662             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2663                                               x+j_coord_offsetC,x+j_coord_offsetD,
2664                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2665                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2666
2667             /* Calculate displacement vector */
2668             dx00             = _mm_sub_ps(ix0,jx0);
2669             dy00             = _mm_sub_ps(iy0,jy0);
2670             dz00             = _mm_sub_ps(iz0,jz0);
2671             dx11             = _mm_sub_ps(ix1,jx1);
2672             dy11             = _mm_sub_ps(iy1,jy1);
2673             dz11             = _mm_sub_ps(iz1,jz1);
2674             dx12             = _mm_sub_ps(ix1,jx2);
2675             dy12             = _mm_sub_ps(iy1,jy2);
2676             dz12             = _mm_sub_ps(iz1,jz2);
2677             dx13             = _mm_sub_ps(ix1,jx3);
2678             dy13             = _mm_sub_ps(iy1,jy3);
2679             dz13             = _mm_sub_ps(iz1,jz3);
2680             dx21             = _mm_sub_ps(ix2,jx1);
2681             dy21             = _mm_sub_ps(iy2,jy1);
2682             dz21             = _mm_sub_ps(iz2,jz1);
2683             dx22             = _mm_sub_ps(ix2,jx2);
2684             dy22             = _mm_sub_ps(iy2,jy2);
2685             dz22             = _mm_sub_ps(iz2,jz2);
2686             dx23             = _mm_sub_ps(ix2,jx3);
2687             dy23             = _mm_sub_ps(iy2,jy3);
2688             dz23             = _mm_sub_ps(iz2,jz3);
2689             dx31             = _mm_sub_ps(ix3,jx1);
2690             dy31             = _mm_sub_ps(iy3,jy1);
2691             dz31             = _mm_sub_ps(iz3,jz1);
2692             dx32             = _mm_sub_ps(ix3,jx2);
2693             dy32             = _mm_sub_ps(iy3,jy2);
2694             dz32             = _mm_sub_ps(iz3,jz2);
2695             dx33             = _mm_sub_ps(ix3,jx3);
2696             dy33             = _mm_sub_ps(iy3,jy3);
2697             dz33             = _mm_sub_ps(iz3,jz3);
2698
2699             /* Calculate squared distance and things based on it */
2700             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2701             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2702             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2703             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2704             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2705             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2706             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2707             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2708             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2709             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2710
2711             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2712             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2713             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2714             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2715             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2716             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2717             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2718             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2719             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2720             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2721
2722             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2723             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2724             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2725             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2726             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2727             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2728             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2729             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2730             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2731             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2732
2733             fjx0             = _mm_setzero_ps();
2734             fjy0             = _mm_setzero_ps();
2735             fjz0             = _mm_setzero_ps();
2736             fjx1             = _mm_setzero_ps();
2737             fjy1             = _mm_setzero_ps();
2738             fjz1             = _mm_setzero_ps();
2739             fjx2             = _mm_setzero_ps();
2740             fjy2             = _mm_setzero_ps();
2741             fjz2             = _mm_setzero_ps();
2742             fjx3             = _mm_setzero_ps();
2743             fjy3             = _mm_setzero_ps();
2744             fjz3             = _mm_setzero_ps();
2745
2746             /**************************
2747              * CALCULATE INTERACTIONS *
2748              **************************/
2749
2750             if (gmx_mm_any_lt(rsq00,rcutoff2))
2751             {
2752
2753             r00              = _mm_mul_ps(rsq00,rinv00);
2754             r00              = _mm_andnot_ps(dummy_mask,r00);
2755
2756             /* LENNARD-JONES DISPERSION/REPULSION */
2757
2758             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2759             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2760             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2761             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2762             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2763
2764             d                = _mm_sub_ps(r00,rswitch);
2765             d                = _mm_max_ps(d,_mm_setzero_ps());
2766             d2               = _mm_mul_ps(d,d);
2767             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2768
2769             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2770
2771             /* Evaluate switch function */
2772             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2773             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2774             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2775
2776             fscal            = fvdw;
2777
2778             fscal            = _mm_and_ps(fscal,cutoff_mask);
2779
2780             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2781
2782             /* Calculate temporary vectorial force */
2783             tx               = _mm_mul_ps(fscal,dx00);
2784             ty               = _mm_mul_ps(fscal,dy00);
2785             tz               = _mm_mul_ps(fscal,dz00);
2786
2787             /* Update vectorial force */
2788             fix0             = _mm_add_ps(fix0,tx);
2789             fiy0             = _mm_add_ps(fiy0,ty);
2790             fiz0             = _mm_add_ps(fiz0,tz);
2791
2792             fjx0             = _mm_add_ps(fjx0,tx);
2793             fjy0             = _mm_add_ps(fjy0,ty);
2794             fjz0             = _mm_add_ps(fjz0,tz);
2795
2796             }
2797
2798             /**************************
2799              * CALCULATE INTERACTIONS *
2800              **************************/
2801
2802             if (gmx_mm_any_lt(rsq11,rcutoff2))
2803             {
2804
2805             r11              = _mm_mul_ps(rsq11,rinv11);
2806             r11              = _mm_andnot_ps(dummy_mask,r11);
2807
2808             /* EWALD ELECTROSTATICS */
2809
2810             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2811             ewrt             = _mm_mul_ps(r11,ewtabscale);
2812             ewitab           = _mm_cvttps_epi32(ewrt);
2813             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2814             ewitab           = _mm_slli_epi32(ewitab,2);
2815             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2816             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2817             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2818             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2819             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2820             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2821             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2822             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2823             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2824
2825             d                = _mm_sub_ps(r11,rswitch);
2826             d                = _mm_max_ps(d,_mm_setzero_ps());
2827             d2               = _mm_mul_ps(d,d);
2828             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2829
2830             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2831
2832             /* Evaluate switch function */
2833             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2834             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2835             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2836
2837             fscal            = felec;
2838
2839             fscal            = _mm_and_ps(fscal,cutoff_mask);
2840
2841             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2842
2843             /* Calculate temporary vectorial force */
2844             tx               = _mm_mul_ps(fscal,dx11);
2845             ty               = _mm_mul_ps(fscal,dy11);
2846             tz               = _mm_mul_ps(fscal,dz11);
2847
2848             /* Update vectorial force */
2849             fix1             = _mm_add_ps(fix1,tx);
2850             fiy1             = _mm_add_ps(fiy1,ty);
2851             fiz1             = _mm_add_ps(fiz1,tz);
2852
2853             fjx1             = _mm_add_ps(fjx1,tx);
2854             fjy1             = _mm_add_ps(fjy1,ty);
2855             fjz1             = _mm_add_ps(fjz1,tz);
2856
2857             }
2858
2859             /**************************
2860              * CALCULATE INTERACTIONS *
2861              **************************/
2862
2863             if (gmx_mm_any_lt(rsq12,rcutoff2))
2864             {
2865
2866             r12              = _mm_mul_ps(rsq12,rinv12);
2867             r12              = _mm_andnot_ps(dummy_mask,r12);
2868
2869             /* EWALD ELECTROSTATICS */
2870
2871             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2872             ewrt             = _mm_mul_ps(r12,ewtabscale);
2873             ewitab           = _mm_cvttps_epi32(ewrt);
2874             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2875             ewitab           = _mm_slli_epi32(ewitab,2);
2876             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2877             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2878             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2879             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2880             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2881             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2882             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2883             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2884             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2885
2886             d                = _mm_sub_ps(r12,rswitch);
2887             d                = _mm_max_ps(d,_mm_setzero_ps());
2888             d2               = _mm_mul_ps(d,d);
2889             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2890
2891             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2892
2893             /* Evaluate switch function */
2894             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2895             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2896             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2897
2898             fscal            = felec;
2899
2900             fscal            = _mm_and_ps(fscal,cutoff_mask);
2901
2902             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2903
2904             /* Calculate temporary vectorial force */
2905             tx               = _mm_mul_ps(fscal,dx12);
2906             ty               = _mm_mul_ps(fscal,dy12);
2907             tz               = _mm_mul_ps(fscal,dz12);
2908
2909             /* Update vectorial force */
2910             fix1             = _mm_add_ps(fix1,tx);
2911             fiy1             = _mm_add_ps(fiy1,ty);
2912             fiz1             = _mm_add_ps(fiz1,tz);
2913
2914             fjx2             = _mm_add_ps(fjx2,tx);
2915             fjy2             = _mm_add_ps(fjy2,ty);
2916             fjz2             = _mm_add_ps(fjz2,tz);
2917
2918             }
2919
2920             /**************************
2921              * CALCULATE INTERACTIONS *
2922              **************************/
2923
2924             if (gmx_mm_any_lt(rsq13,rcutoff2))
2925             {
2926
2927             r13              = _mm_mul_ps(rsq13,rinv13);
2928             r13              = _mm_andnot_ps(dummy_mask,r13);
2929
2930             /* EWALD ELECTROSTATICS */
2931
2932             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2933             ewrt             = _mm_mul_ps(r13,ewtabscale);
2934             ewitab           = _mm_cvttps_epi32(ewrt);
2935             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2936             ewitab           = _mm_slli_epi32(ewitab,2);
2937             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2938             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2939             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2940             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2941             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2942             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2943             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2944             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2945             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2946
2947             d                = _mm_sub_ps(r13,rswitch);
2948             d                = _mm_max_ps(d,_mm_setzero_ps());
2949             d2               = _mm_mul_ps(d,d);
2950             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2951
2952             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2953
2954             /* Evaluate switch function */
2955             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2956             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2957             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2958
2959             fscal            = felec;
2960
2961             fscal            = _mm_and_ps(fscal,cutoff_mask);
2962
2963             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2964
2965             /* Calculate temporary vectorial force */
2966             tx               = _mm_mul_ps(fscal,dx13);
2967             ty               = _mm_mul_ps(fscal,dy13);
2968             tz               = _mm_mul_ps(fscal,dz13);
2969
2970             /* Update vectorial force */
2971             fix1             = _mm_add_ps(fix1,tx);
2972             fiy1             = _mm_add_ps(fiy1,ty);
2973             fiz1             = _mm_add_ps(fiz1,tz);
2974
2975             fjx3             = _mm_add_ps(fjx3,tx);
2976             fjy3             = _mm_add_ps(fjy3,ty);
2977             fjz3             = _mm_add_ps(fjz3,tz);
2978
2979             }
2980
2981             /**************************
2982              * CALCULATE INTERACTIONS *
2983              **************************/
2984
2985             if (gmx_mm_any_lt(rsq21,rcutoff2))
2986             {
2987
2988             r21              = _mm_mul_ps(rsq21,rinv21);
2989             r21              = _mm_andnot_ps(dummy_mask,r21);
2990
2991             /* EWALD ELECTROSTATICS */
2992
2993             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2994             ewrt             = _mm_mul_ps(r21,ewtabscale);
2995             ewitab           = _mm_cvttps_epi32(ewrt);
2996             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2997             ewitab           = _mm_slli_epi32(ewitab,2);
2998             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2999             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3000             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3001             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3002             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3003             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3004             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3005             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
3006             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
3007
3008             d                = _mm_sub_ps(r21,rswitch);
3009             d                = _mm_max_ps(d,_mm_setzero_ps());
3010             d2               = _mm_mul_ps(d,d);
3011             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3012
3013             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3014
3015             /* Evaluate switch function */
3016             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3017             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3018             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
3019
3020             fscal            = felec;
3021
3022             fscal            = _mm_and_ps(fscal,cutoff_mask);
3023
3024             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3025
3026             /* Calculate temporary vectorial force */
3027             tx               = _mm_mul_ps(fscal,dx21);
3028             ty               = _mm_mul_ps(fscal,dy21);
3029             tz               = _mm_mul_ps(fscal,dz21);
3030
3031             /* Update vectorial force */
3032             fix2             = _mm_add_ps(fix2,tx);
3033             fiy2             = _mm_add_ps(fiy2,ty);
3034             fiz2             = _mm_add_ps(fiz2,tz);
3035
3036             fjx1             = _mm_add_ps(fjx1,tx);
3037             fjy1             = _mm_add_ps(fjy1,ty);
3038             fjz1             = _mm_add_ps(fjz1,tz);
3039
3040             }
3041
3042             /**************************
3043              * CALCULATE INTERACTIONS *
3044              **************************/
3045
3046             if (gmx_mm_any_lt(rsq22,rcutoff2))
3047             {
3048
3049             r22              = _mm_mul_ps(rsq22,rinv22);
3050             r22              = _mm_andnot_ps(dummy_mask,r22);
3051
3052             /* EWALD ELECTROSTATICS */
3053
3054             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3055             ewrt             = _mm_mul_ps(r22,ewtabscale);
3056             ewitab           = _mm_cvttps_epi32(ewrt);
3057             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3058             ewitab           = _mm_slli_epi32(ewitab,2);
3059             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3060             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3061             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3062             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3063             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3064             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3065             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3066             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3067             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3068
3069             d                = _mm_sub_ps(r22,rswitch);
3070             d                = _mm_max_ps(d,_mm_setzero_ps());
3071             d2               = _mm_mul_ps(d,d);
3072             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3073
3074             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3075
3076             /* Evaluate switch function */
3077             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3078             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3079             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
3080
3081             fscal            = felec;
3082
3083             fscal            = _mm_and_ps(fscal,cutoff_mask);
3084
3085             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3086
3087             /* Calculate temporary vectorial force */
3088             tx               = _mm_mul_ps(fscal,dx22);
3089             ty               = _mm_mul_ps(fscal,dy22);
3090             tz               = _mm_mul_ps(fscal,dz22);
3091
3092             /* Update vectorial force */
3093             fix2             = _mm_add_ps(fix2,tx);
3094             fiy2             = _mm_add_ps(fiy2,ty);
3095             fiz2             = _mm_add_ps(fiz2,tz);
3096
3097             fjx2             = _mm_add_ps(fjx2,tx);
3098             fjy2             = _mm_add_ps(fjy2,ty);
3099             fjz2             = _mm_add_ps(fjz2,tz);
3100
3101             }
3102
3103             /**************************
3104              * CALCULATE INTERACTIONS *
3105              **************************/
3106
3107             if (gmx_mm_any_lt(rsq23,rcutoff2))
3108             {
3109
3110             r23              = _mm_mul_ps(rsq23,rinv23);
3111             r23              = _mm_andnot_ps(dummy_mask,r23);
3112
3113             /* EWALD ELECTROSTATICS */
3114
3115             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3116             ewrt             = _mm_mul_ps(r23,ewtabscale);
3117             ewitab           = _mm_cvttps_epi32(ewrt);
3118             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3119             ewitab           = _mm_slli_epi32(ewitab,2);
3120             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3121             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3122             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3123             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3124             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3125             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3126             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3127             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
3128             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
3129
3130             d                = _mm_sub_ps(r23,rswitch);
3131             d                = _mm_max_ps(d,_mm_setzero_ps());
3132             d2               = _mm_mul_ps(d,d);
3133             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3134
3135             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3136
3137             /* Evaluate switch function */
3138             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3139             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
3140             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
3141
3142             fscal            = felec;
3143
3144             fscal            = _mm_and_ps(fscal,cutoff_mask);
3145
3146             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3147
3148             /* Calculate temporary vectorial force */
3149             tx               = _mm_mul_ps(fscal,dx23);
3150             ty               = _mm_mul_ps(fscal,dy23);
3151             tz               = _mm_mul_ps(fscal,dz23);
3152
3153             /* Update vectorial force */
3154             fix2             = _mm_add_ps(fix2,tx);
3155             fiy2             = _mm_add_ps(fiy2,ty);
3156             fiz2             = _mm_add_ps(fiz2,tz);
3157
3158             fjx3             = _mm_add_ps(fjx3,tx);
3159             fjy3             = _mm_add_ps(fjy3,ty);
3160             fjz3             = _mm_add_ps(fjz3,tz);
3161
3162             }
3163
3164             /**************************
3165              * CALCULATE INTERACTIONS *
3166              **************************/
3167
3168             if (gmx_mm_any_lt(rsq31,rcutoff2))
3169             {
3170
3171             r31              = _mm_mul_ps(rsq31,rinv31);
3172             r31              = _mm_andnot_ps(dummy_mask,r31);
3173
3174             /* EWALD ELECTROSTATICS */
3175
3176             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3177             ewrt             = _mm_mul_ps(r31,ewtabscale);
3178             ewitab           = _mm_cvttps_epi32(ewrt);
3179             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3180             ewitab           = _mm_slli_epi32(ewitab,2);
3181             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3182             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3183             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3184             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3185             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3186             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3187             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3188             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
3189             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
3190
3191             d                = _mm_sub_ps(r31,rswitch);
3192             d                = _mm_max_ps(d,_mm_setzero_ps());
3193             d2               = _mm_mul_ps(d,d);
3194             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3195
3196             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3197
3198             /* Evaluate switch function */
3199             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3200             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
3201             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
3202
3203             fscal            = felec;
3204
3205             fscal            = _mm_and_ps(fscal,cutoff_mask);
3206
3207             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3208
3209             /* Calculate temporary vectorial force */
3210             tx               = _mm_mul_ps(fscal,dx31);
3211             ty               = _mm_mul_ps(fscal,dy31);
3212             tz               = _mm_mul_ps(fscal,dz31);
3213
3214             /* Update vectorial force */
3215             fix3             = _mm_add_ps(fix3,tx);
3216             fiy3             = _mm_add_ps(fiy3,ty);
3217             fiz3             = _mm_add_ps(fiz3,tz);
3218
3219             fjx1             = _mm_add_ps(fjx1,tx);
3220             fjy1             = _mm_add_ps(fjy1,ty);
3221             fjz1             = _mm_add_ps(fjz1,tz);
3222
3223             }
3224
3225             /**************************
3226              * CALCULATE INTERACTIONS *
3227              **************************/
3228
3229             if (gmx_mm_any_lt(rsq32,rcutoff2))
3230             {
3231
3232             r32              = _mm_mul_ps(rsq32,rinv32);
3233             r32              = _mm_andnot_ps(dummy_mask,r32);
3234
3235             /* EWALD ELECTROSTATICS */
3236
3237             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3238             ewrt             = _mm_mul_ps(r32,ewtabscale);
3239             ewitab           = _mm_cvttps_epi32(ewrt);
3240             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3241             ewitab           = _mm_slli_epi32(ewitab,2);
3242             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3243             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3244             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3245             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3246             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3247             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3248             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3249             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
3250             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
3251
3252             d                = _mm_sub_ps(r32,rswitch);
3253             d                = _mm_max_ps(d,_mm_setzero_ps());
3254             d2               = _mm_mul_ps(d,d);
3255             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3256
3257             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3258
3259             /* Evaluate switch function */
3260             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3261             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
3262             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
3263
3264             fscal            = felec;
3265
3266             fscal            = _mm_and_ps(fscal,cutoff_mask);
3267
3268             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3269
3270             /* Calculate temporary vectorial force */
3271             tx               = _mm_mul_ps(fscal,dx32);
3272             ty               = _mm_mul_ps(fscal,dy32);
3273             tz               = _mm_mul_ps(fscal,dz32);
3274
3275             /* Update vectorial force */
3276             fix3             = _mm_add_ps(fix3,tx);
3277             fiy3             = _mm_add_ps(fiy3,ty);
3278             fiz3             = _mm_add_ps(fiz3,tz);
3279
3280             fjx2             = _mm_add_ps(fjx2,tx);
3281             fjy2             = _mm_add_ps(fjy2,ty);
3282             fjz2             = _mm_add_ps(fjz2,tz);
3283
3284             }
3285
3286             /**************************
3287              * CALCULATE INTERACTIONS *
3288              **************************/
3289
3290             if (gmx_mm_any_lt(rsq33,rcutoff2))
3291             {
3292
3293             r33              = _mm_mul_ps(rsq33,rinv33);
3294             r33              = _mm_andnot_ps(dummy_mask,r33);
3295
3296             /* EWALD ELECTROSTATICS */
3297
3298             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3299             ewrt             = _mm_mul_ps(r33,ewtabscale);
3300             ewitab           = _mm_cvttps_epi32(ewrt);
3301             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3302             ewitab           = _mm_slli_epi32(ewitab,2);
3303             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3304             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3305             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3306             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3307             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3308             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3309             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3310             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
3311             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
3312
3313             d                = _mm_sub_ps(r33,rswitch);
3314             d                = _mm_max_ps(d,_mm_setzero_ps());
3315             d2               = _mm_mul_ps(d,d);
3316             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3317
3318             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3319
3320             /* Evaluate switch function */
3321             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3322             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
3323             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
3324
3325             fscal            = felec;
3326
3327             fscal            = _mm_and_ps(fscal,cutoff_mask);
3328
3329             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3330
3331             /* Calculate temporary vectorial force */
3332             tx               = _mm_mul_ps(fscal,dx33);
3333             ty               = _mm_mul_ps(fscal,dy33);
3334             tz               = _mm_mul_ps(fscal,dz33);
3335
3336             /* Update vectorial force */
3337             fix3             = _mm_add_ps(fix3,tx);
3338             fiy3             = _mm_add_ps(fiy3,ty);
3339             fiz3             = _mm_add_ps(fiz3,tz);
3340
3341             fjx3             = _mm_add_ps(fjx3,tx);
3342             fjy3             = _mm_add_ps(fjy3,ty);
3343             fjz3             = _mm_add_ps(fjz3,tz);
3344
3345             }
3346
3347             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3348             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3349             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3350             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3351
3352             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3353                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3354                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3355
3356             /* Inner loop uses 627 flops */
3357         }
3358
3359         /* End of innermost loop */
3360
3361         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3362                                               f+i_coord_offset,fshift+i_shift_offset);
3363
3364         /* Increment number of inner iterations */
3365         inneriter                  += j_index_end - j_index_start;
3366
3367         /* Outer loop uses 24 flops */
3368     }
3369
3370     /* Increment number of outer iterations */
3371     outeriter        += nri;
3372
3373     /* Update outer/inner flops */
3374
3375     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*627);
3376 }