Valgrind suppression for OS X 10.9
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_double / nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sse4_1_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse4_1_double.h"
50 #include "kernelutil_x86_sse4_1_double.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_double
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_double
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB;
77     int              j_coord_offsetA,j_coord_offsetB;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwioffset3;
89     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90     int              vdwjidx0A,vdwjidx0B;
91     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92     int              vdwjidx1A,vdwjidx1B;
93     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B;
95     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     int              vdwjidx3A,vdwjidx3B;
97     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
109     real             *charge;
110     int              nvdwtype;
111     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112     int              *vdwtype;
113     real             *vdwparam;
114     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
115     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
116     __m128i          ewitab;
117     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
118     real             *ewtab;
119     __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
120     real             rswitch_scalar,d_scalar;
121     __m128d          dummy_mask,cutoff_mask;
122     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
123     __m128d          one     = _mm_set1_pd(1.0);
124     __m128d          two     = _mm_set1_pd(2.0);
125     x                = xx[0];
126     f                = ff[0];
127
128     nri              = nlist->nri;
129     iinr             = nlist->iinr;
130     jindex           = nlist->jindex;
131     jjnr             = nlist->jjnr;
132     shiftidx         = nlist->shift;
133     gid              = nlist->gid;
134     shiftvec         = fr->shift_vec[0];
135     fshift           = fr->fshift[0];
136     facel            = _mm_set1_pd(fr->epsfac);
137     charge           = mdatoms->chargeA;
138     nvdwtype         = fr->ntype;
139     vdwparam         = fr->nbfp;
140     vdwtype          = mdatoms->typeA;
141
142     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
143     ewtab            = fr->ic->tabq_coul_FDV0;
144     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
145     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
146
147     /* Setup water-specific parameters */
148     inr              = nlist->iinr[0];
149     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
150     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
151     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
152     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
153
154     jq1              = _mm_set1_pd(charge[inr+1]);
155     jq2              = _mm_set1_pd(charge[inr+2]);
156     jq3              = _mm_set1_pd(charge[inr+3]);
157     vdwjidx0A        = 2*vdwtype[inr+0];
158     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
159     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
160     qq11             = _mm_mul_pd(iq1,jq1);
161     qq12             = _mm_mul_pd(iq1,jq2);
162     qq13             = _mm_mul_pd(iq1,jq3);
163     qq21             = _mm_mul_pd(iq2,jq1);
164     qq22             = _mm_mul_pd(iq2,jq2);
165     qq23             = _mm_mul_pd(iq2,jq3);
166     qq31             = _mm_mul_pd(iq3,jq1);
167     qq32             = _mm_mul_pd(iq3,jq2);
168     qq33             = _mm_mul_pd(iq3,jq3);
169
170     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
171     rcutoff_scalar   = fr->rcoulomb;
172     rcutoff          = _mm_set1_pd(rcutoff_scalar);
173     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
174
175     rswitch_scalar   = fr->rcoulomb_switch;
176     rswitch          = _mm_set1_pd(rswitch_scalar);
177     /* Setup switch parameters */
178     d_scalar         = rcutoff_scalar-rswitch_scalar;
179     d                = _mm_set1_pd(d_scalar);
180     swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
181     swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
182     swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
183     swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
184     swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
185     swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
186
187     /* Avoid stupid compiler warnings */
188     jnrA = jnrB = 0;
189     j_coord_offsetA = 0;
190     j_coord_offsetB = 0;
191
192     outeriter        = 0;
193     inneriter        = 0;
194
195     /* Start outer loop over neighborlists */
196     for(iidx=0; iidx<nri; iidx++)
197     {
198         /* Load shift vector for this list */
199         i_shift_offset   = DIM*shiftidx[iidx];
200
201         /* Load limits for loop over neighbors */
202         j_index_start    = jindex[iidx];
203         j_index_end      = jindex[iidx+1];
204
205         /* Get outer coordinate index */
206         inr              = iinr[iidx];
207         i_coord_offset   = DIM*inr;
208
209         /* Load i particle coords and add shift vector */
210         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
211                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
212
213         fix0             = _mm_setzero_pd();
214         fiy0             = _mm_setzero_pd();
215         fiz0             = _mm_setzero_pd();
216         fix1             = _mm_setzero_pd();
217         fiy1             = _mm_setzero_pd();
218         fiz1             = _mm_setzero_pd();
219         fix2             = _mm_setzero_pd();
220         fiy2             = _mm_setzero_pd();
221         fiz2             = _mm_setzero_pd();
222         fix3             = _mm_setzero_pd();
223         fiy3             = _mm_setzero_pd();
224         fiz3             = _mm_setzero_pd();
225
226         /* Reset potential sums */
227         velecsum         = _mm_setzero_pd();
228         vvdwsum          = _mm_setzero_pd();
229
230         /* Start inner kernel loop */
231         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
232         {
233
234             /* Get j neighbor index, and coordinate index */
235             jnrA             = jjnr[jidx];
236             jnrB             = jjnr[jidx+1];
237             j_coord_offsetA  = DIM*jnrA;
238             j_coord_offsetB  = DIM*jnrB;
239
240             /* load j atom coordinates */
241             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
242                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
243                                               &jy2,&jz2,&jx3,&jy3,&jz3);
244
245             /* Calculate displacement vector */
246             dx00             = _mm_sub_pd(ix0,jx0);
247             dy00             = _mm_sub_pd(iy0,jy0);
248             dz00             = _mm_sub_pd(iz0,jz0);
249             dx11             = _mm_sub_pd(ix1,jx1);
250             dy11             = _mm_sub_pd(iy1,jy1);
251             dz11             = _mm_sub_pd(iz1,jz1);
252             dx12             = _mm_sub_pd(ix1,jx2);
253             dy12             = _mm_sub_pd(iy1,jy2);
254             dz12             = _mm_sub_pd(iz1,jz2);
255             dx13             = _mm_sub_pd(ix1,jx3);
256             dy13             = _mm_sub_pd(iy1,jy3);
257             dz13             = _mm_sub_pd(iz1,jz3);
258             dx21             = _mm_sub_pd(ix2,jx1);
259             dy21             = _mm_sub_pd(iy2,jy1);
260             dz21             = _mm_sub_pd(iz2,jz1);
261             dx22             = _mm_sub_pd(ix2,jx2);
262             dy22             = _mm_sub_pd(iy2,jy2);
263             dz22             = _mm_sub_pd(iz2,jz2);
264             dx23             = _mm_sub_pd(ix2,jx3);
265             dy23             = _mm_sub_pd(iy2,jy3);
266             dz23             = _mm_sub_pd(iz2,jz3);
267             dx31             = _mm_sub_pd(ix3,jx1);
268             dy31             = _mm_sub_pd(iy3,jy1);
269             dz31             = _mm_sub_pd(iz3,jz1);
270             dx32             = _mm_sub_pd(ix3,jx2);
271             dy32             = _mm_sub_pd(iy3,jy2);
272             dz32             = _mm_sub_pd(iz3,jz2);
273             dx33             = _mm_sub_pd(ix3,jx3);
274             dy33             = _mm_sub_pd(iy3,jy3);
275             dz33             = _mm_sub_pd(iz3,jz3);
276
277             /* Calculate squared distance and things based on it */
278             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
279             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
280             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
281             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
282             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
283             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
284             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
285             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
286             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
287             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
288
289             rinv00           = gmx_mm_invsqrt_pd(rsq00);
290             rinv11           = gmx_mm_invsqrt_pd(rsq11);
291             rinv12           = gmx_mm_invsqrt_pd(rsq12);
292             rinv13           = gmx_mm_invsqrt_pd(rsq13);
293             rinv21           = gmx_mm_invsqrt_pd(rsq21);
294             rinv22           = gmx_mm_invsqrt_pd(rsq22);
295             rinv23           = gmx_mm_invsqrt_pd(rsq23);
296             rinv31           = gmx_mm_invsqrt_pd(rsq31);
297             rinv32           = gmx_mm_invsqrt_pd(rsq32);
298             rinv33           = gmx_mm_invsqrt_pd(rsq33);
299
300             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
301             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
302             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
303             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
304             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
305             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
306             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
307             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
308             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
309             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
310
311             fjx0             = _mm_setzero_pd();
312             fjy0             = _mm_setzero_pd();
313             fjz0             = _mm_setzero_pd();
314             fjx1             = _mm_setzero_pd();
315             fjy1             = _mm_setzero_pd();
316             fjz1             = _mm_setzero_pd();
317             fjx2             = _mm_setzero_pd();
318             fjy2             = _mm_setzero_pd();
319             fjz2             = _mm_setzero_pd();
320             fjx3             = _mm_setzero_pd();
321             fjy3             = _mm_setzero_pd();
322             fjz3             = _mm_setzero_pd();
323
324             /**************************
325              * CALCULATE INTERACTIONS *
326              **************************/
327
328             if (gmx_mm_any_lt(rsq00,rcutoff2))
329             {
330
331             r00              = _mm_mul_pd(rsq00,rinv00);
332
333             /* LENNARD-JONES DISPERSION/REPULSION */
334
335             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
336             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
337             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
338             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
339             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
340
341             d                = _mm_sub_pd(r00,rswitch);
342             d                = _mm_max_pd(d,_mm_setzero_pd());
343             d2               = _mm_mul_pd(d,d);
344             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
345
346             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
347
348             /* Evaluate switch function */
349             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
350             fvdw             = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
351             vvdw             = _mm_mul_pd(vvdw,sw);
352             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
353
354             /* Update potential sum for this i atom from the interaction with this j atom. */
355             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
356             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
357
358             fscal            = fvdw;
359
360             fscal            = _mm_and_pd(fscal,cutoff_mask);
361
362             /* Calculate temporary vectorial force */
363             tx               = _mm_mul_pd(fscal,dx00);
364             ty               = _mm_mul_pd(fscal,dy00);
365             tz               = _mm_mul_pd(fscal,dz00);
366
367             /* Update vectorial force */
368             fix0             = _mm_add_pd(fix0,tx);
369             fiy0             = _mm_add_pd(fiy0,ty);
370             fiz0             = _mm_add_pd(fiz0,tz);
371
372             fjx0             = _mm_add_pd(fjx0,tx);
373             fjy0             = _mm_add_pd(fjy0,ty);
374             fjz0             = _mm_add_pd(fjz0,tz);
375
376             }
377
378             /**************************
379              * CALCULATE INTERACTIONS *
380              **************************/
381
382             if (gmx_mm_any_lt(rsq11,rcutoff2))
383             {
384
385             r11              = _mm_mul_pd(rsq11,rinv11);
386
387             /* EWALD ELECTROSTATICS */
388
389             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
390             ewrt             = _mm_mul_pd(r11,ewtabscale);
391             ewitab           = _mm_cvttpd_epi32(ewrt);
392             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
393             ewitab           = _mm_slli_epi32(ewitab,2);
394             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
395             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
396             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
397             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
398             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
399             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
400             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
401             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
402             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
403             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
404
405             d                = _mm_sub_pd(r11,rswitch);
406             d                = _mm_max_pd(d,_mm_setzero_pd());
407             d2               = _mm_mul_pd(d,d);
408             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
409
410             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
411
412             /* Evaluate switch function */
413             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
414             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
415             velec            = _mm_mul_pd(velec,sw);
416             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
417
418             /* Update potential sum for this i atom from the interaction with this j atom. */
419             velec            = _mm_and_pd(velec,cutoff_mask);
420             velecsum         = _mm_add_pd(velecsum,velec);
421
422             fscal            = felec;
423
424             fscal            = _mm_and_pd(fscal,cutoff_mask);
425
426             /* Calculate temporary vectorial force */
427             tx               = _mm_mul_pd(fscal,dx11);
428             ty               = _mm_mul_pd(fscal,dy11);
429             tz               = _mm_mul_pd(fscal,dz11);
430
431             /* Update vectorial force */
432             fix1             = _mm_add_pd(fix1,tx);
433             fiy1             = _mm_add_pd(fiy1,ty);
434             fiz1             = _mm_add_pd(fiz1,tz);
435
436             fjx1             = _mm_add_pd(fjx1,tx);
437             fjy1             = _mm_add_pd(fjy1,ty);
438             fjz1             = _mm_add_pd(fjz1,tz);
439
440             }
441
442             /**************************
443              * CALCULATE INTERACTIONS *
444              **************************/
445
446             if (gmx_mm_any_lt(rsq12,rcutoff2))
447             {
448
449             r12              = _mm_mul_pd(rsq12,rinv12);
450
451             /* EWALD ELECTROSTATICS */
452
453             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
454             ewrt             = _mm_mul_pd(r12,ewtabscale);
455             ewitab           = _mm_cvttpd_epi32(ewrt);
456             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
457             ewitab           = _mm_slli_epi32(ewitab,2);
458             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
459             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
460             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
461             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
462             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
463             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
464             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
465             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
466             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
467             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
468
469             d                = _mm_sub_pd(r12,rswitch);
470             d                = _mm_max_pd(d,_mm_setzero_pd());
471             d2               = _mm_mul_pd(d,d);
472             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
473
474             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
475
476             /* Evaluate switch function */
477             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
478             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
479             velec            = _mm_mul_pd(velec,sw);
480             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
481
482             /* Update potential sum for this i atom from the interaction with this j atom. */
483             velec            = _mm_and_pd(velec,cutoff_mask);
484             velecsum         = _mm_add_pd(velecsum,velec);
485
486             fscal            = felec;
487
488             fscal            = _mm_and_pd(fscal,cutoff_mask);
489
490             /* Calculate temporary vectorial force */
491             tx               = _mm_mul_pd(fscal,dx12);
492             ty               = _mm_mul_pd(fscal,dy12);
493             tz               = _mm_mul_pd(fscal,dz12);
494
495             /* Update vectorial force */
496             fix1             = _mm_add_pd(fix1,tx);
497             fiy1             = _mm_add_pd(fiy1,ty);
498             fiz1             = _mm_add_pd(fiz1,tz);
499
500             fjx2             = _mm_add_pd(fjx2,tx);
501             fjy2             = _mm_add_pd(fjy2,ty);
502             fjz2             = _mm_add_pd(fjz2,tz);
503
504             }
505
506             /**************************
507              * CALCULATE INTERACTIONS *
508              **************************/
509
510             if (gmx_mm_any_lt(rsq13,rcutoff2))
511             {
512
513             r13              = _mm_mul_pd(rsq13,rinv13);
514
515             /* EWALD ELECTROSTATICS */
516
517             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
518             ewrt             = _mm_mul_pd(r13,ewtabscale);
519             ewitab           = _mm_cvttpd_epi32(ewrt);
520             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
521             ewitab           = _mm_slli_epi32(ewitab,2);
522             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
523             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
524             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
525             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
526             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
527             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
528             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
529             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
530             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
531             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
532
533             d                = _mm_sub_pd(r13,rswitch);
534             d                = _mm_max_pd(d,_mm_setzero_pd());
535             d2               = _mm_mul_pd(d,d);
536             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
537
538             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
539
540             /* Evaluate switch function */
541             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
542             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
543             velec            = _mm_mul_pd(velec,sw);
544             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
545
546             /* Update potential sum for this i atom from the interaction with this j atom. */
547             velec            = _mm_and_pd(velec,cutoff_mask);
548             velecsum         = _mm_add_pd(velecsum,velec);
549
550             fscal            = felec;
551
552             fscal            = _mm_and_pd(fscal,cutoff_mask);
553
554             /* Calculate temporary vectorial force */
555             tx               = _mm_mul_pd(fscal,dx13);
556             ty               = _mm_mul_pd(fscal,dy13);
557             tz               = _mm_mul_pd(fscal,dz13);
558
559             /* Update vectorial force */
560             fix1             = _mm_add_pd(fix1,tx);
561             fiy1             = _mm_add_pd(fiy1,ty);
562             fiz1             = _mm_add_pd(fiz1,tz);
563
564             fjx3             = _mm_add_pd(fjx3,tx);
565             fjy3             = _mm_add_pd(fjy3,ty);
566             fjz3             = _mm_add_pd(fjz3,tz);
567
568             }
569
570             /**************************
571              * CALCULATE INTERACTIONS *
572              **************************/
573
574             if (gmx_mm_any_lt(rsq21,rcutoff2))
575             {
576
577             r21              = _mm_mul_pd(rsq21,rinv21);
578
579             /* EWALD ELECTROSTATICS */
580
581             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
582             ewrt             = _mm_mul_pd(r21,ewtabscale);
583             ewitab           = _mm_cvttpd_epi32(ewrt);
584             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
585             ewitab           = _mm_slli_epi32(ewitab,2);
586             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
587             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
588             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
589             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
590             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
591             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
592             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
593             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
594             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
595             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
596
597             d                = _mm_sub_pd(r21,rswitch);
598             d                = _mm_max_pd(d,_mm_setzero_pd());
599             d2               = _mm_mul_pd(d,d);
600             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
601
602             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
603
604             /* Evaluate switch function */
605             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
606             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
607             velec            = _mm_mul_pd(velec,sw);
608             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
609
610             /* Update potential sum for this i atom from the interaction with this j atom. */
611             velec            = _mm_and_pd(velec,cutoff_mask);
612             velecsum         = _mm_add_pd(velecsum,velec);
613
614             fscal            = felec;
615
616             fscal            = _mm_and_pd(fscal,cutoff_mask);
617
618             /* Calculate temporary vectorial force */
619             tx               = _mm_mul_pd(fscal,dx21);
620             ty               = _mm_mul_pd(fscal,dy21);
621             tz               = _mm_mul_pd(fscal,dz21);
622
623             /* Update vectorial force */
624             fix2             = _mm_add_pd(fix2,tx);
625             fiy2             = _mm_add_pd(fiy2,ty);
626             fiz2             = _mm_add_pd(fiz2,tz);
627
628             fjx1             = _mm_add_pd(fjx1,tx);
629             fjy1             = _mm_add_pd(fjy1,ty);
630             fjz1             = _mm_add_pd(fjz1,tz);
631
632             }
633
634             /**************************
635              * CALCULATE INTERACTIONS *
636              **************************/
637
638             if (gmx_mm_any_lt(rsq22,rcutoff2))
639             {
640
641             r22              = _mm_mul_pd(rsq22,rinv22);
642
643             /* EWALD ELECTROSTATICS */
644
645             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
646             ewrt             = _mm_mul_pd(r22,ewtabscale);
647             ewitab           = _mm_cvttpd_epi32(ewrt);
648             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
649             ewitab           = _mm_slli_epi32(ewitab,2);
650             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
651             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
652             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
653             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
654             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
655             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
656             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
657             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
658             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
659             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
660
661             d                = _mm_sub_pd(r22,rswitch);
662             d                = _mm_max_pd(d,_mm_setzero_pd());
663             d2               = _mm_mul_pd(d,d);
664             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
665
666             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
667
668             /* Evaluate switch function */
669             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
670             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
671             velec            = _mm_mul_pd(velec,sw);
672             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
673
674             /* Update potential sum for this i atom from the interaction with this j atom. */
675             velec            = _mm_and_pd(velec,cutoff_mask);
676             velecsum         = _mm_add_pd(velecsum,velec);
677
678             fscal            = felec;
679
680             fscal            = _mm_and_pd(fscal,cutoff_mask);
681
682             /* Calculate temporary vectorial force */
683             tx               = _mm_mul_pd(fscal,dx22);
684             ty               = _mm_mul_pd(fscal,dy22);
685             tz               = _mm_mul_pd(fscal,dz22);
686
687             /* Update vectorial force */
688             fix2             = _mm_add_pd(fix2,tx);
689             fiy2             = _mm_add_pd(fiy2,ty);
690             fiz2             = _mm_add_pd(fiz2,tz);
691
692             fjx2             = _mm_add_pd(fjx2,tx);
693             fjy2             = _mm_add_pd(fjy2,ty);
694             fjz2             = _mm_add_pd(fjz2,tz);
695
696             }
697
698             /**************************
699              * CALCULATE INTERACTIONS *
700              **************************/
701
702             if (gmx_mm_any_lt(rsq23,rcutoff2))
703             {
704
705             r23              = _mm_mul_pd(rsq23,rinv23);
706
707             /* EWALD ELECTROSTATICS */
708
709             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
710             ewrt             = _mm_mul_pd(r23,ewtabscale);
711             ewitab           = _mm_cvttpd_epi32(ewrt);
712             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
713             ewitab           = _mm_slli_epi32(ewitab,2);
714             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
715             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
716             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
717             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
718             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
719             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
720             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
721             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
722             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
723             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
724
725             d                = _mm_sub_pd(r23,rswitch);
726             d                = _mm_max_pd(d,_mm_setzero_pd());
727             d2               = _mm_mul_pd(d,d);
728             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
729
730             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
731
732             /* Evaluate switch function */
733             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
734             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
735             velec            = _mm_mul_pd(velec,sw);
736             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
737
738             /* Update potential sum for this i atom from the interaction with this j atom. */
739             velec            = _mm_and_pd(velec,cutoff_mask);
740             velecsum         = _mm_add_pd(velecsum,velec);
741
742             fscal            = felec;
743
744             fscal            = _mm_and_pd(fscal,cutoff_mask);
745
746             /* Calculate temporary vectorial force */
747             tx               = _mm_mul_pd(fscal,dx23);
748             ty               = _mm_mul_pd(fscal,dy23);
749             tz               = _mm_mul_pd(fscal,dz23);
750
751             /* Update vectorial force */
752             fix2             = _mm_add_pd(fix2,tx);
753             fiy2             = _mm_add_pd(fiy2,ty);
754             fiz2             = _mm_add_pd(fiz2,tz);
755
756             fjx3             = _mm_add_pd(fjx3,tx);
757             fjy3             = _mm_add_pd(fjy3,ty);
758             fjz3             = _mm_add_pd(fjz3,tz);
759
760             }
761
762             /**************************
763              * CALCULATE INTERACTIONS *
764              **************************/
765
766             if (gmx_mm_any_lt(rsq31,rcutoff2))
767             {
768
769             r31              = _mm_mul_pd(rsq31,rinv31);
770
771             /* EWALD ELECTROSTATICS */
772
773             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
774             ewrt             = _mm_mul_pd(r31,ewtabscale);
775             ewitab           = _mm_cvttpd_epi32(ewrt);
776             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
777             ewitab           = _mm_slli_epi32(ewitab,2);
778             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
779             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
780             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
781             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
782             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
783             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
784             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
785             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
786             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
787             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
788
789             d                = _mm_sub_pd(r31,rswitch);
790             d                = _mm_max_pd(d,_mm_setzero_pd());
791             d2               = _mm_mul_pd(d,d);
792             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
793
794             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
795
796             /* Evaluate switch function */
797             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
798             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
799             velec            = _mm_mul_pd(velec,sw);
800             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
801
802             /* Update potential sum for this i atom from the interaction with this j atom. */
803             velec            = _mm_and_pd(velec,cutoff_mask);
804             velecsum         = _mm_add_pd(velecsum,velec);
805
806             fscal            = felec;
807
808             fscal            = _mm_and_pd(fscal,cutoff_mask);
809
810             /* Calculate temporary vectorial force */
811             tx               = _mm_mul_pd(fscal,dx31);
812             ty               = _mm_mul_pd(fscal,dy31);
813             tz               = _mm_mul_pd(fscal,dz31);
814
815             /* Update vectorial force */
816             fix3             = _mm_add_pd(fix3,tx);
817             fiy3             = _mm_add_pd(fiy3,ty);
818             fiz3             = _mm_add_pd(fiz3,tz);
819
820             fjx1             = _mm_add_pd(fjx1,tx);
821             fjy1             = _mm_add_pd(fjy1,ty);
822             fjz1             = _mm_add_pd(fjz1,tz);
823
824             }
825
826             /**************************
827              * CALCULATE INTERACTIONS *
828              **************************/
829
830             if (gmx_mm_any_lt(rsq32,rcutoff2))
831             {
832
833             r32              = _mm_mul_pd(rsq32,rinv32);
834
835             /* EWALD ELECTROSTATICS */
836
837             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
838             ewrt             = _mm_mul_pd(r32,ewtabscale);
839             ewitab           = _mm_cvttpd_epi32(ewrt);
840             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
841             ewitab           = _mm_slli_epi32(ewitab,2);
842             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
843             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
844             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
845             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
846             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
847             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
848             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
849             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
850             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
851             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
852
853             d                = _mm_sub_pd(r32,rswitch);
854             d                = _mm_max_pd(d,_mm_setzero_pd());
855             d2               = _mm_mul_pd(d,d);
856             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
857
858             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
859
860             /* Evaluate switch function */
861             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
862             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
863             velec            = _mm_mul_pd(velec,sw);
864             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
865
866             /* Update potential sum for this i atom from the interaction with this j atom. */
867             velec            = _mm_and_pd(velec,cutoff_mask);
868             velecsum         = _mm_add_pd(velecsum,velec);
869
870             fscal            = felec;
871
872             fscal            = _mm_and_pd(fscal,cutoff_mask);
873
874             /* Calculate temporary vectorial force */
875             tx               = _mm_mul_pd(fscal,dx32);
876             ty               = _mm_mul_pd(fscal,dy32);
877             tz               = _mm_mul_pd(fscal,dz32);
878
879             /* Update vectorial force */
880             fix3             = _mm_add_pd(fix3,tx);
881             fiy3             = _mm_add_pd(fiy3,ty);
882             fiz3             = _mm_add_pd(fiz3,tz);
883
884             fjx2             = _mm_add_pd(fjx2,tx);
885             fjy2             = _mm_add_pd(fjy2,ty);
886             fjz2             = _mm_add_pd(fjz2,tz);
887
888             }
889
890             /**************************
891              * CALCULATE INTERACTIONS *
892              **************************/
893
894             if (gmx_mm_any_lt(rsq33,rcutoff2))
895             {
896
897             r33              = _mm_mul_pd(rsq33,rinv33);
898
899             /* EWALD ELECTROSTATICS */
900
901             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
902             ewrt             = _mm_mul_pd(r33,ewtabscale);
903             ewitab           = _mm_cvttpd_epi32(ewrt);
904             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
905             ewitab           = _mm_slli_epi32(ewitab,2);
906             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
907             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
908             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
909             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
910             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
911             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
912             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
913             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
914             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
915             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
916
917             d                = _mm_sub_pd(r33,rswitch);
918             d                = _mm_max_pd(d,_mm_setzero_pd());
919             d2               = _mm_mul_pd(d,d);
920             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
921
922             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
923
924             /* Evaluate switch function */
925             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
926             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
927             velec            = _mm_mul_pd(velec,sw);
928             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
929
930             /* Update potential sum for this i atom from the interaction with this j atom. */
931             velec            = _mm_and_pd(velec,cutoff_mask);
932             velecsum         = _mm_add_pd(velecsum,velec);
933
934             fscal            = felec;
935
936             fscal            = _mm_and_pd(fscal,cutoff_mask);
937
938             /* Calculate temporary vectorial force */
939             tx               = _mm_mul_pd(fscal,dx33);
940             ty               = _mm_mul_pd(fscal,dy33);
941             tz               = _mm_mul_pd(fscal,dz33);
942
943             /* Update vectorial force */
944             fix3             = _mm_add_pd(fix3,tx);
945             fiy3             = _mm_add_pd(fiy3,ty);
946             fiz3             = _mm_add_pd(fiz3,tz);
947
948             fjx3             = _mm_add_pd(fjx3,tx);
949             fjy3             = _mm_add_pd(fjy3,ty);
950             fjz3             = _mm_add_pd(fjz3,tz);
951
952             }
953
954             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
955
956             /* Inner loop uses 647 flops */
957         }
958
959         if(jidx<j_index_end)
960         {
961
962             jnrA             = jjnr[jidx];
963             j_coord_offsetA  = DIM*jnrA;
964
965             /* load j atom coordinates */
966             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
967                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
968                                               &jy2,&jz2,&jx3,&jy3,&jz3);
969
970             /* Calculate displacement vector */
971             dx00             = _mm_sub_pd(ix0,jx0);
972             dy00             = _mm_sub_pd(iy0,jy0);
973             dz00             = _mm_sub_pd(iz0,jz0);
974             dx11             = _mm_sub_pd(ix1,jx1);
975             dy11             = _mm_sub_pd(iy1,jy1);
976             dz11             = _mm_sub_pd(iz1,jz1);
977             dx12             = _mm_sub_pd(ix1,jx2);
978             dy12             = _mm_sub_pd(iy1,jy2);
979             dz12             = _mm_sub_pd(iz1,jz2);
980             dx13             = _mm_sub_pd(ix1,jx3);
981             dy13             = _mm_sub_pd(iy1,jy3);
982             dz13             = _mm_sub_pd(iz1,jz3);
983             dx21             = _mm_sub_pd(ix2,jx1);
984             dy21             = _mm_sub_pd(iy2,jy1);
985             dz21             = _mm_sub_pd(iz2,jz1);
986             dx22             = _mm_sub_pd(ix2,jx2);
987             dy22             = _mm_sub_pd(iy2,jy2);
988             dz22             = _mm_sub_pd(iz2,jz2);
989             dx23             = _mm_sub_pd(ix2,jx3);
990             dy23             = _mm_sub_pd(iy2,jy3);
991             dz23             = _mm_sub_pd(iz2,jz3);
992             dx31             = _mm_sub_pd(ix3,jx1);
993             dy31             = _mm_sub_pd(iy3,jy1);
994             dz31             = _mm_sub_pd(iz3,jz1);
995             dx32             = _mm_sub_pd(ix3,jx2);
996             dy32             = _mm_sub_pd(iy3,jy2);
997             dz32             = _mm_sub_pd(iz3,jz2);
998             dx33             = _mm_sub_pd(ix3,jx3);
999             dy33             = _mm_sub_pd(iy3,jy3);
1000             dz33             = _mm_sub_pd(iz3,jz3);
1001
1002             /* Calculate squared distance and things based on it */
1003             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1004             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1005             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1006             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1007             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1008             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1009             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1010             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1011             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1012             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1013
1014             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1015             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1016             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1017             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1018             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1019             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1020             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1021             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1022             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1023             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1024
1025             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1026             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1027             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1028             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1029             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1030             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1031             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1032             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1033             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1034             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1035
1036             fjx0             = _mm_setzero_pd();
1037             fjy0             = _mm_setzero_pd();
1038             fjz0             = _mm_setzero_pd();
1039             fjx1             = _mm_setzero_pd();
1040             fjy1             = _mm_setzero_pd();
1041             fjz1             = _mm_setzero_pd();
1042             fjx2             = _mm_setzero_pd();
1043             fjy2             = _mm_setzero_pd();
1044             fjz2             = _mm_setzero_pd();
1045             fjx3             = _mm_setzero_pd();
1046             fjy3             = _mm_setzero_pd();
1047             fjz3             = _mm_setzero_pd();
1048
1049             /**************************
1050              * CALCULATE INTERACTIONS *
1051              **************************/
1052
1053             if (gmx_mm_any_lt(rsq00,rcutoff2))
1054             {
1055
1056             r00              = _mm_mul_pd(rsq00,rinv00);
1057
1058             /* LENNARD-JONES DISPERSION/REPULSION */
1059
1060             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1061             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
1062             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1063             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1064             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1065
1066             d                = _mm_sub_pd(r00,rswitch);
1067             d                = _mm_max_pd(d,_mm_setzero_pd());
1068             d2               = _mm_mul_pd(d,d);
1069             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1070
1071             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1072
1073             /* Evaluate switch function */
1074             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1075             fvdw             = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1076             vvdw             = _mm_mul_pd(vvdw,sw);
1077             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
1078
1079             /* Update potential sum for this i atom from the interaction with this j atom. */
1080             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
1081             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
1082             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
1083
1084             fscal            = fvdw;
1085
1086             fscal            = _mm_and_pd(fscal,cutoff_mask);
1087
1088             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1089
1090             /* Calculate temporary vectorial force */
1091             tx               = _mm_mul_pd(fscal,dx00);
1092             ty               = _mm_mul_pd(fscal,dy00);
1093             tz               = _mm_mul_pd(fscal,dz00);
1094
1095             /* Update vectorial force */
1096             fix0             = _mm_add_pd(fix0,tx);
1097             fiy0             = _mm_add_pd(fiy0,ty);
1098             fiz0             = _mm_add_pd(fiz0,tz);
1099
1100             fjx0             = _mm_add_pd(fjx0,tx);
1101             fjy0             = _mm_add_pd(fjy0,ty);
1102             fjz0             = _mm_add_pd(fjz0,tz);
1103
1104             }
1105
1106             /**************************
1107              * CALCULATE INTERACTIONS *
1108              **************************/
1109
1110             if (gmx_mm_any_lt(rsq11,rcutoff2))
1111             {
1112
1113             r11              = _mm_mul_pd(rsq11,rinv11);
1114
1115             /* EWALD ELECTROSTATICS */
1116
1117             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1118             ewrt             = _mm_mul_pd(r11,ewtabscale);
1119             ewitab           = _mm_cvttpd_epi32(ewrt);
1120             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1121             ewitab           = _mm_slli_epi32(ewitab,2);
1122             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1123             ewtabD           = _mm_setzero_pd();
1124             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1125             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1126             ewtabFn          = _mm_setzero_pd();
1127             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1128             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1129             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1130             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1131             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1132
1133             d                = _mm_sub_pd(r11,rswitch);
1134             d                = _mm_max_pd(d,_mm_setzero_pd());
1135             d2               = _mm_mul_pd(d,d);
1136             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1137
1138             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1139
1140             /* Evaluate switch function */
1141             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1142             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1143             velec            = _mm_mul_pd(velec,sw);
1144             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1145
1146             /* Update potential sum for this i atom from the interaction with this j atom. */
1147             velec            = _mm_and_pd(velec,cutoff_mask);
1148             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1149             velecsum         = _mm_add_pd(velecsum,velec);
1150
1151             fscal            = felec;
1152
1153             fscal            = _mm_and_pd(fscal,cutoff_mask);
1154
1155             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1156
1157             /* Calculate temporary vectorial force */
1158             tx               = _mm_mul_pd(fscal,dx11);
1159             ty               = _mm_mul_pd(fscal,dy11);
1160             tz               = _mm_mul_pd(fscal,dz11);
1161
1162             /* Update vectorial force */
1163             fix1             = _mm_add_pd(fix1,tx);
1164             fiy1             = _mm_add_pd(fiy1,ty);
1165             fiz1             = _mm_add_pd(fiz1,tz);
1166
1167             fjx1             = _mm_add_pd(fjx1,tx);
1168             fjy1             = _mm_add_pd(fjy1,ty);
1169             fjz1             = _mm_add_pd(fjz1,tz);
1170
1171             }
1172
1173             /**************************
1174              * CALCULATE INTERACTIONS *
1175              **************************/
1176
1177             if (gmx_mm_any_lt(rsq12,rcutoff2))
1178             {
1179
1180             r12              = _mm_mul_pd(rsq12,rinv12);
1181
1182             /* EWALD ELECTROSTATICS */
1183
1184             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1185             ewrt             = _mm_mul_pd(r12,ewtabscale);
1186             ewitab           = _mm_cvttpd_epi32(ewrt);
1187             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1188             ewitab           = _mm_slli_epi32(ewitab,2);
1189             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1190             ewtabD           = _mm_setzero_pd();
1191             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1192             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1193             ewtabFn          = _mm_setzero_pd();
1194             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1195             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1196             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1197             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1198             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1199
1200             d                = _mm_sub_pd(r12,rswitch);
1201             d                = _mm_max_pd(d,_mm_setzero_pd());
1202             d2               = _mm_mul_pd(d,d);
1203             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1204
1205             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1206
1207             /* Evaluate switch function */
1208             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1209             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1210             velec            = _mm_mul_pd(velec,sw);
1211             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1212
1213             /* Update potential sum for this i atom from the interaction with this j atom. */
1214             velec            = _mm_and_pd(velec,cutoff_mask);
1215             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1216             velecsum         = _mm_add_pd(velecsum,velec);
1217
1218             fscal            = felec;
1219
1220             fscal            = _mm_and_pd(fscal,cutoff_mask);
1221
1222             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1223
1224             /* Calculate temporary vectorial force */
1225             tx               = _mm_mul_pd(fscal,dx12);
1226             ty               = _mm_mul_pd(fscal,dy12);
1227             tz               = _mm_mul_pd(fscal,dz12);
1228
1229             /* Update vectorial force */
1230             fix1             = _mm_add_pd(fix1,tx);
1231             fiy1             = _mm_add_pd(fiy1,ty);
1232             fiz1             = _mm_add_pd(fiz1,tz);
1233
1234             fjx2             = _mm_add_pd(fjx2,tx);
1235             fjy2             = _mm_add_pd(fjy2,ty);
1236             fjz2             = _mm_add_pd(fjz2,tz);
1237
1238             }
1239
1240             /**************************
1241              * CALCULATE INTERACTIONS *
1242              **************************/
1243
1244             if (gmx_mm_any_lt(rsq13,rcutoff2))
1245             {
1246
1247             r13              = _mm_mul_pd(rsq13,rinv13);
1248
1249             /* EWALD ELECTROSTATICS */
1250
1251             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1252             ewrt             = _mm_mul_pd(r13,ewtabscale);
1253             ewitab           = _mm_cvttpd_epi32(ewrt);
1254             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1255             ewitab           = _mm_slli_epi32(ewitab,2);
1256             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1257             ewtabD           = _mm_setzero_pd();
1258             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1259             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1260             ewtabFn          = _mm_setzero_pd();
1261             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1262             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1263             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1264             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1265             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1266
1267             d                = _mm_sub_pd(r13,rswitch);
1268             d                = _mm_max_pd(d,_mm_setzero_pd());
1269             d2               = _mm_mul_pd(d,d);
1270             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1271
1272             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1273
1274             /* Evaluate switch function */
1275             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1276             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
1277             velec            = _mm_mul_pd(velec,sw);
1278             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
1279
1280             /* Update potential sum for this i atom from the interaction with this j atom. */
1281             velec            = _mm_and_pd(velec,cutoff_mask);
1282             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1283             velecsum         = _mm_add_pd(velecsum,velec);
1284
1285             fscal            = felec;
1286
1287             fscal            = _mm_and_pd(fscal,cutoff_mask);
1288
1289             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1290
1291             /* Calculate temporary vectorial force */
1292             tx               = _mm_mul_pd(fscal,dx13);
1293             ty               = _mm_mul_pd(fscal,dy13);
1294             tz               = _mm_mul_pd(fscal,dz13);
1295
1296             /* Update vectorial force */
1297             fix1             = _mm_add_pd(fix1,tx);
1298             fiy1             = _mm_add_pd(fiy1,ty);
1299             fiz1             = _mm_add_pd(fiz1,tz);
1300
1301             fjx3             = _mm_add_pd(fjx3,tx);
1302             fjy3             = _mm_add_pd(fjy3,ty);
1303             fjz3             = _mm_add_pd(fjz3,tz);
1304
1305             }
1306
1307             /**************************
1308              * CALCULATE INTERACTIONS *
1309              **************************/
1310
1311             if (gmx_mm_any_lt(rsq21,rcutoff2))
1312             {
1313
1314             r21              = _mm_mul_pd(rsq21,rinv21);
1315
1316             /* EWALD ELECTROSTATICS */
1317
1318             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1319             ewrt             = _mm_mul_pd(r21,ewtabscale);
1320             ewitab           = _mm_cvttpd_epi32(ewrt);
1321             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1322             ewitab           = _mm_slli_epi32(ewitab,2);
1323             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1324             ewtabD           = _mm_setzero_pd();
1325             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1326             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1327             ewtabFn          = _mm_setzero_pd();
1328             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1329             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1330             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1331             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1332             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1333
1334             d                = _mm_sub_pd(r21,rswitch);
1335             d                = _mm_max_pd(d,_mm_setzero_pd());
1336             d2               = _mm_mul_pd(d,d);
1337             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1338
1339             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1340
1341             /* Evaluate switch function */
1342             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1343             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1344             velec            = _mm_mul_pd(velec,sw);
1345             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1346
1347             /* Update potential sum for this i atom from the interaction with this j atom. */
1348             velec            = _mm_and_pd(velec,cutoff_mask);
1349             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1350             velecsum         = _mm_add_pd(velecsum,velec);
1351
1352             fscal            = felec;
1353
1354             fscal            = _mm_and_pd(fscal,cutoff_mask);
1355
1356             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1357
1358             /* Calculate temporary vectorial force */
1359             tx               = _mm_mul_pd(fscal,dx21);
1360             ty               = _mm_mul_pd(fscal,dy21);
1361             tz               = _mm_mul_pd(fscal,dz21);
1362
1363             /* Update vectorial force */
1364             fix2             = _mm_add_pd(fix2,tx);
1365             fiy2             = _mm_add_pd(fiy2,ty);
1366             fiz2             = _mm_add_pd(fiz2,tz);
1367
1368             fjx1             = _mm_add_pd(fjx1,tx);
1369             fjy1             = _mm_add_pd(fjy1,ty);
1370             fjz1             = _mm_add_pd(fjz1,tz);
1371
1372             }
1373
1374             /**************************
1375              * CALCULATE INTERACTIONS *
1376              **************************/
1377
1378             if (gmx_mm_any_lt(rsq22,rcutoff2))
1379             {
1380
1381             r22              = _mm_mul_pd(rsq22,rinv22);
1382
1383             /* EWALD ELECTROSTATICS */
1384
1385             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1386             ewrt             = _mm_mul_pd(r22,ewtabscale);
1387             ewitab           = _mm_cvttpd_epi32(ewrt);
1388             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1389             ewitab           = _mm_slli_epi32(ewitab,2);
1390             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1391             ewtabD           = _mm_setzero_pd();
1392             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1393             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1394             ewtabFn          = _mm_setzero_pd();
1395             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1396             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1397             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1398             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1399             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1400
1401             d                = _mm_sub_pd(r22,rswitch);
1402             d                = _mm_max_pd(d,_mm_setzero_pd());
1403             d2               = _mm_mul_pd(d,d);
1404             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1405
1406             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1407
1408             /* Evaluate switch function */
1409             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1410             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1411             velec            = _mm_mul_pd(velec,sw);
1412             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1413
1414             /* Update potential sum for this i atom from the interaction with this j atom. */
1415             velec            = _mm_and_pd(velec,cutoff_mask);
1416             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1417             velecsum         = _mm_add_pd(velecsum,velec);
1418
1419             fscal            = felec;
1420
1421             fscal            = _mm_and_pd(fscal,cutoff_mask);
1422
1423             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1424
1425             /* Calculate temporary vectorial force */
1426             tx               = _mm_mul_pd(fscal,dx22);
1427             ty               = _mm_mul_pd(fscal,dy22);
1428             tz               = _mm_mul_pd(fscal,dz22);
1429
1430             /* Update vectorial force */
1431             fix2             = _mm_add_pd(fix2,tx);
1432             fiy2             = _mm_add_pd(fiy2,ty);
1433             fiz2             = _mm_add_pd(fiz2,tz);
1434
1435             fjx2             = _mm_add_pd(fjx2,tx);
1436             fjy2             = _mm_add_pd(fjy2,ty);
1437             fjz2             = _mm_add_pd(fjz2,tz);
1438
1439             }
1440
1441             /**************************
1442              * CALCULATE INTERACTIONS *
1443              **************************/
1444
1445             if (gmx_mm_any_lt(rsq23,rcutoff2))
1446             {
1447
1448             r23              = _mm_mul_pd(rsq23,rinv23);
1449
1450             /* EWALD ELECTROSTATICS */
1451
1452             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1453             ewrt             = _mm_mul_pd(r23,ewtabscale);
1454             ewitab           = _mm_cvttpd_epi32(ewrt);
1455             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1456             ewitab           = _mm_slli_epi32(ewitab,2);
1457             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1458             ewtabD           = _mm_setzero_pd();
1459             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1460             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1461             ewtabFn          = _mm_setzero_pd();
1462             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1463             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1464             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1465             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1466             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1467
1468             d                = _mm_sub_pd(r23,rswitch);
1469             d                = _mm_max_pd(d,_mm_setzero_pd());
1470             d2               = _mm_mul_pd(d,d);
1471             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1472
1473             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1474
1475             /* Evaluate switch function */
1476             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1477             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
1478             velec            = _mm_mul_pd(velec,sw);
1479             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
1480
1481             /* Update potential sum for this i atom from the interaction with this j atom. */
1482             velec            = _mm_and_pd(velec,cutoff_mask);
1483             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1484             velecsum         = _mm_add_pd(velecsum,velec);
1485
1486             fscal            = felec;
1487
1488             fscal            = _mm_and_pd(fscal,cutoff_mask);
1489
1490             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1491
1492             /* Calculate temporary vectorial force */
1493             tx               = _mm_mul_pd(fscal,dx23);
1494             ty               = _mm_mul_pd(fscal,dy23);
1495             tz               = _mm_mul_pd(fscal,dz23);
1496
1497             /* Update vectorial force */
1498             fix2             = _mm_add_pd(fix2,tx);
1499             fiy2             = _mm_add_pd(fiy2,ty);
1500             fiz2             = _mm_add_pd(fiz2,tz);
1501
1502             fjx3             = _mm_add_pd(fjx3,tx);
1503             fjy3             = _mm_add_pd(fjy3,ty);
1504             fjz3             = _mm_add_pd(fjz3,tz);
1505
1506             }
1507
1508             /**************************
1509              * CALCULATE INTERACTIONS *
1510              **************************/
1511
1512             if (gmx_mm_any_lt(rsq31,rcutoff2))
1513             {
1514
1515             r31              = _mm_mul_pd(rsq31,rinv31);
1516
1517             /* EWALD ELECTROSTATICS */
1518
1519             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1520             ewrt             = _mm_mul_pd(r31,ewtabscale);
1521             ewitab           = _mm_cvttpd_epi32(ewrt);
1522             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1523             ewitab           = _mm_slli_epi32(ewitab,2);
1524             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1525             ewtabD           = _mm_setzero_pd();
1526             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1527             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1528             ewtabFn          = _mm_setzero_pd();
1529             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1530             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1531             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1532             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1533             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1534
1535             d                = _mm_sub_pd(r31,rswitch);
1536             d                = _mm_max_pd(d,_mm_setzero_pd());
1537             d2               = _mm_mul_pd(d,d);
1538             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1539
1540             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1541
1542             /* Evaluate switch function */
1543             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1544             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
1545             velec            = _mm_mul_pd(velec,sw);
1546             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
1547
1548             /* Update potential sum for this i atom from the interaction with this j atom. */
1549             velec            = _mm_and_pd(velec,cutoff_mask);
1550             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1551             velecsum         = _mm_add_pd(velecsum,velec);
1552
1553             fscal            = felec;
1554
1555             fscal            = _mm_and_pd(fscal,cutoff_mask);
1556
1557             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1558
1559             /* Calculate temporary vectorial force */
1560             tx               = _mm_mul_pd(fscal,dx31);
1561             ty               = _mm_mul_pd(fscal,dy31);
1562             tz               = _mm_mul_pd(fscal,dz31);
1563
1564             /* Update vectorial force */
1565             fix3             = _mm_add_pd(fix3,tx);
1566             fiy3             = _mm_add_pd(fiy3,ty);
1567             fiz3             = _mm_add_pd(fiz3,tz);
1568
1569             fjx1             = _mm_add_pd(fjx1,tx);
1570             fjy1             = _mm_add_pd(fjy1,ty);
1571             fjz1             = _mm_add_pd(fjz1,tz);
1572
1573             }
1574
1575             /**************************
1576              * CALCULATE INTERACTIONS *
1577              **************************/
1578
1579             if (gmx_mm_any_lt(rsq32,rcutoff2))
1580             {
1581
1582             r32              = _mm_mul_pd(rsq32,rinv32);
1583
1584             /* EWALD ELECTROSTATICS */
1585
1586             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1587             ewrt             = _mm_mul_pd(r32,ewtabscale);
1588             ewitab           = _mm_cvttpd_epi32(ewrt);
1589             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1590             ewitab           = _mm_slli_epi32(ewitab,2);
1591             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1592             ewtabD           = _mm_setzero_pd();
1593             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1594             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1595             ewtabFn          = _mm_setzero_pd();
1596             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1597             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1598             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1599             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1600             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1601
1602             d                = _mm_sub_pd(r32,rswitch);
1603             d                = _mm_max_pd(d,_mm_setzero_pd());
1604             d2               = _mm_mul_pd(d,d);
1605             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1606
1607             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1608
1609             /* Evaluate switch function */
1610             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1611             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
1612             velec            = _mm_mul_pd(velec,sw);
1613             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
1614
1615             /* Update potential sum for this i atom from the interaction with this j atom. */
1616             velec            = _mm_and_pd(velec,cutoff_mask);
1617             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1618             velecsum         = _mm_add_pd(velecsum,velec);
1619
1620             fscal            = felec;
1621
1622             fscal            = _mm_and_pd(fscal,cutoff_mask);
1623
1624             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1625
1626             /* Calculate temporary vectorial force */
1627             tx               = _mm_mul_pd(fscal,dx32);
1628             ty               = _mm_mul_pd(fscal,dy32);
1629             tz               = _mm_mul_pd(fscal,dz32);
1630
1631             /* Update vectorial force */
1632             fix3             = _mm_add_pd(fix3,tx);
1633             fiy3             = _mm_add_pd(fiy3,ty);
1634             fiz3             = _mm_add_pd(fiz3,tz);
1635
1636             fjx2             = _mm_add_pd(fjx2,tx);
1637             fjy2             = _mm_add_pd(fjy2,ty);
1638             fjz2             = _mm_add_pd(fjz2,tz);
1639
1640             }
1641
1642             /**************************
1643              * CALCULATE INTERACTIONS *
1644              **************************/
1645
1646             if (gmx_mm_any_lt(rsq33,rcutoff2))
1647             {
1648
1649             r33              = _mm_mul_pd(rsq33,rinv33);
1650
1651             /* EWALD ELECTROSTATICS */
1652
1653             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1654             ewrt             = _mm_mul_pd(r33,ewtabscale);
1655             ewitab           = _mm_cvttpd_epi32(ewrt);
1656             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1657             ewitab           = _mm_slli_epi32(ewitab,2);
1658             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1659             ewtabD           = _mm_setzero_pd();
1660             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1661             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1662             ewtabFn          = _mm_setzero_pd();
1663             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1664             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1665             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1666             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1667             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1668
1669             d                = _mm_sub_pd(r33,rswitch);
1670             d                = _mm_max_pd(d,_mm_setzero_pd());
1671             d2               = _mm_mul_pd(d,d);
1672             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1673
1674             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1675
1676             /* Evaluate switch function */
1677             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1678             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
1679             velec            = _mm_mul_pd(velec,sw);
1680             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
1681
1682             /* Update potential sum for this i atom from the interaction with this j atom. */
1683             velec            = _mm_and_pd(velec,cutoff_mask);
1684             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1685             velecsum         = _mm_add_pd(velecsum,velec);
1686
1687             fscal            = felec;
1688
1689             fscal            = _mm_and_pd(fscal,cutoff_mask);
1690
1691             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1692
1693             /* Calculate temporary vectorial force */
1694             tx               = _mm_mul_pd(fscal,dx33);
1695             ty               = _mm_mul_pd(fscal,dy33);
1696             tz               = _mm_mul_pd(fscal,dz33);
1697
1698             /* Update vectorial force */
1699             fix3             = _mm_add_pd(fix3,tx);
1700             fiy3             = _mm_add_pd(fiy3,ty);
1701             fiz3             = _mm_add_pd(fiz3,tz);
1702
1703             fjx3             = _mm_add_pd(fjx3,tx);
1704             fjy3             = _mm_add_pd(fjy3,ty);
1705             fjz3             = _mm_add_pd(fjz3,tz);
1706
1707             }
1708
1709             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1710
1711             /* Inner loop uses 647 flops */
1712         }
1713
1714         /* End of innermost loop */
1715
1716         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1717                                               f+i_coord_offset,fshift+i_shift_offset);
1718
1719         ggid                        = gid[iidx];
1720         /* Update potential energies */
1721         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1722         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1723
1724         /* Increment number of inner iterations */
1725         inneriter                  += j_index_end - j_index_start;
1726
1727         /* Outer loop uses 26 flops */
1728     }
1729
1730     /* Increment number of outer iterations */
1731     outeriter        += nri;
1732
1733     /* Update outer/inner flops */
1734
1735     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*647);
1736 }
1737 /*
1738  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_double
1739  * Electrostatics interaction: Ewald
1740  * VdW interaction:            LennardJones
1741  * Geometry:                   Water4-Water4
1742  * Calculate force/pot:        Force
1743  */
1744 void
1745 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_double
1746                     (t_nblist                    * gmx_restrict       nlist,
1747                      rvec                        * gmx_restrict          xx,
1748                      rvec                        * gmx_restrict          ff,
1749                      t_forcerec                  * gmx_restrict          fr,
1750                      t_mdatoms                   * gmx_restrict     mdatoms,
1751                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1752                      t_nrnb                      * gmx_restrict        nrnb)
1753 {
1754     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1755      * just 0 for non-waters.
1756      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1757      * jnr indices corresponding to data put in the four positions in the SIMD register.
1758      */
1759     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1760     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1761     int              jnrA,jnrB;
1762     int              j_coord_offsetA,j_coord_offsetB;
1763     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1764     real             rcutoff_scalar;
1765     real             *shiftvec,*fshift,*x,*f;
1766     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1767     int              vdwioffset0;
1768     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1769     int              vdwioffset1;
1770     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1771     int              vdwioffset2;
1772     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1773     int              vdwioffset3;
1774     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1775     int              vdwjidx0A,vdwjidx0B;
1776     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1777     int              vdwjidx1A,vdwjidx1B;
1778     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1779     int              vdwjidx2A,vdwjidx2B;
1780     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1781     int              vdwjidx3A,vdwjidx3B;
1782     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1783     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1784     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1785     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1786     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1787     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1788     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1789     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1790     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1791     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1792     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1793     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1794     real             *charge;
1795     int              nvdwtype;
1796     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1797     int              *vdwtype;
1798     real             *vdwparam;
1799     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1800     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1801     __m128i          ewitab;
1802     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1803     real             *ewtab;
1804     __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1805     real             rswitch_scalar,d_scalar;
1806     __m128d          dummy_mask,cutoff_mask;
1807     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1808     __m128d          one     = _mm_set1_pd(1.0);
1809     __m128d          two     = _mm_set1_pd(2.0);
1810     x                = xx[0];
1811     f                = ff[0];
1812
1813     nri              = nlist->nri;
1814     iinr             = nlist->iinr;
1815     jindex           = nlist->jindex;
1816     jjnr             = nlist->jjnr;
1817     shiftidx         = nlist->shift;
1818     gid              = nlist->gid;
1819     shiftvec         = fr->shift_vec[0];
1820     fshift           = fr->fshift[0];
1821     facel            = _mm_set1_pd(fr->epsfac);
1822     charge           = mdatoms->chargeA;
1823     nvdwtype         = fr->ntype;
1824     vdwparam         = fr->nbfp;
1825     vdwtype          = mdatoms->typeA;
1826
1827     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
1828     ewtab            = fr->ic->tabq_coul_FDV0;
1829     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
1830     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1831
1832     /* Setup water-specific parameters */
1833     inr              = nlist->iinr[0];
1834     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1835     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1836     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1837     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1838
1839     jq1              = _mm_set1_pd(charge[inr+1]);
1840     jq2              = _mm_set1_pd(charge[inr+2]);
1841     jq3              = _mm_set1_pd(charge[inr+3]);
1842     vdwjidx0A        = 2*vdwtype[inr+0];
1843     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1844     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1845     qq11             = _mm_mul_pd(iq1,jq1);
1846     qq12             = _mm_mul_pd(iq1,jq2);
1847     qq13             = _mm_mul_pd(iq1,jq3);
1848     qq21             = _mm_mul_pd(iq2,jq1);
1849     qq22             = _mm_mul_pd(iq2,jq2);
1850     qq23             = _mm_mul_pd(iq2,jq3);
1851     qq31             = _mm_mul_pd(iq3,jq1);
1852     qq32             = _mm_mul_pd(iq3,jq2);
1853     qq33             = _mm_mul_pd(iq3,jq3);
1854
1855     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1856     rcutoff_scalar   = fr->rcoulomb;
1857     rcutoff          = _mm_set1_pd(rcutoff_scalar);
1858     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
1859
1860     rswitch_scalar   = fr->rcoulomb_switch;
1861     rswitch          = _mm_set1_pd(rswitch_scalar);
1862     /* Setup switch parameters */
1863     d_scalar         = rcutoff_scalar-rswitch_scalar;
1864     d                = _mm_set1_pd(d_scalar);
1865     swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1866     swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1867     swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1868     swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1869     swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1870     swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1871
1872     /* Avoid stupid compiler warnings */
1873     jnrA = jnrB = 0;
1874     j_coord_offsetA = 0;
1875     j_coord_offsetB = 0;
1876
1877     outeriter        = 0;
1878     inneriter        = 0;
1879
1880     /* Start outer loop over neighborlists */
1881     for(iidx=0; iidx<nri; iidx++)
1882     {
1883         /* Load shift vector for this list */
1884         i_shift_offset   = DIM*shiftidx[iidx];
1885
1886         /* Load limits for loop over neighbors */
1887         j_index_start    = jindex[iidx];
1888         j_index_end      = jindex[iidx+1];
1889
1890         /* Get outer coordinate index */
1891         inr              = iinr[iidx];
1892         i_coord_offset   = DIM*inr;
1893
1894         /* Load i particle coords and add shift vector */
1895         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1896                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1897
1898         fix0             = _mm_setzero_pd();
1899         fiy0             = _mm_setzero_pd();
1900         fiz0             = _mm_setzero_pd();
1901         fix1             = _mm_setzero_pd();
1902         fiy1             = _mm_setzero_pd();
1903         fiz1             = _mm_setzero_pd();
1904         fix2             = _mm_setzero_pd();
1905         fiy2             = _mm_setzero_pd();
1906         fiz2             = _mm_setzero_pd();
1907         fix3             = _mm_setzero_pd();
1908         fiy3             = _mm_setzero_pd();
1909         fiz3             = _mm_setzero_pd();
1910
1911         /* Start inner kernel loop */
1912         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1913         {
1914
1915             /* Get j neighbor index, and coordinate index */
1916             jnrA             = jjnr[jidx];
1917             jnrB             = jjnr[jidx+1];
1918             j_coord_offsetA  = DIM*jnrA;
1919             j_coord_offsetB  = DIM*jnrB;
1920
1921             /* load j atom coordinates */
1922             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1923                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1924                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1925
1926             /* Calculate displacement vector */
1927             dx00             = _mm_sub_pd(ix0,jx0);
1928             dy00             = _mm_sub_pd(iy0,jy0);
1929             dz00             = _mm_sub_pd(iz0,jz0);
1930             dx11             = _mm_sub_pd(ix1,jx1);
1931             dy11             = _mm_sub_pd(iy1,jy1);
1932             dz11             = _mm_sub_pd(iz1,jz1);
1933             dx12             = _mm_sub_pd(ix1,jx2);
1934             dy12             = _mm_sub_pd(iy1,jy2);
1935             dz12             = _mm_sub_pd(iz1,jz2);
1936             dx13             = _mm_sub_pd(ix1,jx3);
1937             dy13             = _mm_sub_pd(iy1,jy3);
1938             dz13             = _mm_sub_pd(iz1,jz3);
1939             dx21             = _mm_sub_pd(ix2,jx1);
1940             dy21             = _mm_sub_pd(iy2,jy1);
1941             dz21             = _mm_sub_pd(iz2,jz1);
1942             dx22             = _mm_sub_pd(ix2,jx2);
1943             dy22             = _mm_sub_pd(iy2,jy2);
1944             dz22             = _mm_sub_pd(iz2,jz2);
1945             dx23             = _mm_sub_pd(ix2,jx3);
1946             dy23             = _mm_sub_pd(iy2,jy3);
1947             dz23             = _mm_sub_pd(iz2,jz3);
1948             dx31             = _mm_sub_pd(ix3,jx1);
1949             dy31             = _mm_sub_pd(iy3,jy1);
1950             dz31             = _mm_sub_pd(iz3,jz1);
1951             dx32             = _mm_sub_pd(ix3,jx2);
1952             dy32             = _mm_sub_pd(iy3,jy2);
1953             dz32             = _mm_sub_pd(iz3,jz2);
1954             dx33             = _mm_sub_pd(ix3,jx3);
1955             dy33             = _mm_sub_pd(iy3,jy3);
1956             dz33             = _mm_sub_pd(iz3,jz3);
1957
1958             /* Calculate squared distance and things based on it */
1959             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1960             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1961             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1962             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1963             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1964             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1965             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1966             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1967             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1968             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1969
1970             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1971             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1972             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1973             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1974             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1975             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1976             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1977             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1978             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1979             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1980
1981             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1982             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1983             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1984             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1985             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1986             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1987             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1988             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1989             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1990             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1991
1992             fjx0             = _mm_setzero_pd();
1993             fjy0             = _mm_setzero_pd();
1994             fjz0             = _mm_setzero_pd();
1995             fjx1             = _mm_setzero_pd();
1996             fjy1             = _mm_setzero_pd();
1997             fjz1             = _mm_setzero_pd();
1998             fjx2             = _mm_setzero_pd();
1999             fjy2             = _mm_setzero_pd();
2000             fjz2             = _mm_setzero_pd();
2001             fjx3             = _mm_setzero_pd();
2002             fjy3             = _mm_setzero_pd();
2003             fjz3             = _mm_setzero_pd();
2004
2005             /**************************
2006              * CALCULATE INTERACTIONS *
2007              **************************/
2008
2009             if (gmx_mm_any_lt(rsq00,rcutoff2))
2010             {
2011
2012             r00              = _mm_mul_pd(rsq00,rinv00);
2013
2014             /* LENNARD-JONES DISPERSION/REPULSION */
2015
2016             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2017             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
2018             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2019             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
2020             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2021
2022             d                = _mm_sub_pd(r00,rswitch);
2023             d                = _mm_max_pd(d,_mm_setzero_pd());
2024             d2               = _mm_mul_pd(d,d);
2025             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2026
2027             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2028
2029             /* Evaluate switch function */
2030             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2031             fvdw             = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2032             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
2033
2034             fscal            = fvdw;
2035
2036             fscal            = _mm_and_pd(fscal,cutoff_mask);
2037
2038             /* Calculate temporary vectorial force */
2039             tx               = _mm_mul_pd(fscal,dx00);
2040             ty               = _mm_mul_pd(fscal,dy00);
2041             tz               = _mm_mul_pd(fscal,dz00);
2042
2043             /* Update vectorial force */
2044             fix0             = _mm_add_pd(fix0,tx);
2045             fiy0             = _mm_add_pd(fiy0,ty);
2046             fiz0             = _mm_add_pd(fiz0,tz);
2047
2048             fjx0             = _mm_add_pd(fjx0,tx);
2049             fjy0             = _mm_add_pd(fjy0,ty);
2050             fjz0             = _mm_add_pd(fjz0,tz);
2051
2052             }
2053
2054             /**************************
2055              * CALCULATE INTERACTIONS *
2056              **************************/
2057
2058             if (gmx_mm_any_lt(rsq11,rcutoff2))
2059             {
2060
2061             r11              = _mm_mul_pd(rsq11,rinv11);
2062
2063             /* EWALD ELECTROSTATICS */
2064
2065             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2066             ewrt             = _mm_mul_pd(r11,ewtabscale);
2067             ewitab           = _mm_cvttpd_epi32(ewrt);
2068             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2069             ewitab           = _mm_slli_epi32(ewitab,2);
2070             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2071             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2072             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2073             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2074             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2075             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2076             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2077             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2078             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2079             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2080
2081             d                = _mm_sub_pd(r11,rswitch);
2082             d                = _mm_max_pd(d,_mm_setzero_pd());
2083             d2               = _mm_mul_pd(d,d);
2084             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2085
2086             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2087
2088             /* Evaluate switch function */
2089             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2090             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2091             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2092
2093             fscal            = felec;
2094
2095             fscal            = _mm_and_pd(fscal,cutoff_mask);
2096
2097             /* Calculate temporary vectorial force */
2098             tx               = _mm_mul_pd(fscal,dx11);
2099             ty               = _mm_mul_pd(fscal,dy11);
2100             tz               = _mm_mul_pd(fscal,dz11);
2101
2102             /* Update vectorial force */
2103             fix1             = _mm_add_pd(fix1,tx);
2104             fiy1             = _mm_add_pd(fiy1,ty);
2105             fiz1             = _mm_add_pd(fiz1,tz);
2106
2107             fjx1             = _mm_add_pd(fjx1,tx);
2108             fjy1             = _mm_add_pd(fjy1,ty);
2109             fjz1             = _mm_add_pd(fjz1,tz);
2110
2111             }
2112
2113             /**************************
2114              * CALCULATE INTERACTIONS *
2115              **************************/
2116
2117             if (gmx_mm_any_lt(rsq12,rcutoff2))
2118             {
2119
2120             r12              = _mm_mul_pd(rsq12,rinv12);
2121
2122             /* EWALD ELECTROSTATICS */
2123
2124             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2125             ewrt             = _mm_mul_pd(r12,ewtabscale);
2126             ewitab           = _mm_cvttpd_epi32(ewrt);
2127             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2128             ewitab           = _mm_slli_epi32(ewitab,2);
2129             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2130             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2131             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2132             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2133             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2134             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2135             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2136             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2137             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2138             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2139
2140             d                = _mm_sub_pd(r12,rswitch);
2141             d                = _mm_max_pd(d,_mm_setzero_pd());
2142             d2               = _mm_mul_pd(d,d);
2143             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2144
2145             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2146
2147             /* Evaluate switch function */
2148             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2149             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2150             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2151
2152             fscal            = felec;
2153
2154             fscal            = _mm_and_pd(fscal,cutoff_mask);
2155
2156             /* Calculate temporary vectorial force */
2157             tx               = _mm_mul_pd(fscal,dx12);
2158             ty               = _mm_mul_pd(fscal,dy12);
2159             tz               = _mm_mul_pd(fscal,dz12);
2160
2161             /* Update vectorial force */
2162             fix1             = _mm_add_pd(fix1,tx);
2163             fiy1             = _mm_add_pd(fiy1,ty);
2164             fiz1             = _mm_add_pd(fiz1,tz);
2165
2166             fjx2             = _mm_add_pd(fjx2,tx);
2167             fjy2             = _mm_add_pd(fjy2,ty);
2168             fjz2             = _mm_add_pd(fjz2,tz);
2169
2170             }
2171
2172             /**************************
2173              * CALCULATE INTERACTIONS *
2174              **************************/
2175
2176             if (gmx_mm_any_lt(rsq13,rcutoff2))
2177             {
2178
2179             r13              = _mm_mul_pd(rsq13,rinv13);
2180
2181             /* EWALD ELECTROSTATICS */
2182
2183             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2184             ewrt             = _mm_mul_pd(r13,ewtabscale);
2185             ewitab           = _mm_cvttpd_epi32(ewrt);
2186             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2187             ewitab           = _mm_slli_epi32(ewitab,2);
2188             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2189             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2190             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2191             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2192             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2193             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2194             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2195             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2196             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2197             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2198
2199             d                = _mm_sub_pd(r13,rswitch);
2200             d                = _mm_max_pd(d,_mm_setzero_pd());
2201             d2               = _mm_mul_pd(d,d);
2202             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2203
2204             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2205
2206             /* Evaluate switch function */
2207             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2208             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2209             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
2210
2211             fscal            = felec;
2212
2213             fscal            = _mm_and_pd(fscal,cutoff_mask);
2214
2215             /* Calculate temporary vectorial force */
2216             tx               = _mm_mul_pd(fscal,dx13);
2217             ty               = _mm_mul_pd(fscal,dy13);
2218             tz               = _mm_mul_pd(fscal,dz13);
2219
2220             /* Update vectorial force */
2221             fix1             = _mm_add_pd(fix1,tx);
2222             fiy1             = _mm_add_pd(fiy1,ty);
2223             fiz1             = _mm_add_pd(fiz1,tz);
2224
2225             fjx3             = _mm_add_pd(fjx3,tx);
2226             fjy3             = _mm_add_pd(fjy3,ty);
2227             fjz3             = _mm_add_pd(fjz3,tz);
2228
2229             }
2230
2231             /**************************
2232              * CALCULATE INTERACTIONS *
2233              **************************/
2234
2235             if (gmx_mm_any_lt(rsq21,rcutoff2))
2236             {
2237
2238             r21              = _mm_mul_pd(rsq21,rinv21);
2239
2240             /* EWALD ELECTROSTATICS */
2241
2242             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2243             ewrt             = _mm_mul_pd(r21,ewtabscale);
2244             ewitab           = _mm_cvttpd_epi32(ewrt);
2245             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2246             ewitab           = _mm_slli_epi32(ewitab,2);
2247             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2248             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2249             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2250             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2251             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2252             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2253             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2254             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2255             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2256             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2257
2258             d                = _mm_sub_pd(r21,rswitch);
2259             d                = _mm_max_pd(d,_mm_setzero_pd());
2260             d2               = _mm_mul_pd(d,d);
2261             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2262
2263             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2264
2265             /* Evaluate switch function */
2266             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2267             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2268             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2269
2270             fscal            = felec;
2271
2272             fscal            = _mm_and_pd(fscal,cutoff_mask);
2273
2274             /* Calculate temporary vectorial force */
2275             tx               = _mm_mul_pd(fscal,dx21);
2276             ty               = _mm_mul_pd(fscal,dy21);
2277             tz               = _mm_mul_pd(fscal,dz21);
2278
2279             /* Update vectorial force */
2280             fix2             = _mm_add_pd(fix2,tx);
2281             fiy2             = _mm_add_pd(fiy2,ty);
2282             fiz2             = _mm_add_pd(fiz2,tz);
2283
2284             fjx1             = _mm_add_pd(fjx1,tx);
2285             fjy1             = _mm_add_pd(fjy1,ty);
2286             fjz1             = _mm_add_pd(fjz1,tz);
2287
2288             }
2289
2290             /**************************
2291              * CALCULATE INTERACTIONS *
2292              **************************/
2293
2294             if (gmx_mm_any_lt(rsq22,rcutoff2))
2295             {
2296
2297             r22              = _mm_mul_pd(rsq22,rinv22);
2298
2299             /* EWALD ELECTROSTATICS */
2300
2301             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2302             ewrt             = _mm_mul_pd(r22,ewtabscale);
2303             ewitab           = _mm_cvttpd_epi32(ewrt);
2304             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2305             ewitab           = _mm_slli_epi32(ewitab,2);
2306             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2307             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2308             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2309             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2310             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2311             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2312             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2313             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2314             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2315             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2316
2317             d                = _mm_sub_pd(r22,rswitch);
2318             d                = _mm_max_pd(d,_mm_setzero_pd());
2319             d2               = _mm_mul_pd(d,d);
2320             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2321
2322             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2323
2324             /* Evaluate switch function */
2325             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2326             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2327             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2328
2329             fscal            = felec;
2330
2331             fscal            = _mm_and_pd(fscal,cutoff_mask);
2332
2333             /* Calculate temporary vectorial force */
2334             tx               = _mm_mul_pd(fscal,dx22);
2335             ty               = _mm_mul_pd(fscal,dy22);
2336             tz               = _mm_mul_pd(fscal,dz22);
2337
2338             /* Update vectorial force */
2339             fix2             = _mm_add_pd(fix2,tx);
2340             fiy2             = _mm_add_pd(fiy2,ty);
2341             fiz2             = _mm_add_pd(fiz2,tz);
2342
2343             fjx2             = _mm_add_pd(fjx2,tx);
2344             fjy2             = _mm_add_pd(fjy2,ty);
2345             fjz2             = _mm_add_pd(fjz2,tz);
2346
2347             }
2348
2349             /**************************
2350              * CALCULATE INTERACTIONS *
2351              **************************/
2352
2353             if (gmx_mm_any_lt(rsq23,rcutoff2))
2354             {
2355
2356             r23              = _mm_mul_pd(rsq23,rinv23);
2357
2358             /* EWALD ELECTROSTATICS */
2359
2360             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2361             ewrt             = _mm_mul_pd(r23,ewtabscale);
2362             ewitab           = _mm_cvttpd_epi32(ewrt);
2363             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2364             ewitab           = _mm_slli_epi32(ewitab,2);
2365             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2366             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2367             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2368             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2369             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2370             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2371             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2372             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2373             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
2374             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2375
2376             d                = _mm_sub_pd(r23,rswitch);
2377             d                = _mm_max_pd(d,_mm_setzero_pd());
2378             d2               = _mm_mul_pd(d,d);
2379             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2380
2381             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2382
2383             /* Evaluate switch function */
2384             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2385             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
2386             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
2387
2388             fscal            = felec;
2389
2390             fscal            = _mm_and_pd(fscal,cutoff_mask);
2391
2392             /* Calculate temporary vectorial force */
2393             tx               = _mm_mul_pd(fscal,dx23);
2394             ty               = _mm_mul_pd(fscal,dy23);
2395             tz               = _mm_mul_pd(fscal,dz23);
2396
2397             /* Update vectorial force */
2398             fix2             = _mm_add_pd(fix2,tx);
2399             fiy2             = _mm_add_pd(fiy2,ty);
2400             fiz2             = _mm_add_pd(fiz2,tz);
2401
2402             fjx3             = _mm_add_pd(fjx3,tx);
2403             fjy3             = _mm_add_pd(fjy3,ty);
2404             fjz3             = _mm_add_pd(fjz3,tz);
2405
2406             }
2407
2408             /**************************
2409              * CALCULATE INTERACTIONS *
2410              **************************/
2411
2412             if (gmx_mm_any_lt(rsq31,rcutoff2))
2413             {
2414
2415             r31              = _mm_mul_pd(rsq31,rinv31);
2416
2417             /* EWALD ELECTROSTATICS */
2418
2419             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2420             ewrt             = _mm_mul_pd(r31,ewtabscale);
2421             ewitab           = _mm_cvttpd_epi32(ewrt);
2422             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2423             ewitab           = _mm_slli_epi32(ewitab,2);
2424             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2425             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2426             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2427             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2428             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2429             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2430             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2431             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2432             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
2433             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2434
2435             d                = _mm_sub_pd(r31,rswitch);
2436             d                = _mm_max_pd(d,_mm_setzero_pd());
2437             d2               = _mm_mul_pd(d,d);
2438             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2439
2440             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2441
2442             /* Evaluate switch function */
2443             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2444             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
2445             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
2446
2447             fscal            = felec;
2448
2449             fscal            = _mm_and_pd(fscal,cutoff_mask);
2450
2451             /* Calculate temporary vectorial force */
2452             tx               = _mm_mul_pd(fscal,dx31);
2453             ty               = _mm_mul_pd(fscal,dy31);
2454             tz               = _mm_mul_pd(fscal,dz31);
2455
2456             /* Update vectorial force */
2457             fix3             = _mm_add_pd(fix3,tx);
2458             fiy3             = _mm_add_pd(fiy3,ty);
2459             fiz3             = _mm_add_pd(fiz3,tz);
2460
2461             fjx1             = _mm_add_pd(fjx1,tx);
2462             fjy1             = _mm_add_pd(fjy1,ty);
2463             fjz1             = _mm_add_pd(fjz1,tz);
2464
2465             }
2466
2467             /**************************
2468              * CALCULATE INTERACTIONS *
2469              **************************/
2470
2471             if (gmx_mm_any_lt(rsq32,rcutoff2))
2472             {
2473
2474             r32              = _mm_mul_pd(rsq32,rinv32);
2475
2476             /* EWALD ELECTROSTATICS */
2477
2478             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2479             ewrt             = _mm_mul_pd(r32,ewtabscale);
2480             ewitab           = _mm_cvttpd_epi32(ewrt);
2481             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2482             ewitab           = _mm_slli_epi32(ewitab,2);
2483             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2484             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2485             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2486             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2487             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2488             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2489             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2490             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2491             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
2492             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2493
2494             d                = _mm_sub_pd(r32,rswitch);
2495             d                = _mm_max_pd(d,_mm_setzero_pd());
2496             d2               = _mm_mul_pd(d,d);
2497             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2498
2499             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2500
2501             /* Evaluate switch function */
2502             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2503             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
2504             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
2505
2506             fscal            = felec;
2507
2508             fscal            = _mm_and_pd(fscal,cutoff_mask);
2509
2510             /* Calculate temporary vectorial force */
2511             tx               = _mm_mul_pd(fscal,dx32);
2512             ty               = _mm_mul_pd(fscal,dy32);
2513             tz               = _mm_mul_pd(fscal,dz32);
2514
2515             /* Update vectorial force */
2516             fix3             = _mm_add_pd(fix3,tx);
2517             fiy3             = _mm_add_pd(fiy3,ty);
2518             fiz3             = _mm_add_pd(fiz3,tz);
2519
2520             fjx2             = _mm_add_pd(fjx2,tx);
2521             fjy2             = _mm_add_pd(fjy2,ty);
2522             fjz2             = _mm_add_pd(fjz2,tz);
2523
2524             }
2525
2526             /**************************
2527              * CALCULATE INTERACTIONS *
2528              **************************/
2529
2530             if (gmx_mm_any_lt(rsq33,rcutoff2))
2531             {
2532
2533             r33              = _mm_mul_pd(rsq33,rinv33);
2534
2535             /* EWALD ELECTROSTATICS */
2536
2537             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2538             ewrt             = _mm_mul_pd(r33,ewtabscale);
2539             ewitab           = _mm_cvttpd_epi32(ewrt);
2540             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2541             ewitab           = _mm_slli_epi32(ewitab,2);
2542             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2543             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2544             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2545             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2546             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2547             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2548             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2549             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2550             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
2551             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2552
2553             d                = _mm_sub_pd(r33,rswitch);
2554             d                = _mm_max_pd(d,_mm_setzero_pd());
2555             d2               = _mm_mul_pd(d,d);
2556             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2557
2558             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2559
2560             /* Evaluate switch function */
2561             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2562             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
2563             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
2564
2565             fscal            = felec;
2566
2567             fscal            = _mm_and_pd(fscal,cutoff_mask);
2568
2569             /* Calculate temporary vectorial force */
2570             tx               = _mm_mul_pd(fscal,dx33);
2571             ty               = _mm_mul_pd(fscal,dy33);
2572             tz               = _mm_mul_pd(fscal,dz33);
2573
2574             /* Update vectorial force */
2575             fix3             = _mm_add_pd(fix3,tx);
2576             fiy3             = _mm_add_pd(fiy3,ty);
2577             fiz3             = _mm_add_pd(fiz3,tz);
2578
2579             fjx3             = _mm_add_pd(fjx3,tx);
2580             fjy3             = _mm_add_pd(fjy3,ty);
2581             fjz3             = _mm_add_pd(fjz3,tz);
2582
2583             }
2584
2585             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2586
2587             /* Inner loop uses 617 flops */
2588         }
2589
2590         if(jidx<j_index_end)
2591         {
2592
2593             jnrA             = jjnr[jidx];
2594             j_coord_offsetA  = DIM*jnrA;
2595
2596             /* load j atom coordinates */
2597             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2598                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2599                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2600
2601             /* Calculate displacement vector */
2602             dx00             = _mm_sub_pd(ix0,jx0);
2603             dy00             = _mm_sub_pd(iy0,jy0);
2604             dz00             = _mm_sub_pd(iz0,jz0);
2605             dx11             = _mm_sub_pd(ix1,jx1);
2606             dy11             = _mm_sub_pd(iy1,jy1);
2607             dz11             = _mm_sub_pd(iz1,jz1);
2608             dx12             = _mm_sub_pd(ix1,jx2);
2609             dy12             = _mm_sub_pd(iy1,jy2);
2610             dz12             = _mm_sub_pd(iz1,jz2);
2611             dx13             = _mm_sub_pd(ix1,jx3);
2612             dy13             = _mm_sub_pd(iy1,jy3);
2613             dz13             = _mm_sub_pd(iz1,jz3);
2614             dx21             = _mm_sub_pd(ix2,jx1);
2615             dy21             = _mm_sub_pd(iy2,jy1);
2616             dz21             = _mm_sub_pd(iz2,jz1);
2617             dx22             = _mm_sub_pd(ix2,jx2);
2618             dy22             = _mm_sub_pd(iy2,jy2);
2619             dz22             = _mm_sub_pd(iz2,jz2);
2620             dx23             = _mm_sub_pd(ix2,jx3);
2621             dy23             = _mm_sub_pd(iy2,jy3);
2622             dz23             = _mm_sub_pd(iz2,jz3);
2623             dx31             = _mm_sub_pd(ix3,jx1);
2624             dy31             = _mm_sub_pd(iy3,jy1);
2625             dz31             = _mm_sub_pd(iz3,jz1);
2626             dx32             = _mm_sub_pd(ix3,jx2);
2627             dy32             = _mm_sub_pd(iy3,jy2);
2628             dz32             = _mm_sub_pd(iz3,jz2);
2629             dx33             = _mm_sub_pd(ix3,jx3);
2630             dy33             = _mm_sub_pd(iy3,jy3);
2631             dz33             = _mm_sub_pd(iz3,jz3);
2632
2633             /* Calculate squared distance and things based on it */
2634             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2635             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2636             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2637             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2638             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2639             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2640             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2641             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2642             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2643             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2644
2645             rinv00           = gmx_mm_invsqrt_pd(rsq00);
2646             rinv11           = gmx_mm_invsqrt_pd(rsq11);
2647             rinv12           = gmx_mm_invsqrt_pd(rsq12);
2648             rinv13           = gmx_mm_invsqrt_pd(rsq13);
2649             rinv21           = gmx_mm_invsqrt_pd(rsq21);
2650             rinv22           = gmx_mm_invsqrt_pd(rsq22);
2651             rinv23           = gmx_mm_invsqrt_pd(rsq23);
2652             rinv31           = gmx_mm_invsqrt_pd(rsq31);
2653             rinv32           = gmx_mm_invsqrt_pd(rsq32);
2654             rinv33           = gmx_mm_invsqrt_pd(rsq33);
2655
2656             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
2657             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
2658             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
2659             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
2660             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
2661             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
2662             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
2663             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
2664             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
2665             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
2666
2667             fjx0             = _mm_setzero_pd();
2668             fjy0             = _mm_setzero_pd();
2669             fjz0             = _mm_setzero_pd();
2670             fjx1             = _mm_setzero_pd();
2671             fjy1             = _mm_setzero_pd();
2672             fjz1             = _mm_setzero_pd();
2673             fjx2             = _mm_setzero_pd();
2674             fjy2             = _mm_setzero_pd();
2675             fjz2             = _mm_setzero_pd();
2676             fjx3             = _mm_setzero_pd();
2677             fjy3             = _mm_setzero_pd();
2678             fjz3             = _mm_setzero_pd();
2679
2680             /**************************
2681              * CALCULATE INTERACTIONS *
2682              **************************/
2683
2684             if (gmx_mm_any_lt(rsq00,rcutoff2))
2685             {
2686
2687             r00              = _mm_mul_pd(rsq00,rinv00);
2688
2689             /* LENNARD-JONES DISPERSION/REPULSION */
2690
2691             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2692             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
2693             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2694             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
2695             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2696
2697             d                = _mm_sub_pd(r00,rswitch);
2698             d                = _mm_max_pd(d,_mm_setzero_pd());
2699             d2               = _mm_mul_pd(d,d);
2700             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2701
2702             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2703
2704             /* Evaluate switch function */
2705             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2706             fvdw             = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2707             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
2708
2709             fscal            = fvdw;
2710
2711             fscal            = _mm_and_pd(fscal,cutoff_mask);
2712
2713             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2714
2715             /* Calculate temporary vectorial force */
2716             tx               = _mm_mul_pd(fscal,dx00);
2717             ty               = _mm_mul_pd(fscal,dy00);
2718             tz               = _mm_mul_pd(fscal,dz00);
2719
2720             /* Update vectorial force */
2721             fix0             = _mm_add_pd(fix0,tx);
2722             fiy0             = _mm_add_pd(fiy0,ty);
2723             fiz0             = _mm_add_pd(fiz0,tz);
2724
2725             fjx0             = _mm_add_pd(fjx0,tx);
2726             fjy0             = _mm_add_pd(fjy0,ty);
2727             fjz0             = _mm_add_pd(fjz0,tz);
2728
2729             }
2730
2731             /**************************
2732              * CALCULATE INTERACTIONS *
2733              **************************/
2734
2735             if (gmx_mm_any_lt(rsq11,rcutoff2))
2736             {
2737
2738             r11              = _mm_mul_pd(rsq11,rinv11);
2739
2740             /* EWALD ELECTROSTATICS */
2741
2742             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2743             ewrt             = _mm_mul_pd(r11,ewtabscale);
2744             ewitab           = _mm_cvttpd_epi32(ewrt);
2745             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2746             ewitab           = _mm_slli_epi32(ewitab,2);
2747             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2748             ewtabD           = _mm_setzero_pd();
2749             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2750             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2751             ewtabFn          = _mm_setzero_pd();
2752             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2753             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2754             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2755             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2756             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2757
2758             d                = _mm_sub_pd(r11,rswitch);
2759             d                = _mm_max_pd(d,_mm_setzero_pd());
2760             d2               = _mm_mul_pd(d,d);
2761             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2762
2763             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2764
2765             /* Evaluate switch function */
2766             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2767             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2768             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2769
2770             fscal            = felec;
2771
2772             fscal            = _mm_and_pd(fscal,cutoff_mask);
2773
2774             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2775
2776             /* Calculate temporary vectorial force */
2777             tx               = _mm_mul_pd(fscal,dx11);
2778             ty               = _mm_mul_pd(fscal,dy11);
2779             tz               = _mm_mul_pd(fscal,dz11);
2780
2781             /* Update vectorial force */
2782             fix1             = _mm_add_pd(fix1,tx);
2783             fiy1             = _mm_add_pd(fiy1,ty);
2784             fiz1             = _mm_add_pd(fiz1,tz);
2785
2786             fjx1             = _mm_add_pd(fjx1,tx);
2787             fjy1             = _mm_add_pd(fjy1,ty);
2788             fjz1             = _mm_add_pd(fjz1,tz);
2789
2790             }
2791
2792             /**************************
2793              * CALCULATE INTERACTIONS *
2794              **************************/
2795
2796             if (gmx_mm_any_lt(rsq12,rcutoff2))
2797             {
2798
2799             r12              = _mm_mul_pd(rsq12,rinv12);
2800
2801             /* EWALD ELECTROSTATICS */
2802
2803             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2804             ewrt             = _mm_mul_pd(r12,ewtabscale);
2805             ewitab           = _mm_cvttpd_epi32(ewrt);
2806             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2807             ewitab           = _mm_slli_epi32(ewitab,2);
2808             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2809             ewtabD           = _mm_setzero_pd();
2810             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2811             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2812             ewtabFn          = _mm_setzero_pd();
2813             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2814             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2815             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2816             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2817             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2818
2819             d                = _mm_sub_pd(r12,rswitch);
2820             d                = _mm_max_pd(d,_mm_setzero_pd());
2821             d2               = _mm_mul_pd(d,d);
2822             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2823
2824             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2825
2826             /* Evaluate switch function */
2827             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2828             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2829             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2830
2831             fscal            = felec;
2832
2833             fscal            = _mm_and_pd(fscal,cutoff_mask);
2834
2835             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2836
2837             /* Calculate temporary vectorial force */
2838             tx               = _mm_mul_pd(fscal,dx12);
2839             ty               = _mm_mul_pd(fscal,dy12);
2840             tz               = _mm_mul_pd(fscal,dz12);
2841
2842             /* Update vectorial force */
2843             fix1             = _mm_add_pd(fix1,tx);
2844             fiy1             = _mm_add_pd(fiy1,ty);
2845             fiz1             = _mm_add_pd(fiz1,tz);
2846
2847             fjx2             = _mm_add_pd(fjx2,tx);
2848             fjy2             = _mm_add_pd(fjy2,ty);
2849             fjz2             = _mm_add_pd(fjz2,tz);
2850
2851             }
2852
2853             /**************************
2854              * CALCULATE INTERACTIONS *
2855              **************************/
2856
2857             if (gmx_mm_any_lt(rsq13,rcutoff2))
2858             {
2859
2860             r13              = _mm_mul_pd(rsq13,rinv13);
2861
2862             /* EWALD ELECTROSTATICS */
2863
2864             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2865             ewrt             = _mm_mul_pd(r13,ewtabscale);
2866             ewitab           = _mm_cvttpd_epi32(ewrt);
2867             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2868             ewitab           = _mm_slli_epi32(ewitab,2);
2869             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2870             ewtabD           = _mm_setzero_pd();
2871             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2872             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2873             ewtabFn          = _mm_setzero_pd();
2874             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2875             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2876             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2877             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2878             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2879
2880             d                = _mm_sub_pd(r13,rswitch);
2881             d                = _mm_max_pd(d,_mm_setzero_pd());
2882             d2               = _mm_mul_pd(d,d);
2883             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2884
2885             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2886
2887             /* Evaluate switch function */
2888             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2889             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2890             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
2891
2892             fscal            = felec;
2893
2894             fscal            = _mm_and_pd(fscal,cutoff_mask);
2895
2896             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2897
2898             /* Calculate temporary vectorial force */
2899             tx               = _mm_mul_pd(fscal,dx13);
2900             ty               = _mm_mul_pd(fscal,dy13);
2901             tz               = _mm_mul_pd(fscal,dz13);
2902
2903             /* Update vectorial force */
2904             fix1             = _mm_add_pd(fix1,tx);
2905             fiy1             = _mm_add_pd(fiy1,ty);
2906             fiz1             = _mm_add_pd(fiz1,tz);
2907
2908             fjx3             = _mm_add_pd(fjx3,tx);
2909             fjy3             = _mm_add_pd(fjy3,ty);
2910             fjz3             = _mm_add_pd(fjz3,tz);
2911
2912             }
2913
2914             /**************************
2915              * CALCULATE INTERACTIONS *
2916              **************************/
2917
2918             if (gmx_mm_any_lt(rsq21,rcutoff2))
2919             {
2920
2921             r21              = _mm_mul_pd(rsq21,rinv21);
2922
2923             /* EWALD ELECTROSTATICS */
2924
2925             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2926             ewrt             = _mm_mul_pd(r21,ewtabscale);
2927             ewitab           = _mm_cvttpd_epi32(ewrt);
2928             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2929             ewitab           = _mm_slli_epi32(ewitab,2);
2930             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2931             ewtabD           = _mm_setzero_pd();
2932             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2933             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2934             ewtabFn          = _mm_setzero_pd();
2935             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2936             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2937             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2938             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2939             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2940
2941             d                = _mm_sub_pd(r21,rswitch);
2942             d                = _mm_max_pd(d,_mm_setzero_pd());
2943             d2               = _mm_mul_pd(d,d);
2944             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2945
2946             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2947
2948             /* Evaluate switch function */
2949             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2950             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2951             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2952
2953             fscal            = felec;
2954
2955             fscal            = _mm_and_pd(fscal,cutoff_mask);
2956
2957             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2958
2959             /* Calculate temporary vectorial force */
2960             tx               = _mm_mul_pd(fscal,dx21);
2961             ty               = _mm_mul_pd(fscal,dy21);
2962             tz               = _mm_mul_pd(fscal,dz21);
2963
2964             /* Update vectorial force */
2965             fix2             = _mm_add_pd(fix2,tx);
2966             fiy2             = _mm_add_pd(fiy2,ty);
2967             fiz2             = _mm_add_pd(fiz2,tz);
2968
2969             fjx1             = _mm_add_pd(fjx1,tx);
2970             fjy1             = _mm_add_pd(fjy1,ty);
2971             fjz1             = _mm_add_pd(fjz1,tz);
2972
2973             }
2974
2975             /**************************
2976              * CALCULATE INTERACTIONS *
2977              **************************/
2978
2979             if (gmx_mm_any_lt(rsq22,rcutoff2))
2980             {
2981
2982             r22              = _mm_mul_pd(rsq22,rinv22);
2983
2984             /* EWALD ELECTROSTATICS */
2985
2986             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2987             ewrt             = _mm_mul_pd(r22,ewtabscale);
2988             ewitab           = _mm_cvttpd_epi32(ewrt);
2989             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2990             ewitab           = _mm_slli_epi32(ewitab,2);
2991             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2992             ewtabD           = _mm_setzero_pd();
2993             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2994             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2995             ewtabFn          = _mm_setzero_pd();
2996             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2997             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2998             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2999             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
3000             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
3001
3002             d                = _mm_sub_pd(r22,rswitch);
3003             d                = _mm_max_pd(d,_mm_setzero_pd());
3004             d2               = _mm_mul_pd(d,d);
3005             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3006
3007             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3008
3009             /* Evaluate switch function */
3010             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3011             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
3012             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
3013
3014             fscal            = felec;
3015
3016             fscal            = _mm_and_pd(fscal,cutoff_mask);
3017
3018             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3019
3020             /* Calculate temporary vectorial force */
3021             tx               = _mm_mul_pd(fscal,dx22);
3022             ty               = _mm_mul_pd(fscal,dy22);
3023             tz               = _mm_mul_pd(fscal,dz22);
3024
3025             /* Update vectorial force */
3026             fix2             = _mm_add_pd(fix2,tx);
3027             fiy2             = _mm_add_pd(fiy2,ty);
3028             fiz2             = _mm_add_pd(fiz2,tz);
3029
3030             fjx2             = _mm_add_pd(fjx2,tx);
3031             fjy2             = _mm_add_pd(fjy2,ty);
3032             fjz2             = _mm_add_pd(fjz2,tz);
3033
3034             }
3035
3036             /**************************
3037              * CALCULATE INTERACTIONS *
3038              **************************/
3039
3040             if (gmx_mm_any_lt(rsq23,rcutoff2))
3041             {
3042
3043             r23              = _mm_mul_pd(rsq23,rinv23);
3044
3045             /* EWALD ELECTROSTATICS */
3046
3047             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3048             ewrt             = _mm_mul_pd(r23,ewtabscale);
3049             ewitab           = _mm_cvttpd_epi32(ewrt);
3050             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3051             ewitab           = _mm_slli_epi32(ewitab,2);
3052             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3053             ewtabD           = _mm_setzero_pd();
3054             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3055             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3056             ewtabFn          = _mm_setzero_pd();
3057             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3058             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3059             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3060             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
3061             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
3062
3063             d                = _mm_sub_pd(r23,rswitch);
3064             d                = _mm_max_pd(d,_mm_setzero_pd());
3065             d2               = _mm_mul_pd(d,d);
3066             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3067
3068             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3069
3070             /* Evaluate switch function */
3071             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3072             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
3073             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
3074
3075             fscal            = felec;
3076
3077             fscal            = _mm_and_pd(fscal,cutoff_mask);
3078
3079             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3080
3081             /* Calculate temporary vectorial force */
3082             tx               = _mm_mul_pd(fscal,dx23);
3083             ty               = _mm_mul_pd(fscal,dy23);
3084             tz               = _mm_mul_pd(fscal,dz23);
3085
3086             /* Update vectorial force */
3087             fix2             = _mm_add_pd(fix2,tx);
3088             fiy2             = _mm_add_pd(fiy2,ty);
3089             fiz2             = _mm_add_pd(fiz2,tz);
3090
3091             fjx3             = _mm_add_pd(fjx3,tx);
3092             fjy3             = _mm_add_pd(fjy3,ty);
3093             fjz3             = _mm_add_pd(fjz3,tz);
3094
3095             }
3096
3097             /**************************
3098              * CALCULATE INTERACTIONS *
3099              **************************/
3100
3101             if (gmx_mm_any_lt(rsq31,rcutoff2))
3102             {
3103
3104             r31              = _mm_mul_pd(rsq31,rinv31);
3105
3106             /* EWALD ELECTROSTATICS */
3107
3108             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3109             ewrt             = _mm_mul_pd(r31,ewtabscale);
3110             ewitab           = _mm_cvttpd_epi32(ewrt);
3111             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3112             ewitab           = _mm_slli_epi32(ewitab,2);
3113             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3114             ewtabD           = _mm_setzero_pd();
3115             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3116             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3117             ewtabFn          = _mm_setzero_pd();
3118             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3119             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3120             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3121             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
3122             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
3123
3124             d                = _mm_sub_pd(r31,rswitch);
3125             d                = _mm_max_pd(d,_mm_setzero_pd());
3126             d2               = _mm_mul_pd(d,d);
3127             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3128
3129             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3130
3131             /* Evaluate switch function */
3132             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3133             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
3134             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
3135
3136             fscal            = felec;
3137
3138             fscal            = _mm_and_pd(fscal,cutoff_mask);
3139
3140             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3141
3142             /* Calculate temporary vectorial force */
3143             tx               = _mm_mul_pd(fscal,dx31);
3144             ty               = _mm_mul_pd(fscal,dy31);
3145             tz               = _mm_mul_pd(fscal,dz31);
3146
3147             /* Update vectorial force */
3148             fix3             = _mm_add_pd(fix3,tx);
3149             fiy3             = _mm_add_pd(fiy3,ty);
3150             fiz3             = _mm_add_pd(fiz3,tz);
3151
3152             fjx1             = _mm_add_pd(fjx1,tx);
3153             fjy1             = _mm_add_pd(fjy1,ty);
3154             fjz1             = _mm_add_pd(fjz1,tz);
3155
3156             }
3157
3158             /**************************
3159              * CALCULATE INTERACTIONS *
3160              **************************/
3161
3162             if (gmx_mm_any_lt(rsq32,rcutoff2))
3163             {
3164
3165             r32              = _mm_mul_pd(rsq32,rinv32);
3166
3167             /* EWALD ELECTROSTATICS */
3168
3169             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3170             ewrt             = _mm_mul_pd(r32,ewtabscale);
3171             ewitab           = _mm_cvttpd_epi32(ewrt);
3172             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3173             ewitab           = _mm_slli_epi32(ewitab,2);
3174             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3175             ewtabD           = _mm_setzero_pd();
3176             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3177             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3178             ewtabFn          = _mm_setzero_pd();
3179             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3180             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3181             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3182             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
3183             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
3184
3185             d                = _mm_sub_pd(r32,rswitch);
3186             d                = _mm_max_pd(d,_mm_setzero_pd());
3187             d2               = _mm_mul_pd(d,d);
3188             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3189
3190             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3191
3192             /* Evaluate switch function */
3193             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3194             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
3195             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
3196
3197             fscal            = felec;
3198
3199             fscal            = _mm_and_pd(fscal,cutoff_mask);
3200
3201             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3202
3203             /* Calculate temporary vectorial force */
3204             tx               = _mm_mul_pd(fscal,dx32);
3205             ty               = _mm_mul_pd(fscal,dy32);
3206             tz               = _mm_mul_pd(fscal,dz32);
3207
3208             /* Update vectorial force */
3209             fix3             = _mm_add_pd(fix3,tx);
3210             fiy3             = _mm_add_pd(fiy3,ty);
3211             fiz3             = _mm_add_pd(fiz3,tz);
3212
3213             fjx2             = _mm_add_pd(fjx2,tx);
3214             fjy2             = _mm_add_pd(fjy2,ty);
3215             fjz2             = _mm_add_pd(fjz2,tz);
3216
3217             }
3218
3219             /**************************
3220              * CALCULATE INTERACTIONS *
3221              **************************/
3222
3223             if (gmx_mm_any_lt(rsq33,rcutoff2))
3224             {
3225
3226             r33              = _mm_mul_pd(rsq33,rinv33);
3227
3228             /* EWALD ELECTROSTATICS */
3229
3230             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3231             ewrt             = _mm_mul_pd(r33,ewtabscale);
3232             ewitab           = _mm_cvttpd_epi32(ewrt);
3233             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3234             ewitab           = _mm_slli_epi32(ewitab,2);
3235             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3236             ewtabD           = _mm_setzero_pd();
3237             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3238             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3239             ewtabFn          = _mm_setzero_pd();
3240             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3241             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3242             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3243             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
3244             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
3245
3246             d                = _mm_sub_pd(r33,rswitch);
3247             d                = _mm_max_pd(d,_mm_setzero_pd());
3248             d2               = _mm_mul_pd(d,d);
3249             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3250
3251             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3252
3253             /* Evaluate switch function */
3254             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3255             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
3256             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
3257
3258             fscal            = felec;
3259
3260             fscal            = _mm_and_pd(fscal,cutoff_mask);
3261
3262             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3263
3264             /* Calculate temporary vectorial force */
3265             tx               = _mm_mul_pd(fscal,dx33);
3266             ty               = _mm_mul_pd(fscal,dy33);
3267             tz               = _mm_mul_pd(fscal,dz33);
3268
3269             /* Update vectorial force */
3270             fix3             = _mm_add_pd(fix3,tx);
3271             fiy3             = _mm_add_pd(fiy3,ty);
3272             fiz3             = _mm_add_pd(fiz3,tz);
3273
3274             fjx3             = _mm_add_pd(fjx3,tx);
3275             fjy3             = _mm_add_pd(fjy3,ty);
3276             fjz3             = _mm_add_pd(fjz3,tz);
3277
3278             }
3279
3280             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3281
3282             /* Inner loop uses 617 flops */
3283         }
3284
3285         /* End of innermost loop */
3286
3287         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3288                                               f+i_coord_offset,fshift+i_shift_offset);
3289
3290         /* Increment number of inner iterations */
3291         inneriter                  += j_index_end - j_index_start;
3292
3293         /* Outer loop uses 24 flops */
3294     }
3295
3296     /* Increment number of outer iterations */
3297     outeriter        += nri;
3298
3299     /* Update outer/inner flops */
3300
3301     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*617);
3302 }