Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_double / nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sse4_1_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse4_1_double.h"
48 #include "kernelutil_x86_sse4_1_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_double
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB;
75     int              j_coord_offsetA,j_coord_offsetB;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80     int              vdwioffset0;
81     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82     int              vdwioffset1;
83     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84     int              vdwioffset2;
85     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86     int              vdwioffset3;
87     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88     int              vdwjidx0A,vdwjidx0B;
89     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B;
91     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B;
93     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     int              vdwjidx3A,vdwjidx3B;
95     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
96     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     int              nvdwtype;
109     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110     int              *vdwtype;
111     real             *vdwparam;
112     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
113     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
114     __m128i          ewitab;
115     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
116     real             *ewtab;
117     __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
118     real             rswitch_scalar,d_scalar;
119     __m128d          dummy_mask,cutoff_mask;
120     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
121     __m128d          one     = _mm_set1_pd(1.0);
122     __m128d          two     = _mm_set1_pd(2.0);
123     x                = xx[0];
124     f                = ff[0];
125
126     nri              = nlist->nri;
127     iinr             = nlist->iinr;
128     jindex           = nlist->jindex;
129     jjnr             = nlist->jjnr;
130     shiftidx         = nlist->shift;
131     gid              = nlist->gid;
132     shiftvec         = fr->shift_vec[0];
133     fshift           = fr->fshift[0];
134     facel            = _mm_set1_pd(fr->epsfac);
135     charge           = mdatoms->chargeA;
136     nvdwtype         = fr->ntype;
137     vdwparam         = fr->nbfp;
138     vdwtype          = mdatoms->typeA;
139
140     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
141     ewtab            = fr->ic->tabq_coul_FDV0;
142     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
143     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
144
145     /* Setup water-specific parameters */
146     inr              = nlist->iinr[0];
147     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
148     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
149     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
150     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
151
152     jq1              = _mm_set1_pd(charge[inr+1]);
153     jq2              = _mm_set1_pd(charge[inr+2]);
154     jq3              = _mm_set1_pd(charge[inr+3]);
155     vdwjidx0A        = 2*vdwtype[inr+0];
156     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
157     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
158     qq11             = _mm_mul_pd(iq1,jq1);
159     qq12             = _mm_mul_pd(iq1,jq2);
160     qq13             = _mm_mul_pd(iq1,jq3);
161     qq21             = _mm_mul_pd(iq2,jq1);
162     qq22             = _mm_mul_pd(iq2,jq2);
163     qq23             = _mm_mul_pd(iq2,jq3);
164     qq31             = _mm_mul_pd(iq3,jq1);
165     qq32             = _mm_mul_pd(iq3,jq2);
166     qq33             = _mm_mul_pd(iq3,jq3);
167
168     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
169     rcutoff_scalar   = fr->rcoulomb;
170     rcutoff          = _mm_set1_pd(rcutoff_scalar);
171     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
172
173     rswitch_scalar   = fr->rcoulomb_switch;
174     rswitch          = _mm_set1_pd(rswitch_scalar);
175     /* Setup switch parameters */
176     d_scalar         = rcutoff_scalar-rswitch_scalar;
177     d                = _mm_set1_pd(d_scalar);
178     swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
179     swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
180     swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
181     swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
182     swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
183     swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
184
185     /* Avoid stupid compiler warnings */
186     jnrA = jnrB = 0;
187     j_coord_offsetA = 0;
188     j_coord_offsetB = 0;
189
190     outeriter        = 0;
191     inneriter        = 0;
192
193     /* Start outer loop over neighborlists */
194     for(iidx=0; iidx<nri; iidx++)
195     {
196         /* Load shift vector for this list */
197         i_shift_offset   = DIM*shiftidx[iidx];
198
199         /* Load limits for loop over neighbors */
200         j_index_start    = jindex[iidx];
201         j_index_end      = jindex[iidx+1];
202
203         /* Get outer coordinate index */
204         inr              = iinr[iidx];
205         i_coord_offset   = DIM*inr;
206
207         /* Load i particle coords and add shift vector */
208         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
209                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
210
211         fix0             = _mm_setzero_pd();
212         fiy0             = _mm_setzero_pd();
213         fiz0             = _mm_setzero_pd();
214         fix1             = _mm_setzero_pd();
215         fiy1             = _mm_setzero_pd();
216         fiz1             = _mm_setzero_pd();
217         fix2             = _mm_setzero_pd();
218         fiy2             = _mm_setzero_pd();
219         fiz2             = _mm_setzero_pd();
220         fix3             = _mm_setzero_pd();
221         fiy3             = _mm_setzero_pd();
222         fiz3             = _mm_setzero_pd();
223
224         /* Reset potential sums */
225         velecsum         = _mm_setzero_pd();
226         vvdwsum          = _mm_setzero_pd();
227
228         /* Start inner kernel loop */
229         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
230         {
231
232             /* Get j neighbor index, and coordinate index */
233             jnrA             = jjnr[jidx];
234             jnrB             = jjnr[jidx+1];
235             j_coord_offsetA  = DIM*jnrA;
236             j_coord_offsetB  = DIM*jnrB;
237
238             /* load j atom coordinates */
239             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
240                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
241                                               &jy2,&jz2,&jx3,&jy3,&jz3);
242
243             /* Calculate displacement vector */
244             dx00             = _mm_sub_pd(ix0,jx0);
245             dy00             = _mm_sub_pd(iy0,jy0);
246             dz00             = _mm_sub_pd(iz0,jz0);
247             dx11             = _mm_sub_pd(ix1,jx1);
248             dy11             = _mm_sub_pd(iy1,jy1);
249             dz11             = _mm_sub_pd(iz1,jz1);
250             dx12             = _mm_sub_pd(ix1,jx2);
251             dy12             = _mm_sub_pd(iy1,jy2);
252             dz12             = _mm_sub_pd(iz1,jz2);
253             dx13             = _mm_sub_pd(ix1,jx3);
254             dy13             = _mm_sub_pd(iy1,jy3);
255             dz13             = _mm_sub_pd(iz1,jz3);
256             dx21             = _mm_sub_pd(ix2,jx1);
257             dy21             = _mm_sub_pd(iy2,jy1);
258             dz21             = _mm_sub_pd(iz2,jz1);
259             dx22             = _mm_sub_pd(ix2,jx2);
260             dy22             = _mm_sub_pd(iy2,jy2);
261             dz22             = _mm_sub_pd(iz2,jz2);
262             dx23             = _mm_sub_pd(ix2,jx3);
263             dy23             = _mm_sub_pd(iy2,jy3);
264             dz23             = _mm_sub_pd(iz2,jz3);
265             dx31             = _mm_sub_pd(ix3,jx1);
266             dy31             = _mm_sub_pd(iy3,jy1);
267             dz31             = _mm_sub_pd(iz3,jz1);
268             dx32             = _mm_sub_pd(ix3,jx2);
269             dy32             = _mm_sub_pd(iy3,jy2);
270             dz32             = _mm_sub_pd(iz3,jz2);
271             dx33             = _mm_sub_pd(ix3,jx3);
272             dy33             = _mm_sub_pd(iy3,jy3);
273             dz33             = _mm_sub_pd(iz3,jz3);
274
275             /* Calculate squared distance and things based on it */
276             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
277             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
278             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
279             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
280             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
281             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
282             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
283             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
284             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
285             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
286
287             rinv00           = gmx_mm_invsqrt_pd(rsq00);
288             rinv11           = gmx_mm_invsqrt_pd(rsq11);
289             rinv12           = gmx_mm_invsqrt_pd(rsq12);
290             rinv13           = gmx_mm_invsqrt_pd(rsq13);
291             rinv21           = gmx_mm_invsqrt_pd(rsq21);
292             rinv22           = gmx_mm_invsqrt_pd(rsq22);
293             rinv23           = gmx_mm_invsqrt_pd(rsq23);
294             rinv31           = gmx_mm_invsqrt_pd(rsq31);
295             rinv32           = gmx_mm_invsqrt_pd(rsq32);
296             rinv33           = gmx_mm_invsqrt_pd(rsq33);
297
298             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
299             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
300             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
301             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
302             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
303             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
304             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
305             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
306             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
307             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
308
309             fjx0             = _mm_setzero_pd();
310             fjy0             = _mm_setzero_pd();
311             fjz0             = _mm_setzero_pd();
312             fjx1             = _mm_setzero_pd();
313             fjy1             = _mm_setzero_pd();
314             fjz1             = _mm_setzero_pd();
315             fjx2             = _mm_setzero_pd();
316             fjy2             = _mm_setzero_pd();
317             fjz2             = _mm_setzero_pd();
318             fjx3             = _mm_setzero_pd();
319             fjy3             = _mm_setzero_pd();
320             fjz3             = _mm_setzero_pd();
321
322             /**************************
323              * CALCULATE INTERACTIONS *
324              **************************/
325
326             if (gmx_mm_any_lt(rsq00,rcutoff2))
327             {
328
329             r00              = _mm_mul_pd(rsq00,rinv00);
330
331             /* LENNARD-JONES DISPERSION/REPULSION */
332
333             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
334             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
335             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
336             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
337             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
338
339             d                = _mm_sub_pd(r00,rswitch);
340             d                = _mm_max_pd(d,_mm_setzero_pd());
341             d2               = _mm_mul_pd(d,d);
342             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
343
344             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
345
346             /* Evaluate switch function */
347             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
348             fvdw             = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
349             vvdw             = _mm_mul_pd(vvdw,sw);
350             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
351
352             /* Update potential sum for this i atom from the interaction with this j atom. */
353             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
354             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
355
356             fscal            = fvdw;
357
358             fscal            = _mm_and_pd(fscal,cutoff_mask);
359
360             /* Calculate temporary vectorial force */
361             tx               = _mm_mul_pd(fscal,dx00);
362             ty               = _mm_mul_pd(fscal,dy00);
363             tz               = _mm_mul_pd(fscal,dz00);
364
365             /* Update vectorial force */
366             fix0             = _mm_add_pd(fix0,tx);
367             fiy0             = _mm_add_pd(fiy0,ty);
368             fiz0             = _mm_add_pd(fiz0,tz);
369
370             fjx0             = _mm_add_pd(fjx0,tx);
371             fjy0             = _mm_add_pd(fjy0,ty);
372             fjz0             = _mm_add_pd(fjz0,tz);
373
374             }
375
376             /**************************
377              * CALCULATE INTERACTIONS *
378              **************************/
379
380             if (gmx_mm_any_lt(rsq11,rcutoff2))
381             {
382
383             r11              = _mm_mul_pd(rsq11,rinv11);
384
385             /* EWALD ELECTROSTATICS */
386
387             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
388             ewrt             = _mm_mul_pd(r11,ewtabscale);
389             ewitab           = _mm_cvttpd_epi32(ewrt);
390             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
391             ewitab           = _mm_slli_epi32(ewitab,2);
392             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
393             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
394             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
395             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
396             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
397             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
398             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
399             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
400             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
401             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
402
403             d                = _mm_sub_pd(r11,rswitch);
404             d                = _mm_max_pd(d,_mm_setzero_pd());
405             d2               = _mm_mul_pd(d,d);
406             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
407
408             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
409
410             /* Evaluate switch function */
411             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
412             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
413             velec            = _mm_mul_pd(velec,sw);
414             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
415
416             /* Update potential sum for this i atom from the interaction with this j atom. */
417             velec            = _mm_and_pd(velec,cutoff_mask);
418             velecsum         = _mm_add_pd(velecsum,velec);
419
420             fscal            = felec;
421
422             fscal            = _mm_and_pd(fscal,cutoff_mask);
423
424             /* Calculate temporary vectorial force */
425             tx               = _mm_mul_pd(fscal,dx11);
426             ty               = _mm_mul_pd(fscal,dy11);
427             tz               = _mm_mul_pd(fscal,dz11);
428
429             /* Update vectorial force */
430             fix1             = _mm_add_pd(fix1,tx);
431             fiy1             = _mm_add_pd(fiy1,ty);
432             fiz1             = _mm_add_pd(fiz1,tz);
433
434             fjx1             = _mm_add_pd(fjx1,tx);
435             fjy1             = _mm_add_pd(fjy1,ty);
436             fjz1             = _mm_add_pd(fjz1,tz);
437
438             }
439
440             /**************************
441              * CALCULATE INTERACTIONS *
442              **************************/
443
444             if (gmx_mm_any_lt(rsq12,rcutoff2))
445             {
446
447             r12              = _mm_mul_pd(rsq12,rinv12);
448
449             /* EWALD ELECTROSTATICS */
450
451             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
452             ewrt             = _mm_mul_pd(r12,ewtabscale);
453             ewitab           = _mm_cvttpd_epi32(ewrt);
454             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
455             ewitab           = _mm_slli_epi32(ewitab,2);
456             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
457             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
458             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
459             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
460             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
461             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
462             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
463             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
464             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
465             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
466
467             d                = _mm_sub_pd(r12,rswitch);
468             d                = _mm_max_pd(d,_mm_setzero_pd());
469             d2               = _mm_mul_pd(d,d);
470             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
471
472             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
473
474             /* Evaluate switch function */
475             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
476             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
477             velec            = _mm_mul_pd(velec,sw);
478             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
479
480             /* Update potential sum for this i atom from the interaction with this j atom. */
481             velec            = _mm_and_pd(velec,cutoff_mask);
482             velecsum         = _mm_add_pd(velecsum,velec);
483
484             fscal            = felec;
485
486             fscal            = _mm_and_pd(fscal,cutoff_mask);
487
488             /* Calculate temporary vectorial force */
489             tx               = _mm_mul_pd(fscal,dx12);
490             ty               = _mm_mul_pd(fscal,dy12);
491             tz               = _mm_mul_pd(fscal,dz12);
492
493             /* Update vectorial force */
494             fix1             = _mm_add_pd(fix1,tx);
495             fiy1             = _mm_add_pd(fiy1,ty);
496             fiz1             = _mm_add_pd(fiz1,tz);
497
498             fjx2             = _mm_add_pd(fjx2,tx);
499             fjy2             = _mm_add_pd(fjy2,ty);
500             fjz2             = _mm_add_pd(fjz2,tz);
501
502             }
503
504             /**************************
505              * CALCULATE INTERACTIONS *
506              **************************/
507
508             if (gmx_mm_any_lt(rsq13,rcutoff2))
509             {
510
511             r13              = _mm_mul_pd(rsq13,rinv13);
512
513             /* EWALD ELECTROSTATICS */
514
515             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
516             ewrt             = _mm_mul_pd(r13,ewtabscale);
517             ewitab           = _mm_cvttpd_epi32(ewrt);
518             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
519             ewitab           = _mm_slli_epi32(ewitab,2);
520             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
521             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
522             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
523             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
524             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
525             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
526             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
527             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
528             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
529             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
530
531             d                = _mm_sub_pd(r13,rswitch);
532             d                = _mm_max_pd(d,_mm_setzero_pd());
533             d2               = _mm_mul_pd(d,d);
534             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
535
536             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
537
538             /* Evaluate switch function */
539             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
540             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
541             velec            = _mm_mul_pd(velec,sw);
542             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
543
544             /* Update potential sum for this i atom from the interaction with this j atom. */
545             velec            = _mm_and_pd(velec,cutoff_mask);
546             velecsum         = _mm_add_pd(velecsum,velec);
547
548             fscal            = felec;
549
550             fscal            = _mm_and_pd(fscal,cutoff_mask);
551
552             /* Calculate temporary vectorial force */
553             tx               = _mm_mul_pd(fscal,dx13);
554             ty               = _mm_mul_pd(fscal,dy13);
555             tz               = _mm_mul_pd(fscal,dz13);
556
557             /* Update vectorial force */
558             fix1             = _mm_add_pd(fix1,tx);
559             fiy1             = _mm_add_pd(fiy1,ty);
560             fiz1             = _mm_add_pd(fiz1,tz);
561
562             fjx3             = _mm_add_pd(fjx3,tx);
563             fjy3             = _mm_add_pd(fjy3,ty);
564             fjz3             = _mm_add_pd(fjz3,tz);
565
566             }
567
568             /**************************
569              * CALCULATE INTERACTIONS *
570              **************************/
571
572             if (gmx_mm_any_lt(rsq21,rcutoff2))
573             {
574
575             r21              = _mm_mul_pd(rsq21,rinv21);
576
577             /* EWALD ELECTROSTATICS */
578
579             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
580             ewrt             = _mm_mul_pd(r21,ewtabscale);
581             ewitab           = _mm_cvttpd_epi32(ewrt);
582             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
583             ewitab           = _mm_slli_epi32(ewitab,2);
584             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
585             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
586             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
587             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
588             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
589             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
590             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
591             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
592             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
593             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
594
595             d                = _mm_sub_pd(r21,rswitch);
596             d                = _mm_max_pd(d,_mm_setzero_pd());
597             d2               = _mm_mul_pd(d,d);
598             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
599
600             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
601
602             /* Evaluate switch function */
603             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
604             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
605             velec            = _mm_mul_pd(velec,sw);
606             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
607
608             /* Update potential sum for this i atom from the interaction with this j atom. */
609             velec            = _mm_and_pd(velec,cutoff_mask);
610             velecsum         = _mm_add_pd(velecsum,velec);
611
612             fscal            = felec;
613
614             fscal            = _mm_and_pd(fscal,cutoff_mask);
615
616             /* Calculate temporary vectorial force */
617             tx               = _mm_mul_pd(fscal,dx21);
618             ty               = _mm_mul_pd(fscal,dy21);
619             tz               = _mm_mul_pd(fscal,dz21);
620
621             /* Update vectorial force */
622             fix2             = _mm_add_pd(fix2,tx);
623             fiy2             = _mm_add_pd(fiy2,ty);
624             fiz2             = _mm_add_pd(fiz2,tz);
625
626             fjx1             = _mm_add_pd(fjx1,tx);
627             fjy1             = _mm_add_pd(fjy1,ty);
628             fjz1             = _mm_add_pd(fjz1,tz);
629
630             }
631
632             /**************************
633              * CALCULATE INTERACTIONS *
634              **************************/
635
636             if (gmx_mm_any_lt(rsq22,rcutoff2))
637             {
638
639             r22              = _mm_mul_pd(rsq22,rinv22);
640
641             /* EWALD ELECTROSTATICS */
642
643             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
644             ewrt             = _mm_mul_pd(r22,ewtabscale);
645             ewitab           = _mm_cvttpd_epi32(ewrt);
646             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
647             ewitab           = _mm_slli_epi32(ewitab,2);
648             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
649             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
650             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
651             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
652             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
653             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
654             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
655             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
656             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
657             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
658
659             d                = _mm_sub_pd(r22,rswitch);
660             d                = _mm_max_pd(d,_mm_setzero_pd());
661             d2               = _mm_mul_pd(d,d);
662             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
663
664             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
665
666             /* Evaluate switch function */
667             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
668             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
669             velec            = _mm_mul_pd(velec,sw);
670             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
671
672             /* Update potential sum for this i atom from the interaction with this j atom. */
673             velec            = _mm_and_pd(velec,cutoff_mask);
674             velecsum         = _mm_add_pd(velecsum,velec);
675
676             fscal            = felec;
677
678             fscal            = _mm_and_pd(fscal,cutoff_mask);
679
680             /* Calculate temporary vectorial force */
681             tx               = _mm_mul_pd(fscal,dx22);
682             ty               = _mm_mul_pd(fscal,dy22);
683             tz               = _mm_mul_pd(fscal,dz22);
684
685             /* Update vectorial force */
686             fix2             = _mm_add_pd(fix2,tx);
687             fiy2             = _mm_add_pd(fiy2,ty);
688             fiz2             = _mm_add_pd(fiz2,tz);
689
690             fjx2             = _mm_add_pd(fjx2,tx);
691             fjy2             = _mm_add_pd(fjy2,ty);
692             fjz2             = _mm_add_pd(fjz2,tz);
693
694             }
695
696             /**************************
697              * CALCULATE INTERACTIONS *
698              **************************/
699
700             if (gmx_mm_any_lt(rsq23,rcutoff2))
701             {
702
703             r23              = _mm_mul_pd(rsq23,rinv23);
704
705             /* EWALD ELECTROSTATICS */
706
707             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
708             ewrt             = _mm_mul_pd(r23,ewtabscale);
709             ewitab           = _mm_cvttpd_epi32(ewrt);
710             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
711             ewitab           = _mm_slli_epi32(ewitab,2);
712             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
713             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
714             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
715             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
716             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
717             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
718             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
719             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
720             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
721             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
722
723             d                = _mm_sub_pd(r23,rswitch);
724             d                = _mm_max_pd(d,_mm_setzero_pd());
725             d2               = _mm_mul_pd(d,d);
726             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
727
728             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
729
730             /* Evaluate switch function */
731             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
732             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
733             velec            = _mm_mul_pd(velec,sw);
734             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
735
736             /* Update potential sum for this i atom from the interaction with this j atom. */
737             velec            = _mm_and_pd(velec,cutoff_mask);
738             velecsum         = _mm_add_pd(velecsum,velec);
739
740             fscal            = felec;
741
742             fscal            = _mm_and_pd(fscal,cutoff_mask);
743
744             /* Calculate temporary vectorial force */
745             tx               = _mm_mul_pd(fscal,dx23);
746             ty               = _mm_mul_pd(fscal,dy23);
747             tz               = _mm_mul_pd(fscal,dz23);
748
749             /* Update vectorial force */
750             fix2             = _mm_add_pd(fix2,tx);
751             fiy2             = _mm_add_pd(fiy2,ty);
752             fiz2             = _mm_add_pd(fiz2,tz);
753
754             fjx3             = _mm_add_pd(fjx3,tx);
755             fjy3             = _mm_add_pd(fjy3,ty);
756             fjz3             = _mm_add_pd(fjz3,tz);
757
758             }
759
760             /**************************
761              * CALCULATE INTERACTIONS *
762              **************************/
763
764             if (gmx_mm_any_lt(rsq31,rcutoff2))
765             {
766
767             r31              = _mm_mul_pd(rsq31,rinv31);
768
769             /* EWALD ELECTROSTATICS */
770
771             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
772             ewrt             = _mm_mul_pd(r31,ewtabscale);
773             ewitab           = _mm_cvttpd_epi32(ewrt);
774             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
775             ewitab           = _mm_slli_epi32(ewitab,2);
776             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
777             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
778             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
779             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
780             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
781             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
782             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
783             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
784             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
785             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
786
787             d                = _mm_sub_pd(r31,rswitch);
788             d                = _mm_max_pd(d,_mm_setzero_pd());
789             d2               = _mm_mul_pd(d,d);
790             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
791
792             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
793
794             /* Evaluate switch function */
795             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
796             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
797             velec            = _mm_mul_pd(velec,sw);
798             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
799
800             /* Update potential sum for this i atom from the interaction with this j atom. */
801             velec            = _mm_and_pd(velec,cutoff_mask);
802             velecsum         = _mm_add_pd(velecsum,velec);
803
804             fscal            = felec;
805
806             fscal            = _mm_and_pd(fscal,cutoff_mask);
807
808             /* Calculate temporary vectorial force */
809             tx               = _mm_mul_pd(fscal,dx31);
810             ty               = _mm_mul_pd(fscal,dy31);
811             tz               = _mm_mul_pd(fscal,dz31);
812
813             /* Update vectorial force */
814             fix3             = _mm_add_pd(fix3,tx);
815             fiy3             = _mm_add_pd(fiy3,ty);
816             fiz3             = _mm_add_pd(fiz3,tz);
817
818             fjx1             = _mm_add_pd(fjx1,tx);
819             fjy1             = _mm_add_pd(fjy1,ty);
820             fjz1             = _mm_add_pd(fjz1,tz);
821
822             }
823
824             /**************************
825              * CALCULATE INTERACTIONS *
826              **************************/
827
828             if (gmx_mm_any_lt(rsq32,rcutoff2))
829             {
830
831             r32              = _mm_mul_pd(rsq32,rinv32);
832
833             /* EWALD ELECTROSTATICS */
834
835             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
836             ewrt             = _mm_mul_pd(r32,ewtabscale);
837             ewitab           = _mm_cvttpd_epi32(ewrt);
838             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
839             ewitab           = _mm_slli_epi32(ewitab,2);
840             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
841             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
842             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
843             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
844             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
845             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
846             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
847             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
848             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
849             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
850
851             d                = _mm_sub_pd(r32,rswitch);
852             d                = _mm_max_pd(d,_mm_setzero_pd());
853             d2               = _mm_mul_pd(d,d);
854             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
855
856             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
857
858             /* Evaluate switch function */
859             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
860             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
861             velec            = _mm_mul_pd(velec,sw);
862             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
863
864             /* Update potential sum for this i atom from the interaction with this j atom. */
865             velec            = _mm_and_pd(velec,cutoff_mask);
866             velecsum         = _mm_add_pd(velecsum,velec);
867
868             fscal            = felec;
869
870             fscal            = _mm_and_pd(fscal,cutoff_mask);
871
872             /* Calculate temporary vectorial force */
873             tx               = _mm_mul_pd(fscal,dx32);
874             ty               = _mm_mul_pd(fscal,dy32);
875             tz               = _mm_mul_pd(fscal,dz32);
876
877             /* Update vectorial force */
878             fix3             = _mm_add_pd(fix3,tx);
879             fiy3             = _mm_add_pd(fiy3,ty);
880             fiz3             = _mm_add_pd(fiz3,tz);
881
882             fjx2             = _mm_add_pd(fjx2,tx);
883             fjy2             = _mm_add_pd(fjy2,ty);
884             fjz2             = _mm_add_pd(fjz2,tz);
885
886             }
887
888             /**************************
889              * CALCULATE INTERACTIONS *
890              **************************/
891
892             if (gmx_mm_any_lt(rsq33,rcutoff2))
893             {
894
895             r33              = _mm_mul_pd(rsq33,rinv33);
896
897             /* EWALD ELECTROSTATICS */
898
899             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
900             ewrt             = _mm_mul_pd(r33,ewtabscale);
901             ewitab           = _mm_cvttpd_epi32(ewrt);
902             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
903             ewitab           = _mm_slli_epi32(ewitab,2);
904             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
905             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
906             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
907             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
908             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
909             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
910             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
911             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
912             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
913             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
914
915             d                = _mm_sub_pd(r33,rswitch);
916             d                = _mm_max_pd(d,_mm_setzero_pd());
917             d2               = _mm_mul_pd(d,d);
918             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
919
920             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
921
922             /* Evaluate switch function */
923             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
924             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
925             velec            = _mm_mul_pd(velec,sw);
926             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
927
928             /* Update potential sum for this i atom from the interaction with this j atom. */
929             velec            = _mm_and_pd(velec,cutoff_mask);
930             velecsum         = _mm_add_pd(velecsum,velec);
931
932             fscal            = felec;
933
934             fscal            = _mm_and_pd(fscal,cutoff_mask);
935
936             /* Calculate temporary vectorial force */
937             tx               = _mm_mul_pd(fscal,dx33);
938             ty               = _mm_mul_pd(fscal,dy33);
939             tz               = _mm_mul_pd(fscal,dz33);
940
941             /* Update vectorial force */
942             fix3             = _mm_add_pd(fix3,tx);
943             fiy3             = _mm_add_pd(fiy3,ty);
944             fiz3             = _mm_add_pd(fiz3,tz);
945
946             fjx3             = _mm_add_pd(fjx3,tx);
947             fjy3             = _mm_add_pd(fjy3,ty);
948             fjz3             = _mm_add_pd(fjz3,tz);
949
950             }
951
952             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
953
954             /* Inner loop uses 647 flops */
955         }
956
957         if(jidx<j_index_end)
958         {
959
960             jnrA             = jjnr[jidx];
961             j_coord_offsetA  = DIM*jnrA;
962
963             /* load j atom coordinates */
964             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
965                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
966                                               &jy2,&jz2,&jx3,&jy3,&jz3);
967
968             /* Calculate displacement vector */
969             dx00             = _mm_sub_pd(ix0,jx0);
970             dy00             = _mm_sub_pd(iy0,jy0);
971             dz00             = _mm_sub_pd(iz0,jz0);
972             dx11             = _mm_sub_pd(ix1,jx1);
973             dy11             = _mm_sub_pd(iy1,jy1);
974             dz11             = _mm_sub_pd(iz1,jz1);
975             dx12             = _mm_sub_pd(ix1,jx2);
976             dy12             = _mm_sub_pd(iy1,jy2);
977             dz12             = _mm_sub_pd(iz1,jz2);
978             dx13             = _mm_sub_pd(ix1,jx3);
979             dy13             = _mm_sub_pd(iy1,jy3);
980             dz13             = _mm_sub_pd(iz1,jz3);
981             dx21             = _mm_sub_pd(ix2,jx1);
982             dy21             = _mm_sub_pd(iy2,jy1);
983             dz21             = _mm_sub_pd(iz2,jz1);
984             dx22             = _mm_sub_pd(ix2,jx2);
985             dy22             = _mm_sub_pd(iy2,jy2);
986             dz22             = _mm_sub_pd(iz2,jz2);
987             dx23             = _mm_sub_pd(ix2,jx3);
988             dy23             = _mm_sub_pd(iy2,jy3);
989             dz23             = _mm_sub_pd(iz2,jz3);
990             dx31             = _mm_sub_pd(ix3,jx1);
991             dy31             = _mm_sub_pd(iy3,jy1);
992             dz31             = _mm_sub_pd(iz3,jz1);
993             dx32             = _mm_sub_pd(ix3,jx2);
994             dy32             = _mm_sub_pd(iy3,jy2);
995             dz32             = _mm_sub_pd(iz3,jz2);
996             dx33             = _mm_sub_pd(ix3,jx3);
997             dy33             = _mm_sub_pd(iy3,jy3);
998             dz33             = _mm_sub_pd(iz3,jz3);
999
1000             /* Calculate squared distance and things based on it */
1001             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1002             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1003             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1004             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1005             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1006             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1007             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1008             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1009             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1010             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1011
1012             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1013             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1014             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1015             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1016             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1017             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1018             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1019             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1020             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1021             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1022
1023             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1024             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1025             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1026             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1027             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1028             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1029             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1030             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1031             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1032             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1033
1034             fjx0             = _mm_setzero_pd();
1035             fjy0             = _mm_setzero_pd();
1036             fjz0             = _mm_setzero_pd();
1037             fjx1             = _mm_setzero_pd();
1038             fjy1             = _mm_setzero_pd();
1039             fjz1             = _mm_setzero_pd();
1040             fjx2             = _mm_setzero_pd();
1041             fjy2             = _mm_setzero_pd();
1042             fjz2             = _mm_setzero_pd();
1043             fjx3             = _mm_setzero_pd();
1044             fjy3             = _mm_setzero_pd();
1045             fjz3             = _mm_setzero_pd();
1046
1047             /**************************
1048              * CALCULATE INTERACTIONS *
1049              **************************/
1050
1051             if (gmx_mm_any_lt(rsq00,rcutoff2))
1052             {
1053
1054             r00              = _mm_mul_pd(rsq00,rinv00);
1055
1056             /* LENNARD-JONES DISPERSION/REPULSION */
1057
1058             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1059             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
1060             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1061             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1062             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1063
1064             d                = _mm_sub_pd(r00,rswitch);
1065             d                = _mm_max_pd(d,_mm_setzero_pd());
1066             d2               = _mm_mul_pd(d,d);
1067             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1068
1069             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1070
1071             /* Evaluate switch function */
1072             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1073             fvdw             = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1074             vvdw             = _mm_mul_pd(vvdw,sw);
1075             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
1076
1077             /* Update potential sum for this i atom from the interaction with this j atom. */
1078             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
1079             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
1080             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
1081
1082             fscal            = fvdw;
1083
1084             fscal            = _mm_and_pd(fscal,cutoff_mask);
1085
1086             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1087
1088             /* Calculate temporary vectorial force */
1089             tx               = _mm_mul_pd(fscal,dx00);
1090             ty               = _mm_mul_pd(fscal,dy00);
1091             tz               = _mm_mul_pd(fscal,dz00);
1092
1093             /* Update vectorial force */
1094             fix0             = _mm_add_pd(fix0,tx);
1095             fiy0             = _mm_add_pd(fiy0,ty);
1096             fiz0             = _mm_add_pd(fiz0,tz);
1097
1098             fjx0             = _mm_add_pd(fjx0,tx);
1099             fjy0             = _mm_add_pd(fjy0,ty);
1100             fjz0             = _mm_add_pd(fjz0,tz);
1101
1102             }
1103
1104             /**************************
1105              * CALCULATE INTERACTIONS *
1106              **************************/
1107
1108             if (gmx_mm_any_lt(rsq11,rcutoff2))
1109             {
1110
1111             r11              = _mm_mul_pd(rsq11,rinv11);
1112
1113             /* EWALD ELECTROSTATICS */
1114
1115             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1116             ewrt             = _mm_mul_pd(r11,ewtabscale);
1117             ewitab           = _mm_cvttpd_epi32(ewrt);
1118             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1119             ewitab           = _mm_slli_epi32(ewitab,2);
1120             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1121             ewtabD           = _mm_setzero_pd();
1122             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1123             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1124             ewtabFn          = _mm_setzero_pd();
1125             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1126             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1127             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1128             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1129             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1130
1131             d                = _mm_sub_pd(r11,rswitch);
1132             d                = _mm_max_pd(d,_mm_setzero_pd());
1133             d2               = _mm_mul_pd(d,d);
1134             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1135
1136             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1137
1138             /* Evaluate switch function */
1139             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1140             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1141             velec            = _mm_mul_pd(velec,sw);
1142             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1143
1144             /* Update potential sum for this i atom from the interaction with this j atom. */
1145             velec            = _mm_and_pd(velec,cutoff_mask);
1146             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1147             velecsum         = _mm_add_pd(velecsum,velec);
1148
1149             fscal            = felec;
1150
1151             fscal            = _mm_and_pd(fscal,cutoff_mask);
1152
1153             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1154
1155             /* Calculate temporary vectorial force */
1156             tx               = _mm_mul_pd(fscal,dx11);
1157             ty               = _mm_mul_pd(fscal,dy11);
1158             tz               = _mm_mul_pd(fscal,dz11);
1159
1160             /* Update vectorial force */
1161             fix1             = _mm_add_pd(fix1,tx);
1162             fiy1             = _mm_add_pd(fiy1,ty);
1163             fiz1             = _mm_add_pd(fiz1,tz);
1164
1165             fjx1             = _mm_add_pd(fjx1,tx);
1166             fjy1             = _mm_add_pd(fjy1,ty);
1167             fjz1             = _mm_add_pd(fjz1,tz);
1168
1169             }
1170
1171             /**************************
1172              * CALCULATE INTERACTIONS *
1173              **************************/
1174
1175             if (gmx_mm_any_lt(rsq12,rcutoff2))
1176             {
1177
1178             r12              = _mm_mul_pd(rsq12,rinv12);
1179
1180             /* EWALD ELECTROSTATICS */
1181
1182             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1183             ewrt             = _mm_mul_pd(r12,ewtabscale);
1184             ewitab           = _mm_cvttpd_epi32(ewrt);
1185             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1186             ewitab           = _mm_slli_epi32(ewitab,2);
1187             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1188             ewtabD           = _mm_setzero_pd();
1189             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1190             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1191             ewtabFn          = _mm_setzero_pd();
1192             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1193             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1194             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1195             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1196             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1197
1198             d                = _mm_sub_pd(r12,rswitch);
1199             d                = _mm_max_pd(d,_mm_setzero_pd());
1200             d2               = _mm_mul_pd(d,d);
1201             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1202
1203             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1204
1205             /* Evaluate switch function */
1206             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1207             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1208             velec            = _mm_mul_pd(velec,sw);
1209             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1210
1211             /* Update potential sum for this i atom from the interaction with this j atom. */
1212             velec            = _mm_and_pd(velec,cutoff_mask);
1213             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1214             velecsum         = _mm_add_pd(velecsum,velec);
1215
1216             fscal            = felec;
1217
1218             fscal            = _mm_and_pd(fscal,cutoff_mask);
1219
1220             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1221
1222             /* Calculate temporary vectorial force */
1223             tx               = _mm_mul_pd(fscal,dx12);
1224             ty               = _mm_mul_pd(fscal,dy12);
1225             tz               = _mm_mul_pd(fscal,dz12);
1226
1227             /* Update vectorial force */
1228             fix1             = _mm_add_pd(fix1,tx);
1229             fiy1             = _mm_add_pd(fiy1,ty);
1230             fiz1             = _mm_add_pd(fiz1,tz);
1231
1232             fjx2             = _mm_add_pd(fjx2,tx);
1233             fjy2             = _mm_add_pd(fjy2,ty);
1234             fjz2             = _mm_add_pd(fjz2,tz);
1235
1236             }
1237
1238             /**************************
1239              * CALCULATE INTERACTIONS *
1240              **************************/
1241
1242             if (gmx_mm_any_lt(rsq13,rcutoff2))
1243             {
1244
1245             r13              = _mm_mul_pd(rsq13,rinv13);
1246
1247             /* EWALD ELECTROSTATICS */
1248
1249             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1250             ewrt             = _mm_mul_pd(r13,ewtabscale);
1251             ewitab           = _mm_cvttpd_epi32(ewrt);
1252             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1253             ewitab           = _mm_slli_epi32(ewitab,2);
1254             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1255             ewtabD           = _mm_setzero_pd();
1256             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1257             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1258             ewtabFn          = _mm_setzero_pd();
1259             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1260             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1261             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1262             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1263             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1264
1265             d                = _mm_sub_pd(r13,rswitch);
1266             d                = _mm_max_pd(d,_mm_setzero_pd());
1267             d2               = _mm_mul_pd(d,d);
1268             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1269
1270             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1271
1272             /* Evaluate switch function */
1273             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1274             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
1275             velec            = _mm_mul_pd(velec,sw);
1276             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
1277
1278             /* Update potential sum for this i atom from the interaction with this j atom. */
1279             velec            = _mm_and_pd(velec,cutoff_mask);
1280             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1281             velecsum         = _mm_add_pd(velecsum,velec);
1282
1283             fscal            = felec;
1284
1285             fscal            = _mm_and_pd(fscal,cutoff_mask);
1286
1287             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1288
1289             /* Calculate temporary vectorial force */
1290             tx               = _mm_mul_pd(fscal,dx13);
1291             ty               = _mm_mul_pd(fscal,dy13);
1292             tz               = _mm_mul_pd(fscal,dz13);
1293
1294             /* Update vectorial force */
1295             fix1             = _mm_add_pd(fix1,tx);
1296             fiy1             = _mm_add_pd(fiy1,ty);
1297             fiz1             = _mm_add_pd(fiz1,tz);
1298
1299             fjx3             = _mm_add_pd(fjx3,tx);
1300             fjy3             = _mm_add_pd(fjy3,ty);
1301             fjz3             = _mm_add_pd(fjz3,tz);
1302
1303             }
1304
1305             /**************************
1306              * CALCULATE INTERACTIONS *
1307              **************************/
1308
1309             if (gmx_mm_any_lt(rsq21,rcutoff2))
1310             {
1311
1312             r21              = _mm_mul_pd(rsq21,rinv21);
1313
1314             /* EWALD ELECTROSTATICS */
1315
1316             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1317             ewrt             = _mm_mul_pd(r21,ewtabscale);
1318             ewitab           = _mm_cvttpd_epi32(ewrt);
1319             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1320             ewitab           = _mm_slli_epi32(ewitab,2);
1321             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1322             ewtabD           = _mm_setzero_pd();
1323             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1324             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1325             ewtabFn          = _mm_setzero_pd();
1326             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1327             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1328             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1329             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1330             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1331
1332             d                = _mm_sub_pd(r21,rswitch);
1333             d                = _mm_max_pd(d,_mm_setzero_pd());
1334             d2               = _mm_mul_pd(d,d);
1335             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1336
1337             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1338
1339             /* Evaluate switch function */
1340             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1341             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1342             velec            = _mm_mul_pd(velec,sw);
1343             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1344
1345             /* Update potential sum for this i atom from the interaction with this j atom. */
1346             velec            = _mm_and_pd(velec,cutoff_mask);
1347             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1348             velecsum         = _mm_add_pd(velecsum,velec);
1349
1350             fscal            = felec;
1351
1352             fscal            = _mm_and_pd(fscal,cutoff_mask);
1353
1354             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1355
1356             /* Calculate temporary vectorial force */
1357             tx               = _mm_mul_pd(fscal,dx21);
1358             ty               = _mm_mul_pd(fscal,dy21);
1359             tz               = _mm_mul_pd(fscal,dz21);
1360
1361             /* Update vectorial force */
1362             fix2             = _mm_add_pd(fix2,tx);
1363             fiy2             = _mm_add_pd(fiy2,ty);
1364             fiz2             = _mm_add_pd(fiz2,tz);
1365
1366             fjx1             = _mm_add_pd(fjx1,tx);
1367             fjy1             = _mm_add_pd(fjy1,ty);
1368             fjz1             = _mm_add_pd(fjz1,tz);
1369
1370             }
1371
1372             /**************************
1373              * CALCULATE INTERACTIONS *
1374              **************************/
1375
1376             if (gmx_mm_any_lt(rsq22,rcutoff2))
1377             {
1378
1379             r22              = _mm_mul_pd(rsq22,rinv22);
1380
1381             /* EWALD ELECTROSTATICS */
1382
1383             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1384             ewrt             = _mm_mul_pd(r22,ewtabscale);
1385             ewitab           = _mm_cvttpd_epi32(ewrt);
1386             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1387             ewitab           = _mm_slli_epi32(ewitab,2);
1388             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1389             ewtabD           = _mm_setzero_pd();
1390             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1391             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1392             ewtabFn          = _mm_setzero_pd();
1393             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1394             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1395             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1396             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1397             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1398
1399             d                = _mm_sub_pd(r22,rswitch);
1400             d                = _mm_max_pd(d,_mm_setzero_pd());
1401             d2               = _mm_mul_pd(d,d);
1402             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1403
1404             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1405
1406             /* Evaluate switch function */
1407             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1408             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1409             velec            = _mm_mul_pd(velec,sw);
1410             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1411
1412             /* Update potential sum for this i atom from the interaction with this j atom. */
1413             velec            = _mm_and_pd(velec,cutoff_mask);
1414             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1415             velecsum         = _mm_add_pd(velecsum,velec);
1416
1417             fscal            = felec;
1418
1419             fscal            = _mm_and_pd(fscal,cutoff_mask);
1420
1421             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1422
1423             /* Calculate temporary vectorial force */
1424             tx               = _mm_mul_pd(fscal,dx22);
1425             ty               = _mm_mul_pd(fscal,dy22);
1426             tz               = _mm_mul_pd(fscal,dz22);
1427
1428             /* Update vectorial force */
1429             fix2             = _mm_add_pd(fix2,tx);
1430             fiy2             = _mm_add_pd(fiy2,ty);
1431             fiz2             = _mm_add_pd(fiz2,tz);
1432
1433             fjx2             = _mm_add_pd(fjx2,tx);
1434             fjy2             = _mm_add_pd(fjy2,ty);
1435             fjz2             = _mm_add_pd(fjz2,tz);
1436
1437             }
1438
1439             /**************************
1440              * CALCULATE INTERACTIONS *
1441              **************************/
1442
1443             if (gmx_mm_any_lt(rsq23,rcutoff2))
1444             {
1445
1446             r23              = _mm_mul_pd(rsq23,rinv23);
1447
1448             /* EWALD ELECTROSTATICS */
1449
1450             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1451             ewrt             = _mm_mul_pd(r23,ewtabscale);
1452             ewitab           = _mm_cvttpd_epi32(ewrt);
1453             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1454             ewitab           = _mm_slli_epi32(ewitab,2);
1455             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1456             ewtabD           = _mm_setzero_pd();
1457             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1458             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1459             ewtabFn          = _mm_setzero_pd();
1460             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1461             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1462             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1463             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1464             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1465
1466             d                = _mm_sub_pd(r23,rswitch);
1467             d                = _mm_max_pd(d,_mm_setzero_pd());
1468             d2               = _mm_mul_pd(d,d);
1469             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1470
1471             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1472
1473             /* Evaluate switch function */
1474             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1475             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
1476             velec            = _mm_mul_pd(velec,sw);
1477             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
1478
1479             /* Update potential sum for this i atom from the interaction with this j atom. */
1480             velec            = _mm_and_pd(velec,cutoff_mask);
1481             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1482             velecsum         = _mm_add_pd(velecsum,velec);
1483
1484             fscal            = felec;
1485
1486             fscal            = _mm_and_pd(fscal,cutoff_mask);
1487
1488             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1489
1490             /* Calculate temporary vectorial force */
1491             tx               = _mm_mul_pd(fscal,dx23);
1492             ty               = _mm_mul_pd(fscal,dy23);
1493             tz               = _mm_mul_pd(fscal,dz23);
1494
1495             /* Update vectorial force */
1496             fix2             = _mm_add_pd(fix2,tx);
1497             fiy2             = _mm_add_pd(fiy2,ty);
1498             fiz2             = _mm_add_pd(fiz2,tz);
1499
1500             fjx3             = _mm_add_pd(fjx3,tx);
1501             fjy3             = _mm_add_pd(fjy3,ty);
1502             fjz3             = _mm_add_pd(fjz3,tz);
1503
1504             }
1505
1506             /**************************
1507              * CALCULATE INTERACTIONS *
1508              **************************/
1509
1510             if (gmx_mm_any_lt(rsq31,rcutoff2))
1511             {
1512
1513             r31              = _mm_mul_pd(rsq31,rinv31);
1514
1515             /* EWALD ELECTROSTATICS */
1516
1517             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1518             ewrt             = _mm_mul_pd(r31,ewtabscale);
1519             ewitab           = _mm_cvttpd_epi32(ewrt);
1520             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1521             ewitab           = _mm_slli_epi32(ewitab,2);
1522             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1523             ewtabD           = _mm_setzero_pd();
1524             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1525             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1526             ewtabFn          = _mm_setzero_pd();
1527             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1528             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1529             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1530             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1531             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1532
1533             d                = _mm_sub_pd(r31,rswitch);
1534             d                = _mm_max_pd(d,_mm_setzero_pd());
1535             d2               = _mm_mul_pd(d,d);
1536             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1537
1538             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1539
1540             /* Evaluate switch function */
1541             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1542             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
1543             velec            = _mm_mul_pd(velec,sw);
1544             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
1545
1546             /* Update potential sum for this i atom from the interaction with this j atom. */
1547             velec            = _mm_and_pd(velec,cutoff_mask);
1548             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1549             velecsum         = _mm_add_pd(velecsum,velec);
1550
1551             fscal            = felec;
1552
1553             fscal            = _mm_and_pd(fscal,cutoff_mask);
1554
1555             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1556
1557             /* Calculate temporary vectorial force */
1558             tx               = _mm_mul_pd(fscal,dx31);
1559             ty               = _mm_mul_pd(fscal,dy31);
1560             tz               = _mm_mul_pd(fscal,dz31);
1561
1562             /* Update vectorial force */
1563             fix3             = _mm_add_pd(fix3,tx);
1564             fiy3             = _mm_add_pd(fiy3,ty);
1565             fiz3             = _mm_add_pd(fiz3,tz);
1566
1567             fjx1             = _mm_add_pd(fjx1,tx);
1568             fjy1             = _mm_add_pd(fjy1,ty);
1569             fjz1             = _mm_add_pd(fjz1,tz);
1570
1571             }
1572
1573             /**************************
1574              * CALCULATE INTERACTIONS *
1575              **************************/
1576
1577             if (gmx_mm_any_lt(rsq32,rcutoff2))
1578             {
1579
1580             r32              = _mm_mul_pd(rsq32,rinv32);
1581
1582             /* EWALD ELECTROSTATICS */
1583
1584             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1585             ewrt             = _mm_mul_pd(r32,ewtabscale);
1586             ewitab           = _mm_cvttpd_epi32(ewrt);
1587             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1588             ewitab           = _mm_slli_epi32(ewitab,2);
1589             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1590             ewtabD           = _mm_setzero_pd();
1591             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1592             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1593             ewtabFn          = _mm_setzero_pd();
1594             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1595             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1596             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1597             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1598             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1599
1600             d                = _mm_sub_pd(r32,rswitch);
1601             d                = _mm_max_pd(d,_mm_setzero_pd());
1602             d2               = _mm_mul_pd(d,d);
1603             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1604
1605             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1606
1607             /* Evaluate switch function */
1608             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1609             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
1610             velec            = _mm_mul_pd(velec,sw);
1611             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
1612
1613             /* Update potential sum for this i atom from the interaction with this j atom. */
1614             velec            = _mm_and_pd(velec,cutoff_mask);
1615             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1616             velecsum         = _mm_add_pd(velecsum,velec);
1617
1618             fscal            = felec;
1619
1620             fscal            = _mm_and_pd(fscal,cutoff_mask);
1621
1622             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1623
1624             /* Calculate temporary vectorial force */
1625             tx               = _mm_mul_pd(fscal,dx32);
1626             ty               = _mm_mul_pd(fscal,dy32);
1627             tz               = _mm_mul_pd(fscal,dz32);
1628
1629             /* Update vectorial force */
1630             fix3             = _mm_add_pd(fix3,tx);
1631             fiy3             = _mm_add_pd(fiy3,ty);
1632             fiz3             = _mm_add_pd(fiz3,tz);
1633
1634             fjx2             = _mm_add_pd(fjx2,tx);
1635             fjy2             = _mm_add_pd(fjy2,ty);
1636             fjz2             = _mm_add_pd(fjz2,tz);
1637
1638             }
1639
1640             /**************************
1641              * CALCULATE INTERACTIONS *
1642              **************************/
1643
1644             if (gmx_mm_any_lt(rsq33,rcutoff2))
1645             {
1646
1647             r33              = _mm_mul_pd(rsq33,rinv33);
1648
1649             /* EWALD ELECTROSTATICS */
1650
1651             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1652             ewrt             = _mm_mul_pd(r33,ewtabscale);
1653             ewitab           = _mm_cvttpd_epi32(ewrt);
1654             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1655             ewitab           = _mm_slli_epi32(ewitab,2);
1656             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1657             ewtabD           = _mm_setzero_pd();
1658             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1659             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1660             ewtabFn          = _mm_setzero_pd();
1661             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1662             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1663             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1664             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1665             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1666
1667             d                = _mm_sub_pd(r33,rswitch);
1668             d                = _mm_max_pd(d,_mm_setzero_pd());
1669             d2               = _mm_mul_pd(d,d);
1670             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1671
1672             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1673
1674             /* Evaluate switch function */
1675             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1676             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
1677             velec            = _mm_mul_pd(velec,sw);
1678             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
1679
1680             /* Update potential sum for this i atom from the interaction with this j atom. */
1681             velec            = _mm_and_pd(velec,cutoff_mask);
1682             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1683             velecsum         = _mm_add_pd(velecsum,velec);
1684
1685             fscal            = felec;
1686
1687             fscal            = _mm_and_pd(fscal,cutoff_mask);
1688
1689             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1690
1691             /* Calculate temporary vectorial force */
1692             tx               = _mm_mul_pd(fscal,dx33);
1693             ty               = _mm_mul_pd(fscal,dy33);
1694             tz               = _mm_mul_pd(fscal,dz33);
1695
1696             /* Update vectorial force */
1697             fix3             = _mm_add_pd(fix3,tx);
1698             fiy3             = _mm_add_pd(fiy3,ty);
1699             fiz3             = _mm_add_pd(fiz3,tz);
1700
1701             fjx3             = _mm_add_pd(fjx3,tx);
1702             fjy3             = _mm_add_pd(fjy3,ty);
1703             fjz3             = _mm_add_pd(fjz3,tz);
1704
1705             }
1706
1707             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1708
1709             /* Inner loop uses 647 flops */
1710         }
1711
1712         /* End of innermost loop */
1713
1714         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1715                                               f+i_coord_offset,fshift+i_shift_offset);
1716
1717         ggid                        = gid[iidx];
1718         /* Update potential energies */
1719         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1720         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1721
1722         /* Increment number of inner iterations */
1723         inneriter                  += j_index_end - j_index_start;
1724
1725         /* Outer loop uses 26 flops */
1726     }
1727
1728     /* Increment number of outer iterations */
1729     outeriter        += nri;
1730
1731     /* Update outer/inner flops */
1732
1733     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*647);
1734 }
1735 /*
1736  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_double
1737  * Electrostatics interaction: Ewald
1738  * VdW interaction:            LennardJones
1739  * Geometry:                   Water4-Water4
1740  * Calculate force/pot:        Force
1741  */
1742 void
1743 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_double
1744                     (t_nblist                    * gmx_restrict       nlist,
1745                      rvec                        * gmx_restrict          xx,
1746                      rvec                        * gmx_restrict          ff,
1747                      t_forcerec                  * gmx_restrict          fr,
1748                      t_mdatoms                   * gmx_restrict     mdatoms,
1749                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1750                      t_nrnb                      * gmx_restrict        nrnb)
1751 {
1752     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1753      * just 0 for non-waters.
1754      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1755      * jnr indices corresponding to data put in the four positions in the SIMD register.
1756      */
1757     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1758     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1759     int              jnrA,jnrB;
1760     int              j_coord_offsetA,j_coord_offsetB;
1761     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1762     real             rcutoff_scalar;
1763     real             *shiftvec,*fshift,*x,*f;
1764     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1765     int              vdwioffset0;
1766     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1767     int              vdwioffset1;
1768     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1769     int              vdwioffset2;
1770     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1771     int              vdwioffset3;
1772     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1773     int              vdwjidx0A,vdwjidx0B;
1774     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1775     int              vdwjidx1A,vdwjidx1B;
1776     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1777     int              vdwjidx2A,vdwjidx2B;
1778     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1779     int              vdwjidx3A,vdwjidx3B;
1780     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1781     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1782     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1783     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1784     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1785     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1786     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1787     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1788     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1789     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1790     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1791     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1792     real             *charge;
1793     int              nvdwtype;
1794     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1795     int              *vdwtype;
1796     real             *vdwparam;
1797     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1798     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1799     __m128i          ewitab;
1800     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1801     real             *ewtab;
1802     __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1803     real             rswitch_scalar,d_scalar;
1804     __m128d          dummy_mask,cutoff_mask;
1805     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1806     __m128d          one     = _mm_set1_pd(1.0);
1807     __m128d          two     = _mm_set1_pd(2.0);
1808     x                = xx[0];
1809     f                = ff[0];
1810
1811     nri              = nlist->nri;
1812     iinr             = nlist->iinr;
1813     jindex           = nlist->jindex;
1814     jjnr             = nlist->jjnr;
1815     shiftidx         = nlist->shift;
1816     gid              = nlist->gid;
1817     shiftvec         = fr->shift_vec[0];
1818     fshift           = fr->fshift[0];
1819     facel            = _mm_set1_pd(fr->epsfac);
1820     charge           = mdatoms->chargeA;
1821     nvdwtype         = fr->ntype;
1822     vdwparam         = fr->nbfp;
1823     vdwtype          = mdatoms->typeA;
1824
1825     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
1826     ewtab            = fr->ic->tabq_coul_FDV0;
1827     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
1828     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1829
1830     /* Setup water-specific parameters */
1831     inr              = nlist->iinr[0];
1832     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1833     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1834     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1835     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1836
1837     jq1              = _mm_set1_pd(charge[inr+1]);
1838     jq2              = _mm_set1_pd(charge[inr+2]);
1839     jq3              = _mm_set1_pd(charge[inr+3]);
1840     vdwjidx0A        = 2*vdwtype[inr+0];
1841     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1842     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1843     qq11             = _mm_mul_pd(iq1,jq1);
1844     qq12             = _mm_mul_pd(iq1,jq2);
1845     qq13             = _mm_mul_pd(iq1,jq3);
1846     qq21             = _mm_mul_pd(iq2,jq1);
1847     qq22             = _mm_mul_pd(iq2,jq2);
1848     qq23             = _mm_mul_pd(iq2,jq3);
1849     qq31             = _mm_mul_pd(iq3,jq1);
1850     qq32             = _mm_mul_pd(iq3,jq2);
1851     qq33             = _mm_mul_pd(iq3,jq3);
1852
1853     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1854     rcutoff_scalar   = fr->rcoulomb;
1855     rcutoff          = _mm_set1_pd(rcutoff_scalar);
1856     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
1857
1858     rswitch_scalar   = fr->rcoulomb_switch;
1859     rswitch          = _mm_set1_pd(rswitch_scalar);
1860     /* Setup switch parameters */
1861     d_scalar         = rcutoff_scalar-rswitch_scalar;
1862     d                = _mm_set1_pd(d_scalar);
1863     swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1864     swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1865     swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1866     swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1867     swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1868     swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1869
1870     /* Avoid stupid compiler warnings */
1871     jnrA = jnrB = 0;
1872     j_coord_offsetA = 0;
1873     j_coord_offsetB = 0;
1874
1875     outeriter        = 0;
1876     inneriter        = 0;
1877
1878     /* Start outer loop over neighborlists */
1879     for(iidx=0; iidx<nri; iidx++)
1880     {
1881         /* Load shift vector for this list */
1882         i_shift_offset   = DIM*shiftidx[iidx];
1883
1884         /* Load limits for loop over neighbors */
1885         j_index_start    = jindex[iidx];
1886         j_index_end      = jindex[iidx+1];
1887
1888         /* Get outer coordinate index */
1889         inr              = iinr[iidx];
1890         i_coord_offset   = DIM*inr;
1891
1892         /* Load i particle coords and add shift vector */
1893         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1894                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1895
1896         fix0             = _mm_setzero_pd();
1897         fiy0             = _mm_setzero_pd();
1898         fiz0             = _mm_setzero_pd();
1899         fix1             = _mm_setzero_pd();
1900         fiy1             = _mm_setzero_pd();
1901         fiz1             = _mm_setzero_pd();
1902         fix2             = _mm_setzero_pd();
1903         fiy2             = _mm_setzero_pd();
1904         fiz2             = _mm_setzero_pd();
1905         fix3             = _mm_setzero_pd();
1906         fiy3             = _mm_setzero_pd();
1907         fiz3             = _mm_setzero_pd();
1908
1909         /* Start inner kernel loop */
1910         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1911         {
1912
1913             /* Get j neighbor index, and coordinate index */
1914             jnrA             = jjnr[jidx];
1915             jnrB             = jjnr[jidx+1];
1916             j_coord_offsetA  = DIM*jnrA;
1917             j_coord_offsetB  = DIM*jnrB;
1918
1919             /* load j atom coordinates */
1920             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1921                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1922                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1923
1924             /* Calculate displacement vector */
1925             dx00             = _mm_sub_pd(ix0,jx0);
1926             dy00             = _mm_sub_pd(iy0,jy0);
1927             dz00             = _mm_sub_pd(iz0,jz0);
1928             dx11             = _mm_sub_pd(ix1,jx1);
1929             dy11             = _mm_sub_pd(iy1,jy1);
1930             dz11             = _mm_sub_pd(iz1,jz1);
1931             dx12             = _mm_sub_pd(ix1,jx2);
1932             dy12             = _mm_sub_pd(iy1,jy2);
1933             dz12             = _mm_sub_pd(iz1,jz2);
1934             dx13             = _mm_sub_pd(ix1,jx3);
1935             dy13             = _mm_sub_pd(iy1,jy3);
1936             dz13             = _mm_sub_pd(iz1,jz3);
1937             dx21             = _mm_sub_pd(ix2,jx1);
1938             dy21             = _mm_sub_pd(iy2,jy1);
1939             dz21             = _mm_sub_pd(iz2,jz1);
1940             dx22             = _mm_sub_pd(ix2,jx2);
1941             dy22             = _mm_sub_pd(iy2,jy2);
1942             dz22             = _mm_sub_pd(iz2,jz2);
1943             dx23             = _mm_sub_pd(ix2,jx3);
1944             dy23             = _mm_sub_pd(iy2,jy3);
1945             dz23             = _mm_sub_pd(iz2,jz3);
1946             dx31             = _mm_sub_pd(ix3,jx1);
1947             dy31             = _mm_sub_pd(iy3,jy1);
1948             dz31             = _mm_sub_pd(iz3,jz1);
1949             dx32             = _mm_sub_pd(ix3,jx2);
1950             dy32             = _mm_sub_pd(iy3,jy2);
1951             dz32             = _mm_sub_pd(iz3,jz2);
1952             dx33             = _mm_sub_pd(ix3,jx3);
1953             dy33             = _mm_sub_pd(iy3,jy3);
1954             dz33             = _mm_sub_pd(iz3,jz3);
1955
1956             /* Calculate squared distance and things based on it */
1957             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1958             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1959             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1960             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1961             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1962             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1963             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1964             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1965             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1966             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1967
1968             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1969             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1970             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1971             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1972             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1973             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1974             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1975             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1976             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1977             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1978
1979             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1980             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1981             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1982             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1983             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1984             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1985             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1986             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1987             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1988             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1989
1990             fjx0             = _mm_setzero_pd();
1991             fjy0             = _mm_setzero_pd();
1992             fjz0             = _mm_setzero_pd();
1993             fjx1             = _mm_setzero_pd();
1994             fjy1             = _mm_setzero_pd();
1995             fjz1             = _mm_setzero_pd();
1996             fjx2             = _mm_setzero_pd();
1997             fjy2             = _mm_setzero_pd();
1998             fjz2             = _mm_setzero_pd();
1999             fjx3             = _mm_setzero_pd();
2000             fjy3             = _mm_setzero_pd();
2001             fjz3             = _mm_setzero_pd();
2002
2003             /**************************
2004              * CALCULATE INTERACTIONS *
2005              **************************/
2006
2007             if (gmx_mm_any_lt(rsq00,rcutoff2))
2008             {
2009
2010             r00              = _mm_mul_pd(rsq00,rinv00);
2011
2012             /* LENNARD-JONES DISPERSION/REPULSION */
2013
2014             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2015             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
2016             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2017             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
2018             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2019
2020             d                = _mm_sub_pd(r00,rswitch);
2021             d                = _mm_max_pd(d,_mm_setzero_pd());
2022             d2               = _mm_mul_pd(d,d);
2023             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2024
2025             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2026
2027             /* Evaluate switch function */
2028             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2029             fvdw             = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2030             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
2031
2032             fscal            = fvdw;
2033
2034             fscal            = _mm_and_pd(fscal,cutoff_mask);
2035
2036             /* Calculate temporary vectorial force */
2037             tx               = _mm_mul_pd(fscal,dx00);
2038             ty               = _mm_mul_pd(fscal,dy00);
2039             tz               = _mm_mul_pd(fscal,dz00);
2040
2041             /* Update vectorial force */
2042             fix0             = _mm_add_pd(fix0,tx);
2043             fiy0             = _mm_add_pd(fiy0,ty);
2044             fiz0             = _mm_add_pd(fiz0,tz);
2045
2046             fjx0             = _mm_add_pd(fjx0,tx);
2047             fjy0             = _mm_add_pd(fjy0,ty);
2048             fjz0             = _mm_add_pd(fjz0,tz);
2049
2050             }
2051
2052             /**************************
2053              * CALCULATE INTERACTIONS *
2054              **************************/
2055
2056             if (gmx_mm_any_lt(rsq11,rcutoff2))
2057             {
2058
2059             r11              = _mm_mul_pd(rsq11,rinv11);
2060
2061             /* EWALD ELECTROSTATICS */
2062
2063             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2064             ewrt             = _mm_mul_pd(r11,ewtabscale);
2065             ewitab           = _mm_cvttpd_epi32(ewrt);
2066             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2067             ewitab           = _mm_slli_epi32(ewitab,2);
2068             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2069             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2070             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2071             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2072             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2073             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2074             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2075             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2076             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2077             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2078
2079             d                = _mm_sub_pd(r11,rswitch);
2080             d                = _mm_max_pd(d,_mm_setzero_pd());
2081             d2               = _mm_mul_pd(d,d);
2082             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2083
2084             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2085
2086             /* Evaluate switch function */
2087             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2088             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2089             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2090
2091             fscal            = felec;
2092
2093             fscal            = _mm_and_pd(fscal,cutoff_mask);
2094
2095             /* Calculate temporary vectorial force */
2096             tx               = _mm_mul_pd(fscal,dx11);
2097             ty               = _mm_mul_pd(fscal,dy11);
2098             tz               = _mm_mul_pd(fscal,dz11);
2099
2100             /* Update vectorial force */
2101             fix1             = _mm_add_pd(fix1,tx);
2102             fiy1             = _mm_add_pd(fiy1,ty);
2103             fiz1             = _mm_add_pd(fiz1,tz);
2104
2105             fjx1             = _mm_add_pd(fjx1,tx);
2106             fjy1             = _mm_add_pd(fjy1,ty);
2107             fjz1             = _mm_add_pd(fjz1,tz);
2108
2109             }
2110
2111             /**************************
2112              * CALCULATE INTERACTIONS *
2113              **************************/
2114
2115             if (gmx_mm_any_lt(rsq12,rcutoff2))
2116             {
2117
2118             r12              = _mm_mul_pd(rsq12,rinv12);
2119
2120             /* EWALD ELECTROSTATICS */
2121
2122             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2123             ewrt             = _mm_mul_pd(r12,ewtabscale);
2124             ewitab           = _mm_cvttpd_epi32(ewrt);
2125             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2126             ewitab           = _mm_slli_epi32(ewitab,2);
2127             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2128             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2129             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2130             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2131             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2132             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2133             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2134             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2135             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2136             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2137
2138             d                = _mm_sub_pd(r12,rswitch);
2139             d                = _mm_max_pd(d,_mm_setzero_pd());
2140             d2               = _mm_mul_pd(d,d);
2141             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2142
2143             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2144
2145             /* Evaluate switch function */
2146             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2147             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2148             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2149
2150             fscal            = felec;
2151
2152             fscal            = _mm_and_pd(fscal,cutoff_mask);
2153
2154             /* Calculate temporary vectorial force */
2155             tx               = _mm_mul_pd(fscal,dx12);
2156             ty               = _mm_mul_pd(fscal,dy12);
2157             tz               = _mm_mul_pd(fscal,dz12);
2158
2159             /* Update vectorial force */
2160             fix1             = _mm_add_pd(fix1,tx);
2161             fiy1             = _mm_add_pd(fiy1,ty);
2162             fiz1             = _mm_add_pd(fiz1,tz);
2163
2164             fjx2             = _mm_add_pd(fjx2,tx);
2165             fjy2             = _mm_add_pd(fjy2,ty);
2166             fjz2             = _mm_add_pd(fjz2,tz);
2167
2168             }
2169
2170             /**************************
2171              * CALCULATE INTERACTIONS *
2172              **************************/
2173
2174             if (gmx_mm_any_lt(rsq13,rcutoff2))
2175             {
2176
2177             r13              = _mm_mul_pd(rsq13,rinv13);
2178
2179             /* EWALD ELECTROSTATICS */
2180
2181             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2182             ewrt             = _mm_mul_pd(r13,ewtabscale);
2183             ewitab           = _mm_cvttpd_epi32(ewrt);
2184             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2185             ewitab           = _mm_slli_epi32(ewitab,2);
2186             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2187             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2188             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2189             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2190             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2191             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2192             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2193             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2194             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2195             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2196
2197             d                = _mm_sub_pd(r13,rswitch);
2198             d                = _mm_max_pd(d,_mm_setzero_pd());
2199             d2               = _mm_mul_pd(d,d);
2200             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2201
2202             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2203
2204             /* Evaluate switch function */
2205             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2206             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2207             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
2208
2209             fscal            = felec;
2210
2211             fscal            = _mm_and_pd(fscal,cutoff_mask);
2212
2213             /* Calculate temporary vectorial force */
2214             tx               = _mm_mul_pd(fscal,dx13);
2215             ty               = _mm_mul_pd(fscal,dy13);
2216             tz               = _mm_mul_pd(fscal,dz13);
2217
2218             /* Update vectorial force */
2219             fix1             = _mm_add_pd(fix1,tx);
2220             fiy1             = _mm_add_pd(fiy1,ty);
2221             fiz1             = _mm_add_pd(fiz1,tz);
2222
2223             fjx3             = _mm_add_pd(fjx3,tx);
2224             fjy3             = _mm_add_pd(fjy3,ty);
2225             fjz3             = _mm_add_pd(fjz3,tz);
2226
2227             }
2228
2229             /**************************
2230              * CALCULATE INTERACTIONS *
2231              **************************/
2232
2233             if (gmx_mm_any_lt(rsq21,rcutoff2))
2234             {
2235
2236             r21              = _mm_mul_pd(rsq21,rinv21);
2237
2238             /* EWALD ELECTROSTATICS */
2239
2240             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2241             ewrt             = _mm_mul_pd(r21,ewtabscale);
2242             ewitab           = _mm_cvttpd_epi32(ewrt);
2243             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2244             ewitab           = _mm_slli_epi32(ewitab,2);
2245             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2246             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2247             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2248             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2249             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2250             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2251             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2252             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2253             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2254             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2255
2256             d                = _mm_sub_pd(r21,rswitch);
2257             d                = _mm_max_pd(d,_mm_setzero_pd());
2258             d2               = _mm_mul_pd(d,d);
2259             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2260
2261             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2262
2263             /* Evaluate switch function */
2264             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2265             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2266             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2267
2268             fscal            = felec;
2269
2270             fscal            = _mm_and_pd(fscal,cutoff_mask);
2271
2272             /* Calculate temporary vectorial force */
2273             tx               = _mm_mul_pd(fscal,dx21);
2274             ty               = _mm_mul_pd(fscal,dy21);
2275             tz               = _mm_mul_pd(fscal,dz21);
2276
2277             /* Update vectorial force */
2278             fix2             = _mm_add_pd(fix2,tx);
2279             fiy2             = _mm_add_pd(fiy2,ty);
2280             fiz2             = _mm_add_pd(fiz2,tz);
2281
2282             fjx1             = _mm_add_pd(fjx1,tx);
2283             fjy1             = _mm_add_pd(fjy1,ty);
2284             fjz1             = _mm_add_pd(fjz1,tz);
2285
2286             }
2287
2288             /**************************
2289              * CALCULATE INTERACTIONS *
2290              **************************/
2291
2292             if (gmx_mm_any_lt(rsq22,rcutoff2))
2293             {
2294
2295             r22              = _mm_mul_pd(rsq22,rinv22);
2296
2297             /* EWALD ELECTROSTATICS */
2298
2299             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2300             ewrt             = _mm_mul_pd(r22,ewtabscale);
2301             ewitab           = _mm_cvttpd_epi32(ewrt);
2302             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2303             ewitab           = _mm_slli_epi32(ewitab,2);
2304             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2305             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2306             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2307             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2308             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2309             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2310             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2311             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2312             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2313             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2314
2315             d                = _mm_sub_pd(r22,rswitch);
2316             d                = _mm_max_pd(d,_mm_setzero_pd());
2317             d2               = _mm_mul_pd(d,d);
2318             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2319
2320             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2321
2322             /* Evaluate switch function */
2323             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2324             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2325             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2326
2327             fscal            = felec;
2328
2329             fscal            = _mm_and_pd(fscal,cutoff_mask);
2330
2331             /* Calculate temporary vectorial force */
2332             tx               = _mm_mul_pd(fscal,dx22);
2333             ty               = _mm_mul_pd(fscal,dy22);
2334             tz               = _mm_mul_pd(fscal,dz22);
2335
2336             /* Update vectorial force */
2337             fix2             = _mm_add_pd(fix2,tx);
2338             fiy2             = _mm_add_pd(fiy2,ty);
2339             fiz2             = _mm_add_pd(fiz2,tz);
2340
2341             fjx2             = _mm_add_pd(fjx2,tx);
2342             fjy2             = _mm_add_pd(fjy2,ty);
2343             fjz2             = _mm_add_pd(fjz2,tz);
2344
2345             }
2346
2347             /**************************
2348              * CALCULATE INTERACTIONS *
2349              **************************/
2350
2351             if (gmx_mm_any_lt(rsq23,rcutoff2))
2352             {
2353
2354             r23              = _mm_mul_pd(rsq23,rinv23);
2355
2356             /* EWALD ELECTROSTATICS */
2357
2358             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2359             ewrt             = _mm_mul_pd(r23,ewtabscale);
2360             ewitab           = _mm_cvttpd_epi32(ewrt);
2361             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2362             ewitab           = _mm_slli_epi32(ewitab,2);
2363             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2364             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2365             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2366             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2367             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2368             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2369             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2370             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2371             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
2372             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2373
2374             d                = _mm_sub_pd(r23,rswitch);
2375             d                = _mm_max_pd(d,_mm_setzero_pd());
2376             d2               = _mm_mul_pd(d,d);
2377             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2378
2379             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2380
2381             /* Evaluate switch function */
2382             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2383             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
2384             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
2385
2386             fscal            = felec;
2387
2388             fscal            = _mm_and_pd(fscal,cutoff_mask);
2389
2390             /* Calculate temporary vectorial force */
2391             tx               = _mm_mul_pd(fscal,dx23);
2392             ty               = _mm_mul_pd(fscal,dy23);
2393             tz               = _mm_mul_pd(fscal,dz23);
2394
2395             /* Update vectorial force */
2396             fix2             = _mm_add_pd(fix2,tx);
2397             fiy2             = _mm_add_pd(fiy2,ty);
2398             fiz2             = _mm_add_pd(fiz2,tz);
2399
2400             fjx3             = _mm_add_pd(fjx3,tx);
2401             fjy3             = _mm_add_pd(fjy3,ty);
2402             fjz3             = _mm_add_pd(fjz3,tz);
2403
2404             }
2405
2406             /**************************
2407              * CALCULATE INTERACTIONS *
2408              **************************/
2409
2410             if (gmx_mm_any_lt(rsq31,rcutoff2))
2411             {
2412
2413             r31              = _mm_mul_pd(rsq31,rinv31);
2414
2415             /* EWALD ELECTROSTATICS */
2416
2417             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2418             ewrt             = _mm_mul_pd(r31,ewtabscale);
2419             ewitab           = _mm_cvttpd_epi32(ewrt);
2420             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2421             ewitab           = _mm_slli_epi32(ewitab,2);
2422             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2423             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2424             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2425             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2426             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2427             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2428             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2429             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2430             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
2431             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2432
2433             d                = _mm_sub_pd(r31,rswitch);
2434             d                = _mm_max_pd(d,_mm_setzero_pd());
2435             d2               = _mm_mul_pd(d,d);
2436             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2437
2438             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2439
2440             /* Evaluate switch function */
2441             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2442             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
2443             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
2444
2445             fscal            = felec;
2446
2447             fscal            = _mm_and_pd(fscal,cutoff_mask);
2448
2449             /* Calculate temporary vectorial force */
2450             tx               = _mm_mul_pd(fscal,dx31);
2451             ty               = _mm_mul_pd(fscal,dy31);
2452             tz               = _mm_mul_pd(fscal,dz31);
2453
2454             /* Update vectorial force */
2455             fix3             = _mm_add_pd(fix3,tx);
2456             fiy3             = _mm_add_pd(fiy3,ty);
2457             fiz3             = _mm_add_pd(fiz3,tz);
2458
2459             fjx1             = _mm_add_pd(fjx1,tx);
2460             fjy1             = _mm_add_pd(fjy1,ty);
2461             fjz1             = _mm_add_pd(fjz1,tz);
2462
2463             }
2464
2465             /**************************
2466              * CALCULATE INTERACTIONS *
2467              **************************/
2468
2469             if (gmx_mm_any_lt(rsq32,rcutoff2))
2470             {
2471
2472             r32              = _mm_mul_pd(rsq32,rinv32);
2473
2474             /* EWALD ELECTROSTATICS */
2475
2476             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2477             ewrt             = _mm_mul_pd(r32,ewtabscale);
2478             ewitab           = _mm_cvttpd_epi32(ewrt);
2479             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2480             ewitab           = _mm_slli_epi32(ewitab,2);
2481             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2482             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2483             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2484             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2485             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2486             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2487             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2488             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2489             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
2490             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2491
2492             d                = _mm_sub_pd(r32,rswitch);
2493             d                = _mm_max_pd(d,_mm_setzero_pd());
2494             d2               = _mm_mul_pd(d,d);
2495             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2496
2497             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2498
2499             /* Evaluate switch function */
2500             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2501             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
2502             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
2503
2504             fscal            = felec;
2505
2506             fscal            = _mm_and_pd(fscal,cutoff_mask);
2507
2508             /* Calculate temporary vectorial force */
2509             tx               = _mm_mul_pd(fscal,dx32);
2510             ty               = _mm_mul_pd(fscal,dy32);
2511             tz               = _mm_mul_pd(fscal,dz32);
2512
2513             /* Update vectorial force */
2514             fix3             = _mm_add_pd(fix3,tx);
2515             fiy3             = _mm_add_pd(fiy3,ty);
2516             fiz3             = _mm_add_pd(fiz3,tz);
2517
2518             fjx2             = _mm_add_pd(fjx2,tx);
2519             fjy2             = _mm_add_pd(fjy2,ty);
2520             fjz2             = _mm_add_pd(fjz2,tz);
2521
2522             }
2523
2524             /**************************
2525              * CALCULATE INTERACTIONS *
2526              **************************/
2527
2528             if (gmx_mm_any_lt(rsq33,rcutoff2))
2529             {
2530
2531             r33              = _mm_mul_pd(rsq33,rinv33);
2532
2533             /* EWALD ELECTROSTATICS */
2534
2535             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2536             ewrt             = _mm_mul_pd(r33,ewtabscale);
2537             ewitab           = _mm_cvttpd_epi32(ewrt);
2538             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2539             ewitab           = _mm_slli_epi32(ewitab,2);
2540             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2541             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2542             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2543             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2544             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2545             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2546             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2547             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2548             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
2549             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2550
2551             d                = _mm_sub_pd(r33,rswitch);
2552             d                = _mm_max_pd(d,_mm_setzero_pd());
2553             d2               = _mm_mul_pd(d,d);
2554             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2555
2556             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2557
2558             /* Evaluate switch function */
2559             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2560             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
2561             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
2562
2563             fscal            = felec;
2564
2565             fscal            = _mm_and_pd(fscal,cutoff_mask);
2566
2567             /* Calculate temporary vectorial force */
2568             tx               = _mm_mul_pd(fscal,dx33);
2569             ty               = _mm_mul_pd(fscal,dy33);
2570             tz               = _mm_mul_pd(fscal,dz33);
2571
2572             /* Update vectorial force */
2573             fix3             = _mm_add_pd(fix3,tx);
2574             fiy3             = _mm_add_pd(fiy3,ty);
2575             fiz3             = _mm_add_pd(fiz3,tz);
2576
2577             fjx3             = _mm_add_pd(fjx3,tx);
2578             fjy3             = _mm_add_pd(fjy3,ty);
2579             fjz3             = _mm_add_pd(fjz3,tz);
2580
2581             }
2582
2583             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2584
2585             /* Inner loop uses 617 flops */
2586         }
2587
2588         if(jidx<j_index_end)
2589         {
2590
2591             jnrA             = jjnr[jidx];
2592             j_coord_offsetA  = DIM*jnrA;
2593
2594             /* load j atom coordinates */
2595             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2596                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2597                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2598
2599             /* Calculate displacement vector */
2600             dx00             = _mm_sub_pd(ix0,jx0);
2601             dy00             = _mm_sub_pd(iy0,jy0);
2602             dz00             = _mm_sub_pd(iz0,jz0);
2603             dx11             = _mm_sub_pd(ix1,jx1);
2604             dy11             = _mm_sub_pd(iy1,jy1);
2605             dz11             = _mm_sub_pd(iz1,jz1);
2606             dx12             = _mm_sub_pd(ix1,jx2);
2607             dy12             = _mm_sub_pd(iy1,jy2);
2608             dz12             = _mm_sub_pd(iz1,jz2);
2609             dx13             = _mm_sub_pd(ix1,jx3);
2610             dy13             = _mm_sub_pd(iy1,jy3);
2611             dz13             = _mm_sub_pd(iz1,jz3);
2612             dx21             = _mm_sub_pd(ix2,jx1);
2613             dy21             = _mm_sub_pd(iy2,jy1);
2614             dz21             = _mm_sub_pd(iz2,jz1);
2615             dx22             = _mm_sub_pd(ix2,jx2);
2616             dy22             = _mm_sub_pd(iy2,jy2);
2617             dz22             = _mm_sub_pd(iz2,jz2);
2618             dx23             = _mm_sub_pd(ix2,jx3);
2619             dy23             = _mm_sub_pd(iy2,jy3);
2620             dz23             = _mm_sub_pd(iz2,jz3);
2621             dx31             = _mm_sub_pd(ix3,jx1);
2622             dy31             = _mm_sub_pd(iy3,jy1);
2623             dz31             = _mm_sub_pd(iz3,jz1);
2624             dx32             = _mm_sub_pd(ix3,jx2);
2625             dy32             = _mm_sub_pd(iy3,jy2);
2626             dz32             = _mm_sub_pd(iz3,jz2);
2627             dx33             = _mm_sub_pd(ix3,jx3);
2628             dy33             = _mm_sub_pd(iy3,jy3);
2629             dz33             = _mm_sub_pd(iz3,jz3);
2630
2631             /* Calculate squared distance and things based on it */
2632             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2633             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2634             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2635             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2636             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2637             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2638             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2639             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2640             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2641             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2642
2643             rinv00           = gmx_mm_invsqrt_pd(rsq00);
2644             rinv11           = gmx_mm_invsqrt_pd(rsq11);
2645             rinv12           = gmx_mm_invsqrt_pd(rsq12);
2646             rinv13           = gmx_mm_invsqrt_pd(rsq13);
2647             rinv21           = gmx_mm_invsqrt_pd(rsq21);
2648             rinv22           = gmx_mm_invsqrt_pd(rsq22);
2649             rinv23           = gmx_mm_invsqrt_pd(rsq23);
2650             rinv31           = gmx_mm_invsqrt_pd(rsq31);
2651             rinv32           = gmx_mm_invsqrt_pd(rsq32);
2652             rinv33           = gmx_mm_invsqrt_pd(rsq33);
2653
2654             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
2655             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
2656             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
2657             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
2658             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
2659             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
2660             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
2661             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
2662             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
2663             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
2664
2665             fjx0             = _mm_setzero_pd();
2666             fjy0             = _mm_setzero_pd();
2667             fjz0             = _mm_setzero_pd();
2668             fjx1             = _mm_setzero_pd();
2669             fjy1             = _mm_setzero_pd();
2670             fjz1             = _mm_setzero_pd();
2671             fjx2             = _mm_setzero_pd();
2672             fjy2             = _mm_setzero_pd();
2673             fjz2             = _mm_setzero_pd();
2674             fjx3             = _mm_setzero_pd();
2675             fjy3             = _mm_setzero_pd();
2676             fjz3             = _mm_setzero_pd();
2677
2678             /**************************
2679              * CALCULATE INTERACTIONS *
2680              **************************/
2681
2682             if (gmx_mm_any_lt(rsq00,rcutoff2))
2683             {
2684
2685             r00              = _mm_mul_pd(rsq00,rinv00);
2686
2687             /* LENNARD-JONES DISPERSION/REPULSION */
2688
2689             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2690             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
2691             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2692             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
2693             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2694
2695             d                = _mm_sub_pd(r00,rswitch);
2696             d                = _mm_max_pd(d,_mm_setzero_pd());
2697             d2               = _mm_mul_pd(d,d);
2698             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2699
2700             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2701
2702             /* Evaluate switch function */
2703             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2704             fvdw             = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2705             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
2706
2707             fscal            = fvdw;
2708
2709             fscal            = _mm_and_pd(fscal,cutoff_mask);
2710
2711             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2712
2713             /* Calculate temporary vectorial force */
2714             tx               = _mm_mul_pd(fscal,dx00);
2715             ty               = _mm_mul_pd(fscal,dy00);
2716             tz               = _mm_mul_pd(fscal,dz00);
2717
2718             /* Update vectorial force */
2719             fix0             = _mm_add_pd(fix0,tx);
2720             fiy0             = _mm_add_pd(fiy0,ty);
2721             fiz0             = _mm_add_pd(fiz0,tz);
2722
2723             fjx0             = _mm_add_pd(fjx0,tx);
2724             fjy0             = _mm_add_pd(fjy0,ty);
2725             fjz0             = _mm_add_pd(fjz0,tz);
2726
2727             }
2728
2729             /**************************
2730              * CALCULATE INTERACTIONS *
2731              **************************/
2732
2733             if (gmx_mm_any_lt(rsq11,rcutoff2))
2734             {
2735
2736             r11              = _mm_mul_pd(rsq11,rinv11);
2737
2738             /* EWALD ELECTROSTATICS */
2739
2740             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2741             ewrt             = _mm_mul_pd(r11,ewtabscale);
2742             ewitab           = _mm_cvttpd_epi32(ewrt);
2743             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2744             ewitab           = _mm_slli_epi32(ewitab,2);
2745             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2746             ewtabD           = _mm_setzero_pd();
2747             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2748             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2749             ewtabFn          = _mm_setzero_pd();
2750             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2751             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2752             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2753             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2754             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2755
2756             d                = _mm_sub_pd(r11,rswitch);
2757             d                = _mm_max_pd(d,_mm_setzero_pd());
2758             d2               = _mm_mul_pd(d,d);
2759             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2760
2761             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2762
2763             /* Evaluate switch function */
2764             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2765             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2766             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2767
2768             fscal            = felec;
2769
2770             fscal            = _mm_and_pd(fscal,cutoff_mask);
2771
2772             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2773
2774             /* Calculate temporary vectorial force */
2775             tx               = _mm_mul_pd(fscal,dx11);
2776             ty               = _mm_mul_pd(fscal,dy11);
2777             tz               = _mm_mul_pd(fscal,dz11);
2778
2779             /* Update vectorial force */
2780             fix1             = _mm_add_pd(fix1,tx);
2781             fiy1             = _mm_add_pd(fiy1,ty);
2782             fiz1             = _mm_add_pd(fiz1,tz);
2783
2784             fjx1             = _mm_add_pd(fjx1,tx);
2785             fjy1             = _mm_add_pd(fjy1,ty);
2786             fjz1             = _mm_add_pd(fjz1,tz);
2787
2788             }
2789
2790             /**************************
2791              * CALCULATE INTERACTIONS *
2792              **************************/
2793
2794             if (gmx_mm_any_lt(rsq12,rcutoff2))
2795             {
2796
2797             r12              = _mm_mul_pd(rsq12,rinv12);
2798
2799             /* EWALD ELECTROSTATICS */
2800
2801             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2802             ewrt             = _mm_mul_pd(r12,ewtabscale);
2803             ewitab           = _mm_cvttpd_epi32(ewrt);
2804             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2805             ewitab           = _mm_slli_epi32(ewitab,2);
2806             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2807             ewtabD           = _mm_setzero_pd();
2808             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2809             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2810             ewtabFn          = _mm_setzero_pd();
2811             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2812             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2813             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2814             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2815             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2816
2817             d                = _mm_sub_pd(r12,rswitch);
2818             d                = _mm_max_pd(d,_mm_setzero_pd());
2819             d2               = _mm_mul_pd(d,d);
2820             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2821
2822             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2823
2824             /* Evaluate switch function */
2825             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2826             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2827             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2828
2829             fscal            = felec;
2830
2831             fscal            = _mm_and_pd(fscal,cutoff_mask);
2832
2833             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2834
2835             /* Calculate temporary vectorial force */
2836             tx               = _mm_mul_pd(fscal,dx12);
2837             ty               = _mm_mul_pd(fscal,dy12);
2838             tz               = _mm_mul_pd(fscal,dz12);
2839
2840             /* Update vectorial force */
2841             fix1             = _mm_add_pd(fix1,tx);
2842             fiy1             = _mm_add_pd(fiy1,ty);
2843             fiz1             = _mm_add_pd(fiz1,tz);
2844
2845             fjx2             = _mm_add_pd(fjx2,tx);
2846             fjy2             = _mm_add_pd(fjy2,ty);
2847             fjz2             = _mm_add_pd(fjz2,tz);
2848
2849             }
2850
2851             /**************************
2852              * CALCULATE INTERACTIONS *
2853              **************************/
2854
2855             if (gmx_mm_any_lt(rsq13,rcutoff2))
2856             {
2857
2858             r13              = _mm_mul_pd(rsq13,rinv13);
2859
2860             /* EWALD ELECTROSTATICS */
2861
2862             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2863             ewrt             = _mm_mul_pd(r13,ewtabscale);
2864             ewitab           = _mm_cvttpd_epi32(ewrt);
2865             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2866             ewitab           = _mm_slli_epi32(ewitab,2);
2867             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2868             ewtabD           = _mm_setzero_pd();
2869             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2870             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2871             ewtabFn          = _mm_setzero_pd();
2872             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2873             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2874             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2875             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2876             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2877
2878             d                = _mm_sub_pd(r13,rswitch);
2879             d                = _mm_max_pd(d,_mm_setzero_pd());
2880             d2               = _mm_mul_pd(d,d);
2881             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2882
2883             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2884
2885             /* Evaluate switch function */
2886             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2887             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2888             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
2889
2890             fscal            = felec;
2891
2892             fscal            = _mm_and_pd(fscal,cutoff_mask);
2893
2894             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2895
2896             /* Calculate temporary vectorial force */
2897             tx               = _mm_mul_pd(fscal,dx13);
2898             ty               = _mm_mul_pd(fscal,dy13);
2899             tz               = _mm_mul_pd(fscal,dz13);
2900
2901             /* Update vectorial force */
2902             fix1             = _mm_add_pd(fix1,tx);
2903             fiy1             = _mm_add_pd(fiy1,ty);
2904             fiz1             = _mm_add_pd(fiz1,tz);
2905
2906             fjx3             = _mm_add_pd(fjx3,tx);
2907             fjy3             = _mm_add_pd(fjy3,ty);
2908             fjz3             = _mm_add_pd(fjz3,tz);
2909
2910             }
2911
2912             /**************************
2913              * CALCULATE INTERACTIONS *
2914              **************************/
2915
2916             if (gmx_mm_any_lt(rsq21,rcutoff2))
2917             {
2918
2919             r21              = _mm_mul_pd(rsq21,rinv21);
2920
2921             /* EWALD ELECTROSTATICS */
2922
2923             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2924             ewrt             = _mm_mul_pd(r21,ewtabscale);
2925             ewitab           = _mm_cvttpd_epi32(ewrt);
2926             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2927             ewitab           = _mm_slli_epi32(ewitab,2);
2928             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2929             ewtabD           = _mm_setzero_pd();
2930             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2931             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2932             ewtabFn          = _mm_setzero_pd();
2933             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2934             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2935             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2936             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2937             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2938
2939             d                = _mm_sub_pd(r21,rswitch);
2940             d                = _mm_max_pd(d,_mm_setzero_pd());
2941             d2               = _mm_mul_pd(d,d);
2942             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2943
2944             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2945
2946             /* Evaluate switch function */
2947             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2948             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2949             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2950
2951             fscal            = felec;
2952
2953             fscal            = _mm_and_pd(fscal,cutoff_mask);
2954
2955             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2956
2957             /* Calculate temporary vectorial force */
2958             tx               = _mm_mul_pd(fscal,dx21);
2959             ty               = _mm_mul_pd(fscal,dy21);
2960             tz               = _mm_mul_pd(fscal,dz21);
2961
2962             /* Update vectorial force */
2963             fix2             = _mm_add_pd(fix2,tx);
2964             fiy2             = _mm_add_pd(fiy2,ty);
2965             fiz2             = _mm_add_pd(fiz2,tz);
2966
2967             fjx1             = _mm_add_pd(fjx1,tx);
2968             fjy1             = _mm_add_pd(fjy1,ty);
2969             fjz1             = _mm_add_pd(fjz1,tz);
2970
2971             }
2972
2973             /**************************
2974              * CALCULATE INTERACTIONS *
2975              **************************/
2976
2977             if (gmx_mm_any_lt(rsq22,rcutoff2))
2978             {
2979
2980             r22              = _mm_mul_pd(rsq22,rinv22);
2981
2982             /* EWALD ELECTROSTATICS */
2983
2984             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2985             ewrt             = _mm_mul_pd(r22,ewtabscale);
2986             ewitab           = _mm_cvttpd_epi32(ewrt);
2987             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2988             ewitab           = _mm_slli_epi32(ewitab,2);
2989             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2990             ewtabD           = _mm_setzero_pd();
2991             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2992             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2993             ewtabFn          = _mm_setzero_pd();
2994             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2995             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2996             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2997             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2998             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2999
3000             d                = _mm_sub_pd(r22,rswitch);
3001             d                = _mm_max_pd(d,_mm_setzero_pd());
3002             d2               = _mm_mul_pd(d,d);
3003             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3004
3005             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3006
3007             /* Evaluate switch function */
3008             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3009             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
3010             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
3011
3012             fscal            = felec;
3013
3014             fscal            = _mm_and_pd(fscal,cutoff_mask);
3015
3016             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3017
3018             /* Calculate temporary vectorial force */
3019             tx               = _mm_mul_pd(fscal,dx22);
3020             ty               = _mm_mul_pd(fscal,dy22);
3021             tz               = _mm_mul_pd(fscal,dz22);
3022
3023             /* Update vectorial force */
3024             fix2             = _mm_add_pd(fix2,tx);
3025             fiy2             = _mm_add_pd(fiy2,ty);
3026             fiz2             = _mm_add_pd(fiz2,tz);
3027
3028             fjx2             = _mm_add_pd(fjx2,tx);
3029             fjy2             = _mm_add_pd(fjy2,ty);
3030             fjz2             = _mm_add_pd(fjz2,tz);
3031
3032             }
3033
3034             /**************************
3035              * CALCULATE INTERACTIONS *
3036              **************************/
3037
3038             if (gmx_mm_any_lt(rsq23,rcutoff2))
3039             {
3040
3041             r23              = _mm_mul_pd(rsq23,rinv23);
3042
3043             /* EWALD ELECTROSTATICS */
3044
3045             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3046             ewrt             = _mm_mul_pd(r23,ewtabscale);
3047             ewitab           = _mm_cvttpd_epi32(ewrt);
3048             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3049             ewitab           = _mm_slli_epi32(ewitab,2);
3050             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3051             ewtabD           = _mm_setzero_pd();
3052             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3053             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3054             ewtabFn          = _mm_setzero_pd();
3055             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3056             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3057             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3058             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
3059             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
3060
3061             d                = _mm_sub_pd(r23,rswitch);
3062             d                = _mm_max_pd(d,_mm_setzero_pd());
3063             d2               = _mm_mul_pd(d,d);
3064             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3065
3066             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3067
3068             /* Evaluate switch function */
3069             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3070             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
3071             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
3072
3073             fscal            = felec;
3074
3075             fscal            = _mm_and_pd(fscal,cutoff_mask);
3076
3077             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3078
3079             /* Calculate temporary vectorial force */
3080             tx               = _mm_mul_pd(fscal,dx23);
3081             ty               = _mm_mul_pd(fscal,dy23);
3082             tz               = _mm_mul_pd(fscal,dz23);
3083
3084             /* Update vectorial force */
3085             fix2             = _mm_add_pd(fix2,tx);
3086             fiy2             = _mm_add_pd(fiy2,ty);
3087             fiz2             = _mm_add_pd(fiz2,tz);
3088
3089             fjx3             = _mm_add_pd(fjx3,tx);
3090             fjy3             = _mm_add_pd(fjy3,ty);
3091             fjz3             = _mm_add_pd(fjz3,tz);
3092
3093             }
3094
3095             /**************************
3096              * CALCULATE INTERACTIONS *
3097              **************************/
3098
3099             if (gmx_mm_any_lt(rsq31,rcutoff2))
3100             {
3101
3102             r31              = _mm_mul_pd(rsq31,rinv31);
3103
3104             /* EWALD ELECTROSTATICS */
3105
3106             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3107             ewrt             = _mm_mul_pd(r31,ewtabscale);
3108             ewitab           = _mm_cvttpd_epi32(ewrt);
3109             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3110             ewitab           = _mm_slli_epi32(ewitab,2);
3111             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3112             ewtabD           = _mm_setzero_pd();
3113             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3114             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3115             ewtabFn          = _mm_setzero_pd();
3116             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3117             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3118             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3119             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
3120             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
3121
3122             d                = _mm_sub_pd(r31,rswitch);
3123             d                = _mm_max_pd(d,_mm_setzero_pd());
3124             d2               = _mm_mul_pd(d,d);
3125             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3126
3127             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3128
3129             /* Evaluate switch function */
3130             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3131             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
3132             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
3133
3134             fscal            = felec;
3135
3136             fscal            = _mm_and_pd(fscal,cutoff_mask);
3137
3138             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3139
3140             /* Calculate temporary vectorial force */
3141             tx               = _mm_mul_pd(fscal,dx31);
3142             ty               = _mm_mul_pd(fscal,dy31);
3143             tz               = _mm_mul_pd(fscal,dz31);
3144
3145             /* Update vectorial force */
3146             fix3             = _mm_add_pd(fix3,tx);
3147             fiy3             = _mm_add_pd(fiy3,ty);
3148             fiz3             = _mm_add_pd(fiz3,tz);
3149
3150             fjx1             = _mm_add_pd(fjx1,tx);
3151             fjy1             = _mm_add_pd(fjy1,ty);
3152             fjz1             = _mm_add_pd(fjz1,tz);
3153
3154             }
3155
3156             /**************************
3157              * CALCULATE INTERACTIONS *
3158              **************************/
3159
3160             if (gmx_mm_any_lt(rsq32,rcutoff2))
3161             {
3162
3163             r32              = _mm_mul_pd(rsq32,rinv32);
3164
3165             /* EWALD ELECTROSTATICS */
3166
3167             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3168             ewrt             = _mm_mul_pd(r32,ewtabscale);
3169             ewitab           = _mm_cvttpd_epi32(ewrt);
3170             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3171             ewitab           = _mm_slli_epi32(ewitab,2);
3172             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3173             ewtabD           = _mm_setzero_pd();
3174             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3175             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3176             ewtabFn          = _mm_setzero_pd();
3177             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3178             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3179             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3180             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
3181             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
3182
3183             d                = _mm_sub_pd(r32,rswitch);
3184             d                = _mm_max_pd(d,_mm_setzero_pd());
3185             d2               = _mm_mul_pd(d,d);
3186             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3187
3188             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3189
3190             /* Evaluate switch function */
3191             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3192             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
3193             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
3194
3195             fscal            = felec;
3196
3197             fscal            = _mm_and_pd(fscal,cutoff_mask);
3198
3199             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3200
3201             /* Calculate temporary vectorial force */
3202             tx               = _mm_mul_pd(fscal,dx32);
3203             ty               = _mm_mul_pd(fscal,dy32);
3204             tz               = _mm_mul_pd(fscal,dz32);
3205
3206             /* Update vectorial force */
3207             fix3             = _mm_add_pd(fix3,tx);
3208             fiy3             = _mm_add_pd(fiy3,ty);
3209             fiz3             = _mm_add_pd(fiz3,tz);
3210
3211             fjx2             = _mm_add_pd(fjx2,tx);
3212             fjy2             = _mm_add_pd(fjy2,ty);
3213             fjz2             = _mm_add_pd(fjz2,tz);
3214
3215             }
3216
3217             /**************************
3218              * CALCULATE INTERACTIONS *
3219              **************************/
3220
3221             if (gmx_mm_any_lt(rsq33,rcutoff2))
3222             {
3223
3224             r33              = _mm_mul_pd(rsq33,rinv33);
3225
3226             /* EWALD ELECTROSTATICS */
3227
3228             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3229             ewrt             = _mm_mul_pd(r33,ewtabscale);
3230             ewitab           = _mm_cvttpd_epi32(ewrt);
3231             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3232             ewitab           = _mm_slli_epi32(ewitab,2);
3233             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3234             ewtabD           = _mm_setzero_pd();
3235             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3236             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3237             ewtabFn          = _mm_setzero_pd();
3238             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3239             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3240             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3241             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
3242             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
3243
3244             d                = _mm_sub_pd(r33,rswitch);
3245             d                = _mm_max_pd(d,_mm_setzero_pd());
3246             d2               = _mm_mul_pd(d,d);
3247             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3248
3249             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3250
3251             /* Evaluate switch function */
3252             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3253             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
3254             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
3255
3256             fscal            = felec;
3257
3258             fscal            = _mm_and_pd(fscal,cutoff_mask);
3259
3260             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3261
3262             /* Calculate temporary vectorial force */
3263             tx               = _mm_mul_pd(fscal,dx33);
3264             ty               = _mm_mul_pd(fscal,dy33);
3265             tz               = _mm_mul_pd(fscal,dz33);
3266
3267             /* Update vectorial force */
3268             fix3             = _mm_add_pd(fix3,tx);
3269             fiy3             = _mm_add_pd(fiy3,ty);
3270             fiz3             = _mm_add_pd(fiz3,tz);
3271
3272             fjx3             = _mm_add_pd(fjx3,tx);
3273             fjy3             = _mm_add_pd(fjy3,ty);
3274             fjz3             = _mm_add_pd(fjz3,tz);
3275
3276             }
3277
3278             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3279
3280             /* Inner loop uses 617 flops */
3281         }
3282
3283         /* End of innermost loop */
3284
3285         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3286                                               f+i_coord_offset,fshift+i_shift_offset);
3287
3288         /* Increment number of inner iterations */
3289         inneriter                  += j_index_end - j_index_start;
3290
3291         /* Outer loop uses 24 flops */
3292     }
3293
3294     /* Increment number of outer iterations */
3295     outeriter        += nri;
3296
3297     /* Update outer/inner flops */
3298
3299     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*617);
3300 }