made errors during GPU detection non-fatal
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sse2_double.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_double
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB;
61     int              j_coord_offsetA,j_coord_offsetB;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwioffset3;
73     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74     int              vdwjidx0A,vdwjidx0B;
75     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76     int              vdwjidx1A,vdwjidx1B;
77     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78     int              vdwjidx2A,vdwjidx2B;
79     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80     int              vdwjidx3A,vdwjidx3B;
81     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
93     real             *charge;
94     int              nvdwtype;
95     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96     int              *vdwtype;
97     real             *vdwparam;
98     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
99     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
100     __m128i          ewitab;
101     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
102     real             *ewtab;
103     __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
104     real             rswitch_scalar,d_scalar;
105     __m128d          dummy_mask,cutoff_mask;
106     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
107     __m128d          one     = _mm_set1_pd(1.0);
108     __m128d          two     = _mm_set1_pd(2.0);
109     x                = xx[0];
110     f                = ff[0];
111
112     nri              = nlist->nri;
113     iinr             = nlist->iinr;
114     jindex           = nlist->jindex;
115     jjnr             = nlist->jjnr;
116     shiftidx         = nlist->shift;
117     gid              = nlist->gid;
118     shiftvec         = fr->shift_vec[0];
119     fshift           = fr->fshift[0];
120     facel            = _mm_set1_pd(fr->epsfac);
121     charge           = mdatoms->chargeA;
122     nvdwtype         = fr->ntype;
123     vdwparam         = fr->nbfp;
124     vdwtype          = mdatoms->typeA;
125
126     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
127     ewtab            = fr->ic->tabq_coul_FDV0;
128     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
129     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
130
131     /* Setup water-specific parameters */
132     inr              = nlist->iinr[0];
133     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
134     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
135     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
136     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
137
138     jq1              = _mm_set1_pd(charge[inr+1]);
139     jq2              = _mm_set1_pd(charge[inr+2]);
140     jq3              = _mm_set1_pd(charge[inr+3]);
141     vdwjidx0A        = 2*vdwtype[inr+0];
142     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
143     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
144     qq11             = _mm_mul_pd(iq1,jq1);
145     qq12             = _mm_mul_pd(iq1,jq2);
146     qq13             = _mm_mul_pd(iq1,jq3);
147     qq21             = _mm_mul_pd(iq2,jq1);
148     qq22             = _mm_mul_pd(iq2,jq2);
149     qq23             = _mm_mul_pd(iq2,jq3);
150     qq31             = _mm_mul_pd(iq3,jq1);
151     qq32             = _mm_mul_pd(iq3,jq2);
152     qq33             = _mm_mul_pd(iq3,jq3);
153
154     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
155     rcutoff_scalar   = fr->rcoulomb;
156     rcutoff          = _mm_set1_pd(rcutoff_scalar);
157     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
158
159     rswitch_scalar   = fr->rcoulomb_switch;
160     rswitch          = _mm_set1_pd(rswitch_scalar);
161     /* Setup switch parameters */
162     d_scalar         = rcutoff_scalar-rswitch_scalar;
163     d                = _mm_set1_pd(d_scalar);
164     swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
165     swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
166     swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
167     swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
168     swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
169     swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
170
171     /* Avoid stupid compiler warnings */
172     jnrA = jnrB = 0;
173     j_coord_offsetA = 0;
174     j_coord_offsetB = 0;
175
176     outeriter        = 0;
177     inneriter        = 0;
178
179     /* Start outer loop over neighborlists */
180     for(iidx=0; iidx<nri; iidx++)
181     {
182         /* Load shift vector for this list */
183         i_shift_offset   = DIM*shiftidx[iidx];
184
185         /* Load limits for loop over neighbors */
186         j_index_start    = jindex[iidx];
187         j_index_end      = jindex[iidx+1];
188
189         /* Get outer coordinate index */
190         inr              = iinr[iidx];
191         i_coord_offset   = DIM*inr;
192
193         /* Load i particle coords and add shift vector */
194         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
195                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
196
197         fix0             = _mm_setzero_pd();
198         fiy0             = _mm_setzero_pd();
199         fiz0             = _mm_setzero_pd();
200         fix1             = _mm_setzero_pd();
201         fiy1             = _mm_setzero_pd();
202         fiz1             = _mm_setzero_pd();
203         fix2             = _mm_setzero_pd();
204         fiy2             = _mm_setzero_pd();
205         fiz2             = _mm_setzero_pd();
206         fix3             = _mm_setzero_pd();
207         fiy3             = _mm_setzero_pd();
208         fiz3             = _mm_setzero_pd();
209
210         /* Reset potential sums */
211         velecsum         = _mm_setzero_pd();
212         vvdwsum          = _mm_setzero_pd();
213
214         /* Start inner kernel loop */
215         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
216         {
217
218             /* Get j neighbor index, and coordinate index */
219             jnrA             = jjnr[jidx];
220             jnrB             = jjnr[jidx+1];
221             j_coord_offsetA  = DIM*jnrA;
222             j_coord_offsetB  = DIM*jnrB;
223             
224             /* load j atom coordinates */
225             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
226                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
227                                               &jy2,&jz2,&jx3,&jy3,&jz3);
228             
229             /* Calculate displacement vector */
230             dx00             = _mm_sub_pd(ix0,jx0);
231             dy00             = _mm_sub_pd(iy0,jy0);
232             dz00             = _mm_sub_pd(iz0,jz0);
233             dx11             = _mm_sub_pd(ix1,jx1);
234             dy11             = _mm_sub_pd(iy1,jy1);
235             dz11             = _mm_sub_pd(iz1,jz1);
236             dx12             = _mm_sub_pd(ix1,jx2);
237             dy12             = _mm_sub_pd(iy1,jy2);
238             dz12             = _mm_sub_pd(iz1,jz2);
239             dx13             = _mm_sub_pd(ix1,jx3);
240             dy13             = _mm_sub_pd(iy1,jy3);
241             dz13             = _mm_sub_pd(iz1,jz3);
242             dx21             = _mm_sub_pd(ix2,jx1);
243             dy21             = _mm_sub_pd(iy2,jy1);
244             dz21             = _mm_sub_pd(iz2,jz1);
245             dx22             = _mm_sub_pd(ix2,jx2);
246             dy22             = _mm_sub_pd(iy2,jy2);
247             dz22             = _mm_sub_pd(iz2,jz2);
248             dx23             = _mm_sub_pd(ix2,jx3);
249             dy23             = _mm_sub_pd(iy2,jy3);
250             dz23             = _mm_sub_pd(iz2,jz3);
251             dx31             = _mm_sub_pd(ix3,jx1);
252             dy31             = _mm_sub_pd(iy3,jy1);
253             dz31             = _mm_sub_pd(iz3,jz1);
254             dx32             = _mm_sub_pd(ix3,jx2);
255             dy32             = _mm_sub_pd(iy3,jy2);
256             dz32             = _mm_sub_pd(iz3,jz2);
257             dx33             = _mm_sub_pd(ix3,jx3);
258             dy33             = _mm_sub_pd(iy3,jy3);
259             dz33             = _mm_sub_pd(iz3,jz3);
260
261             /* Calculate squared distance and things based on it */
262             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
263             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
264             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
265             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
266             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
267             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
268             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
269             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
270             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
271             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
272
273             rinv00           = gmx_mm_invsqrt_pd(rsq00);
274             rinv11           = gmx_mm_invsqrt_pd(rsq11);
275             rinv12           = gmx_mm_invsqrt_pd(rsq12);
276             rinv13           = gmx_mm_invsqrt_pd(rsq13);
277             rinv21           = gmx_mm_invsqrt_pd(rsq21);
278             rinv22           = gmx_mm_invsqrt_pd(rsq22);
279             rinv23           = gmx_mm_invsqrt_pd(rsq23);
280             rinv31           = gmx_mm_invsqrt_pd(rsq31);
281             rinv32           = gmx_mm_invsqrt_pd(rsq32);
282             rinv33           = gmx_mm_invsqrt_pd(rsq33);
283
284             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
285             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
286             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
287             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
288             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
289             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
290             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
291             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
292             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
293             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
294
295             fjx0             = _mm_setzero_pd();
296             fjy0             = _mm_setzero_pd();
297             fjz0             = _mm_setzero_pd();
298             fjx1             = _mm_setzero_pd();
299             fjy1             = _mm_setzero_pd();
300             fjz1             = _mm_setzero_pd();
301             fjx2             = _mm_setzero_pd();
302             fjy2             = _mm_setzero_pd();
303             fjz2             = _mm_setzero_pd();
304             fjx3             = _mm_setzero_pd();
305             fjy3             = _mm_setzero_pd();
306             fjz3             = _mm_setzero_pd();
307
308             /**************************
309              * CALCULATE INTERACTIONS *
310              **************************/
311
312             if (gmx_mm_any_lt(rsq00,rcutoff2))
313             {
314
315             r00              = _mm_mul_pd(rsq00,rinv00);
316
317             /* LENNARD-JONES DISPERSION/REPULSION */
318
319             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
320             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
321             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
322             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
323             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
324
325             d                = _mm_sub_pd(r00,rswitch);
326             d                = _mm_max_pd(d,_mm_setzero_pd());
327             d2               = _mm_mul_pd(d,d);
328             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
329
330             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
331
332             /* Evaluate switch function */
333             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
334             fvdw             = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
335             vvdw             = _mm_mul_pd(vvdw,sw);
336             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
337
338             /* Update potential sum for this i atom from the interaction with this j atom. */
339             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
340             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
341
342             fscal            = fvdw;
343
344             fscal            = _mm_and_pd(fscal,cutoff_mask);
345
346             /* Calculate temporary vectorial force */
347             tx               = _mm_mul_pd(fscal,dx00);
348             ty               = _mm_mul_pd(fscal,dy00);
349             tz               = _mm_mul_pd(fscal,dz00);
350
351             /* Update vectorial force */
352             fix0             = _mm_add_pd(fix0,tx);
353             fiy0             = _mm_add_pd(fiy0,ty);
354             fiz0             = _mm_add_pd(fiz0,tz);
355
356             fjx0             = _mm_add_pd(fjx0,tx);
357             fjy0             = _mm_add_pd(fjy0,ty);
358             fjz0             = _mm_add_pd(fjz0,tz);
359
360             }
361
362             /**************************
363              * CALCULATE INTERACTIONS *
364              **************************/
365
366             if (gmx_mm_any_lt(rsq11,rcutoff2))
367             {
368
369             r11              = _mm_mul_pd(rsq11,rinv11);
370
371             /* EWALD ELECTROSTATICS */
372
373             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
374             ewrt             = _mm_mul_pd(r11,ewtabscale);
375             ewitab           = _mm_cvttpd_epi32(ewrt);
376             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
377             ewitab           = _mm_slli_epi32(ewitab,2);
378             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
379             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
380             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
381             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
382             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
383             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
384             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
385             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
386             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
387             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
388
389             d                = _mm_sub_pd(r11,rswitch);
390             d                = _mm_max_pd(d,_mm_setzero_pd());
391             d2               = _mm_mul_pd(d,d);
392             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
393
394             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
395
396             /* Evaluate switch function */
397             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
398             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
399             velec            = _mm_mul_pd(velec,sw);
400             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
401
402             /* Update potential sum for this i atom from the interaction with this j atom. */
403             velec            = _mm_and_pd(velec,cutoff_mask);
404             velecsum         = _mm_add_pd(velecsum,velec);
405
406             fscal            = felec;
407
408             fscal            = _mm_and_pd(fscal,cutoff_mask);
409
410             /* Calculate temporary vectorial force */
411             tx               = _mm_mul_pd(fscal,dx11);
412             ty               = _mm_mul_pd(fscal,dy11);
413             tz               = _mm_mul_pd(fscal,dz11);
414
415             /* Update vectorial force */
416             fix1             = _mm_add_pd(fix1,tx);
417             fiy1             = _mm_add_pd(fiy1,ty);
418             fiz1             = _mm_add_pd(fiz1,tz);
419
420             fjx1             = _mm_add_pd(fjx1,tx);
421             fjy1             = _mm_add_pd(fjy1,ty);
422             fjz1             = _mm_add_pd(fjz1,tz);
423
424             }
425
426             /**************************
427              * CALCULATE INTERACTIONS *
428              **************************/
429
430             if (gmx_mm_any_lt(rsq12,rcutoff2))
431             {
432
433             r12              = _mm_mul_pd(rsq12,rinv12);
434
435             /* EWALD ELECTROSTATICS */
436
437             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
438             ewrt             = _mm_mul_pd(r12,ewtabscale);
439             ewitab           = _mm_cvttpd_epi32(ewrt);
440             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
441             ewitab           = _mm_slli_epi32(ewitab,2);
442             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
443             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
444             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
445             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
446             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
447             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
448             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
449             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
450             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
451             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
452
453             d                = _mm_sub_pd(r12,rswitch);
454             d                = _mm_max_pd(d,_mm_setzero_pd());
455             d2               = _mm_mul_pd(d,d);
456             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
457
458             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
459
460             /* Evaluate switch function */
461             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
462             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
463             velec            = _mm_mul_pd(velec,sw);
464             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
465
466             /* Update potential sum for this i atom from the interaction with this j atom. */
467             velec            = _mm_and_pd(velec,cutoff_mask);
468             velecsum         = _mm_add_pd(velecsum,velec);
469
470             fscal            = felec;
471
472             fscal            = _mm_and_pd(fscal,cutoff_mask);
473
474             /* Calculate temporary vectorial force */
475             tx               = _mm_mul_pd(fscal,dx12);
476             ty               = _mm_mul_pd(fscal,dy12);
477             tz               = _mm_mul_pd(fscal,dz12);
478
479             /* Update vectorial force */
480             fix1             = _mm_add_pd(fix1,tx);
481             fiy1             = _mm_add_pd(fiy1,ty);
482             fiz1             = _mm_add_pd(fiz1,tz);
483
484             fjx2             = _mm_add_pd(fjx2,tx);
485             fjy2             = _mm_add_pd(fjy2,ty);
486             fjz2             = _mm_add_pd(fjz2,tz);
487
488             }
489
490             /**************************
491              * CALCULATE INTERACTIONS *
492              **************************/
493
494             if (gmx_mm_any_lt(rsq13,rcutoff2))
495             {
496
497             r13              = _mm_mul_pd(rsq13,rinv13);
498
499             /* EWALD ELECTROSTATICS */
500
501             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
502             ewrt             = _mm_mul_pd(r13,ewtabscale);
503             ewitab           = _mm_cvttpd_epi32(ewrt);
504             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
505             ewitab           = _mm_slli_epi32(ewitab,2);
506             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
507             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
508             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
509             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
510             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
511             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
512             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
513             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
514             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
515             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
516
517             d                = _mm_sub_pd(r13,rswitch);
518             d                = _mm_max_pd(d,_mm_setzero_pd());
519             d2               = _mm_mul_pd(d,d);
520             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
521
522             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
523
524             /* Evaluate switch function */
525             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
526             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
527             velec            = _mm_mul_pd(velec,sw);
528             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
529
530             /* Update potential sum for this i atom from the interaction with this j atom. */
531             velec            = _mm_and_pd(velec,cutoff_mask);
532             velecsum         = _mm_add_pd(velecsum,velec);
533
534             fscal            = felec;
535
536             fscal            = _mm_and_pd(fscal,cutoff_mask);
537
538             /* Calculate temporary vectorial force */
539             tx               = _mm_mul_pd(fscal,dx13);
540             ty               = _mm_mul_pd(fscal,dy13);
541             tz               = _mm_mul_pd(fscal,dz13);
542
543             /* Update vectorial force */
544             fix1             = _mm_add_pd(fix1,tx);
545             fiy1             = _mm_add_pd(fiy1,ty);
546             fiz1             = _mm_add_pd(fiz1,tz);
547
548             fjx3             = _mm_add_pd(fjx3,tx);
549             fjy3             = _mm_add_pd(fjy3,ty);
550             fjz3             = _mm_add_pd(fjz3,tz);
551
552             }
553
554             /**************************
555              * CALCULATE INTERACTIONS *
556              **************************/
557
558             if (gmx_mm_any_lt(rsq21,rcutoff2))
559             {
560
561             r21              = _mm_mul_pd(rsq21,rinv21);
562
563             /* EWALD ELECTROSTATICS */
564
565             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
566             ewrt             = _mm_mul_pd(r21,ewtabscale);
567             ewitab           = _mm_cvttpd_epi32(ewrt);
568             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
569             ewitab           = _mm_slli_epi32(ewitab,2);
570             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
571             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
572             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
573             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
574             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
575             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
576             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
577             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
578             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
579             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
580
581             d                = _mm_sub_pd(r21,rswitch);
582             d                = _mm_max_pd(d,_mm_setzero_pd());
583             d2               = _mm_mul_pd(d,d);
584             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
585
586             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
587
588             /* Evaluate switch function */
589             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
590             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
591             velec            = _mm_mul_pd(velec,sw);
592             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
593
594             /* Update potential sum for this i atom from the interaction with this j atom. */
595             velec            = _mm_and_pd(velec,cutoff_mask);
596             velecsum         = _mm_add_pd(velecsum,velec);
597
598             fscal            = felec;
599
600             fscal            = _mm_and_pd(fscal,cutoff_mask);
601
602             /* Calculate temporary vectorial force */
603             tx               = _mm_mul_pd(fscal,dx21);
604             ty               = _mm_mul_pd(fscal,dy21);
605             tz               = _mm_mul_pd(fscal,dz21);
606
607             /* Update vectorial force */
608             fix2             = _mm_add_pd(fix2,tx);
609             fiy2             = _mm_add_pd(fiy2,ty);
610             fiz2             = _mm_add_pd(fiz2,tz);
611
612             fjx1             = _mm_add_pd(fjx1,tx);
613             fjy1             = _mm_add_pd(fjy1,ty);
614             fjz1             = _mm_add_pd(fjz1,tz);
615
616             }
617
618             /**************************
619              * CALCULATE INTERACTIONS *
620              **************************/
621
622             if (gmx_mm_any_lt(rsq22,rcutoff2))
623             {
624
625             r22              = _mm_mul_pd(rsq22,rinv22);
626
627             /* EWALD ELECTROSTATICS */
628
629             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
630             ewrt             = _mm_mul_pd(r22,ewtabscale);
631             ewitab           = _mm_cvttpd_epi32(ewrt);
632             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
633             ewitab           = _mm_slli_epi32(ewitab,2);
634             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
635             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
636             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
637             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
638             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
639             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
640             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
641             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
642             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
643             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
644
645             d                = _mm_sub_pd(r22,rswitch);
646             d                = _mm_max_pd(d,_mm_setzero_pd());
647             d2               = _mm_mul_pd(d,d);
648             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
649
650             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
651
652             /* Evaluate switch function */
653             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
654             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
655             velec            = _mm_mul_pd(velec,sw);
656             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
657
658             /* Update potential sum for this i atom from the interaction with this j atom. */
659             velec            = _mm_and_pd(velec,cutoff_mask);
660             velecsum         = _mm_add_pd(velecsum,velec);
661
662             fscal            = felec;
663
664             fscal            = _mm_and_pd(fscal,cutoff_mask);
665
666             /* Calculate temporary vectorial force */
667             tx               = _mm_mul_pd(fscal,dx22);
668             ty               = _mm_mul_pd(fscal,dy22);
669             tz               = _mm_mul_pd(fscal,dz22);
670
671             /* Update vectorial force */
672             fix2             = _mm_add_pd(fix2,tx);
673             fiy2             = _mm_add_pd(fiy2,ty);
674             fiz2             = _mm_add_pd(fiz2,tz);
675
676             fjx2             = _mm_add_pd(fjx2,tx);
677             fjy2             = _mm_add_pd(fjy2,ty);
678             fjz2             = _mm_add_pd(fjz2,tz);
679
680             }
681
682             /**************************
683              * CALCULATE INTERACTIONS *
684              **************************/
685
686             if (gmx_mm_any_lt(rsq23,rcutoff2))
687             {
688
689             r23              = _mm_mul_pd(rsq23,rinv23);
690
691             /* EWALD ELECTROSTATICS */
692
693             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
694             ewrt             = _mm_mul_pd(r23,ewtabscale);
695             ewitab           = _mm_cvttpd_epi32(ewrt);
696             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
697             ewitab           = _mm_slli_epi32(ewitab,2);
698             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
699             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
700             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
701             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
702             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
703             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
704             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
705             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
706             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
707             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
708
709             d                = _mm_sub_pd(r23,rswitch);
710             d                = _mm_max_pd(d,_mm_setzero_pd());
711             d2               = _mm_mul_pd(d,d);
712             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
713
714             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
715
716             /* Evaluate switch function */
717             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
718             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
719             velec            = _mm_mul_pd(velec,sw);
720             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
721
722             /* Update potential sum for this i atom from the interaction with this j atom. */
723             velec            = _mm_and_pd(velec,cutoff_mask);
724             velecsum         = _mm_add_pd(velecsum,velec);
725
726             fscal            = felec;
727
728             fscal            = _mm_and_pd(fscal,cutoff_mask);
729
730             /* Calculate temporary vectorial force */
731             tx               = _mm_mul_pd(fscal,dx23);
732             ty               = _mm_mul_pd(fscal,dy23);
733             tz               = _mm_mul_pd(fscal,dz23);
734
735             /* Update vectorial force */
736             fix2             = _mm_add_pd(fix2,tx);
737             fiy2             = _mm_add_pd(fiy2,ty);
738             fiz2             = _mm_add_pd(fiz2,tz);
739
740             fjx3             = _mm_add_pd(fjx3,tx);
741             fjy3             = _mm_add_pd(fjy3,ty);
742             fjz3             = _mm_add_pd(fjz3,tz);
743
744             }
745
746             /**************************
747              * CALCULATE INTERACTIONS *
748              **************************/
749
750             if (gmx_mm_any_lt(rsq31,rcutoff2))
751             {
752
753             r31              = _mm_mul_pd(rsq31,rinv31);
754
755             /* EWALD ELECTROSTATICS */
756
757             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
758             ewrt             = _mm_mul_pd(r31,ewtabscale);
759             ewitab           = _mm_cvttpd_epi32(ewrt);
760             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
761             ewitab           = _mm_slli_epi32(ewitab,2);
762             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
763             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
764             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
765             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
766             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
767             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
768             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
769             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
770             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
771             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
772
773             d                = _mm_sub_pd(r31,rswitch);
774             d                = _mm_max_pd(d,_mm_setzero_pd());
775             d2               = _mm_mul_pd(d,d);
776             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
777
778             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
779
780             /* Evaluate switch function */
781             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
782             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
783             velec            = _mm_mul_pd(velec,sw);
784             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
785
786             /* Update potential sum for this i atom from the interaction with this j atom. */
787             velec            = _mm_and_pd(velec,cutoff_mask);
788             velecsum         = _mm_add_pd(velecsum,velec);
789
790             fscal            = felec;
791
792             fscal            = _mm_and_pd(fscal,cutoff_mask);
793
794             /* Calculate temporary vectorial force */
795             tx               = _mm_mul_pd(fscal,dx31);
796             ty               = _mm_mul_pd(fscal,dy31);
797             tz               = _mm_mul_pd(fscal,dz31);
798
799             /* Update vectorial force */
800             fix3             = _mm_add_pd(fix3,tx);
801             fiy3             = _mm_add_pd(fiy3,ty);
802             fiz3             = _mm_add_pd(fiz3,tz);
803
804             fjx1             = _mm_add_pd(fjx1,tx);
805             fjy1             = _mm_add_pd(fjy1,ty);
806             fjz1             = _mm_add_pd(fjz1,tz);
807
808             }
809
810             /**************************
811              * CALCULATE INTERACTIONS *
812              **************************/
813
814             if (gmx_mm_any_lt(rsq32,rcutoff2))
815             {
816
817             r32              = _mm_mul_pd(rsq32,rinv32);
818
819             /* EWALD ELECTROSTATICS */
820
821             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
822             ewrt             = _mm_mul_pd(r32,ewtabscale);
823             ewitab           = _mm_cvttpd_epi32(ewrt);
824             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
825             ewitab           = _mm_slli_epi32(ewitab,2);
826             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
827             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
828             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
829             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
830             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
831             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
832             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
833             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
834             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
835             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
836
837             d                = _mm_sub_pd(r32,rswitch);
838             d                = _mm_max_pd(d,_mm_setzero_pd());
839             d2               = _mm_mul_pd(d,d);
840             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
841
842             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
843
844             /* Evaluate switch function */
845             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
846             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
847             velec            = _mm_mul_pd(velec,sw);
848             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
849
850             /* Update potential sum for this i atom from the interaction with this j atom. */
851             velec            = _mm_and_pd(velec,cutoff_mask);
852             velecsum         = _mm_add_pd(velecsum,velec);
853
854             fscal            = felec;
855
856             fscal            = _mm_and_pd(fscal,cutoff_mask);
857
858             /* Calculate temporary vectorial force */
859             tx               = _mm_mul_pd(fscal,dx32);
860             ty               = _mm_mul_pd(fscal,dy32);
861             tz               = _mm_mul_pd(fscal,dz32);
862
863             /* Update vectorial force */
864             fix3             = _mm_add_pd(fix3,tx);
865             fiy3             = _mm_add_pd(fiy3,ty);
866             fiz3             = _mm_add_pd(fiz3,tz);
867
868             fjx2             = _mm_add_pd(fjx2,tx);
869             fjy2             = _mm_add_pd(fjy2,ty);
870             fjz2             = _mm_add_pd(fjz2,tz);
871
872             }
873
874             /**************************
875              * CALCULATE INTERACTIONS *
876              **************************/
877
878             if (gmx_mm_any_lt(rsq33,rcutoff2))
879             {
880
881             r33              = _mm_mul_pd(rsq33,rinv33);
882
883             /* EWALD ELECTROSTATICS */
884
885             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
886             ewrt             = _mm_mul_pd(r33,ewtabscale);
887             ewitab           = _mm_cvttpd_epi32(ewrt);
888             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
889             ewitab           = _mm_slli_epi32(ewitab,2);
890             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
891             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
892             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
893             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
894             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
895             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
896             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
897             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
898             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
899             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
900
901             d                = _mm_sub_pd(r33,rswitch);
902             d                = _mm_max_pd(d,_mm_setzero_pd());
903             d2               = _mm_mul_pd(d,d);
904             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
905
906             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
907
908             /* Evaluate switch function */
909             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
910             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
911             velec            = _mm_mul_pd(velec,sw);
912             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
913
914             /* Update potential sum for this i atom from the interaction with this j atom. */
915             velec            = _mm_and_pd(velec,cutoff_mask);
916             velecsum         = _mm_add_pd(velecsum,velec);
917
918             fscal            = felec;
919
920             fscal            = _mm_and_pd(fscal,cutoff_mask);
921
922             /* Calculate temporary vectorial force */
923             tx               = _mm_mul_pd(fscal,dx33);
924             ty               = _mm_mul_pd(fscal,dy33);
925             tz               = _mm_mul_pd(fscal,dz33);
926
927             /* Update vectorial force */
928             fix3             = _mm_add_pd(fix3,tx);
929             fiy3             = _mm_add_pd(fiy3,ty);
930             fiz3             = _mm_add_pd(fiz3,tz);
931
932             fjx3             = _mm_add_pd(fjx3,tx);
933             fjy3             = _mm_add_pd(fjy3,ty);
934             fjz3             = _mm_add_pd(fjz3,tz);
935
936             }
937
938             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
939
940             /* Inner loop uses 647 flops */
941         }
942
943         if(jidx<j_index_end)
944         {
945
946             jnrA             = jjnr[jidx];
947             j_coord_offsetA  = DIM*jnrA;
948
949             /* load j atom coordinates */
950             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
951                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
952                                               &jy2,&jz2,&jx3,&jy3,&jz3);
953             
954             /* Calculate displacement vector */
955             dx00             = _mm_sub_pd(ix0,jx0);
956             dy00             = _mm_sub_pd(iy0,jy0);
957             dz00             = _mm_sub_pd(iz0,jz0);
958             dx11             = _mm_sub_pd(ix1,jx1);
959             dy11             = _mm_sub_pd(iy1,jy1);
960             dz11             = _mm_sub_pd(iz1,jz1);
961             dx12             = _mm_sub_pd(ix1,jx2);
962             dy12             = _mm_sub_pd(iy1,jy2);
963             dz12             = _mm_sub_pd(iz1,jz2);
964             dx13             = _mm_sub_pd(ix1,jx3);
965             dy13             = _mm_sub_pd(iy1,jy3);
966             dz13             = _mm_sub_pd(iz1,jz3);
967             dx21             = _mm_sub_pd(ix2,jx1);
968             dy21             = _mm_sub_pd(iy2,jy1);
969             dz21             = _mm_sub_pd(iz2,jz1);
970             dx22             = _mm_sub_pd(ix2,jx2);
971             dy22             = _mm_sub_pd(iy2,jy2);
972             dz22             = _mm_sub_pd(iz2,jz2);
973             dx23             = _mm_sub_pd(ix2,jx3);
974             dy23             = _mm_sub_pd(iy2,jy3);
975             dz23             = _mm_sub_pd(iz2,jz3);
976             dx31             = _mm_sub_pd(ix3,jx1);
977             dy31             = _mm_sub_pd(iy3,jy1);
978             dz31             = _mm_sub_pd(iz3,jz1);
979             dx32             = _mm_sub_pd(ix3,jx2);
980             dy32             = _mm_sub_pd(iy3,jy2);
981             dz32             = _mm_sub_pd(iz3,jz2);
982             dx33             = _mm_sub_pd(ix3,jx3);
983             dy33             = _mm_sub_pd(iy3,jy3);
984             dz33             = _mm_sub_pd(iz3,jz3);
985
986             /* Calculate squared distance and things based on it */
987             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
988             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
989             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
990             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
991             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
992             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
993             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
994             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
995             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
996             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
997
998             rinv00           = gmx_mm_invsqrt_pd(rsq00);
999             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1000             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1001             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1002             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1003             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1004             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1005             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1006             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1007             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1008
1009             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1010             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1011             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1012             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1013             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1014             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1015             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1016             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1017             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1018             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1019
1020             fjx0             = _mm_setzero_pd();
1021             fjy0             = _mm_setzero_pd();
1022             fjz0             = _mm_setzero_pd();
1023             fjx1             = _mm_setzero_pd();
1024             fjy1             = _mm_setzero_pd();
1025             fjz1             = _mm_setzero_pd();
1026             fjx2             = _mm_setzero_pd();
1027             fjy2             = _mm_setzero_pd();
1028             fjz2             = _mm_setzero_pd();
1029             fjx3             = _mm_setzero_pd();
1030             fjy3             = _mm_setzero_pd();
1031             fjz3             = _mm_setzero_pd();
1032
1033             /**************************
1034              * CALCULATE INTERACTIONS *
1035              **************************/
1036
1037             if (gmx_mm_any_lt(rsq00,rcutoff2))
1038             {
1039
1040             r00              = _mm_mul_pd(rsq00,rinv00);
1041
1042             /* LENNARD-JONES DISPERSION/REPULSION */
1043
1044             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1045             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
1046             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1047             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1048             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1049
1050             d                = _mm_sub_pd(r00,rswitch);
1051             d                = _mm_max_pd(d,_mm_setzero_pd());
1052             d2               = _mm_mul_pd(d,d);
1053             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1054
1055             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1056
1057             /* Evaluate switch function */
1058             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1059             fvdw             = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1060             vvdw             = _mm_mul_pd(vvdw,sw);
1061             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
1062
1063             /* Update potential sum for this i atom from the interaction with this j atom. */
1064             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
1065             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
1066             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
1067
1068             fscal            = fvdw;
1069
1070             fscal            = _mm_and_pd(fscal,cutoff_mask);
1071
1072             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1073
1074             /* Calculate temporary vectorial force */
1075             tx               = _mm_mul_pd(fscal,dx00);
1076             ty               = _mm_mul_pd(fscal,dy00);
1077             tz               = _mm_mul_pd(fscal,dz00);
1078
1079             /* Update vectorial force */
1080             fix0             = _mm_add_pd(fix0,tx);
1081             fiy0             = _mm_add_pd(fiy0,ty);
1082             fiz0             = _mm_add_pd(fiz0,tz);
1083
1084             fjx0             = _mm_add_pd(fjx0,tx);
1085             fjy0             = _mm_add_pd(fjy0,ty);
1086             fjz0             = _mm_add_pd(fjz0,tz);
1087
1088             }
1089
1090             /**************************
1091              * CALCULATE INTERACTIONS *
1092              **************************/
1093
1094             if (gmx_mm_any_lt(rsq11,rcutoff2))
1095             {
1096
1097             r11              = _mm_mul_pd(rsq11,rinv11);
1098
1099             /* EWALD ELECTROSTATICS */
1100
1101             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1102             ewrt             = _mm_mul_pd(r11,ewtabscale);
1103             ewitab           = _mm_cvttpd_epi32(ewrt);
1104             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1105             ewitab           = _mm_slli_epi32(ewitab,2);
1106             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1107             ewtabD           = _mm_setzero_pd();
1108             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1109             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1110             ewtabFn          = _mm_setzero_pd();
1111             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1112             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1113             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1114             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1115             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1116
1117             d                = _mm_sub_pd(r11,rswitch);
1118             d                = _mm_max_pd(d,_mm_setzero_pd());
1119             d2               = _mm_mul_pd(d,d);
1120             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1121
1122             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1123
1124             /* Evaluate switch function */
1125             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1126             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1127             velec            = _mm_mul_pd(velec,sw);
1128             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1129
1130             /* Update potential sum for this i atom from the interaction with this j atom. */
1131             velec            = _mm_and_pd(velec,cutoff_mask);
1132             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1133             velecsum         = _mm_add_pd(velecsum,velec);
1134
1135             fscal            = felec;
1136
1137             fscal            = _mm_and_pd(fscal,cutoff_mask);
1138
1139             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1140
1141             /* Calculate temporary vectorial force */
1142             tx               = _mm_mul_pd(fscal,dx11);
1143             ty               = _mm_mul_pd(fscal,dy11);
1144             tz               = _mm_mul_pd(fscal,dz11);
1145
1146             /* Update vectorial force */
1147             fix1             = _mm_add_pd(fix1,tx);
1148             fiy1             = _mm_add_pd(fiy1,ty);
1149             fiz1             = _mm_add_pd(fiz1,tz);
1150
1151             fjx1             = _mm_add_pd(fjx1,tx);
1152             fjy1             = _mm_add_pd(fjy1,ty);
1153             fjz1             = _mm_add_pd(fjz1,tz);
1154
1155             }
1156
1157             /**************************
1158              * CALCULATE INTERACTIONS *
1159              **************************/
1160
1161             if (gmx_mm_any_lt(rsq12,rcutoff2))
1162             {
1163
1164             r12              = _mm_mul_pd(rsq12,rinv12);
1165
1166             /* EWALD ELECTROSTATICS */
1167
1168             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1169             ewrt             = _mm_mul_pd(r12,ewtabscale);
1170             ewitab           = _mm_cvttpd_epi32(ewrt);
1171             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1172             ewitab           = _mm_slli_epi32(ewitab,2);
1173             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1174             ewtabD           = _mm_setzero_pd();
1175             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1176             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1177             ewtabFn          = _mm_setzero_pd();
1178             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1179             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1180             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1181             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1182             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1183
1184             d                = _mm_sub_pd(r12,rswitch);
1185             d                = _mm_max_pd(d,_mm_setzero_pd());
1186             d2               = _mm_mul_pd(d,d);
1187             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1188
1189             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1190
1191             /* Evaluate switch function */
1192             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1193             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1194             velec            = _mm_mul_pd(velec,sw);
1195             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1196
1197             /* Update potential sum for this i atom from the interaction with this j atom. */
1198             velec            = _mm_and_pd(velec,cutoff_mask);
1199             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1200             velecsum         = _mm_add_pd(velecsum,velec);
1201
1202             fscal            = felec;
1203
1204             fscal            = _mm_and_pd(fscal,cutoff_mask);
1205
1206             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1207
1208             /* Calculate temporary vectorial force */
1209             tx               = _mm_mul_pd(fscal,dx12);
1210             ty               = _mm_mul_pd(fscal,dy12);
1211             tz               = _mm_mul_pd(fscal,dz12);
1212
1213             /* Update vectorial force */
1214             fix1             = _mm_add_pd(fix1,tx);
1215             fiy1             = _mm_add_pd(fiy1,ty);
1216             fiz1             = _mm_add_pd(fiz1,tz);
1217
1218             fjx2             = _mm_add_pd(fjx2,tx);
1219             fjy2             = _mm_add_pd(fjy2,ty);
1220             fjz2             = _mm_add_pd(fjz2,tz);
1221
1222             }
1223
1224             /**************************
1225              * CALCULATE INTERACTIONS *
1226              **************************/
1227
1228             if (gmx_mm_any_lt(rsq13,rcutoff2))
1229             {
1230
1231             r13              = _mm_mul_pd(rsq13,rinv13);
1232
1233             /* EWALD ELECTROSTATICS */
1234
1235             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1236             ewrt             = _mm_mul_pd(r13,ewtabscale);
1237             ewitab           = _mm_cvttpd_epi32(ewrt);
1238             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1239             ewitab           = _mm_slli_epi32(ewitab,2);
1240             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1241             ewtabD           = _mm_setzero_pd();
1242             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1243             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1244             ewtabFn          = _mm_setzero_pd();
1245             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1246             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1247             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1248             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1249             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1250
1251             d                = _mm_sub_pd(r13,rswitch);
1252             d                = _mm_max_pd(d,_mm_setzero_pd());
1253             d2               = _mm_mul_pd(d,d);
1254             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1255
1256             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1257
1258             /* Evaluate switch function */
1259             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1260             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
1261             velec            = _mm_mul_pd(velec,sw);
1262             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
1263
1264             /* Update potential sum for this i atom from the interaction with this j atom. */
1265             velec            = _mm_and_pd(velec,cutoff_mask);
1266             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1267             velecsum         = _mm_add_pd(velecsum,velec);
1268
1269             fscal            = felec;
1270
1271             fscal            = _mm_and_pd(fscal,cutoff_mask);
1272
1273             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1274
1275             /* Calculate temporary vectorial force */
1276             tx               = _mm_mul_pd(fscal,dx13);
1277             ty               = _mm_mul_pd(fscal,dy13);
1278             tz               = _mm_mul_pd(fscal,dz13);
1279
1280             /* Update vectorial force */
1281             fix1             = _mm_add_pd(fix1,tx);
1282             fiy1             = _mm_add_pd(fiy1,ty);
1283             fiz1             = _mm_add_pd(fiz1,tz);
1284
1285             fjx3             = _mm_add_pd(fjx3,tx);
1286             fjy3             = _mm_add_pd(fjy3,ty);
1287             fjz3             = _mm_add_pd(fjz3,tz);
1288
1289             }
1290
1291             /**************************
1292              * CALCULATE INTERACTIONS *
1293              **************************/
1294
1295             if (gmx_mm_any_lt(rsq21,rcutoff2))
1296             {
1297
1298             r21              = _mm_mul_pd(rsq21,rinv21);
1299
1300             /* EWALD ELECTROSTATICS */
1301
1302             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1303             ewrt             = _mm_mul_pd(r21,ewtabscale);
1304             ewitab           = _mm_cvttpd_epi32(ewrt);
1305             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1306             ewitab           = _mm_slli_epi32(ewitab,2);
1307             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1308             ewtabD           = _mm_setzero_pd();
1309             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1310             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1311             ewtabFn          = _mm_setzero_pd();
1312             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1313             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1314             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1315             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1316             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1317
1318             d                = _mm_sub_pd(r21,rswitch);
1319             d                = _mm_max_pd(d,_mm_setzero_pd());
1320             d2               = _mm_mul_pd(d,d);
1321             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1322
1323             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1324
1325             /* Evaluate switch function */
1326             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1327             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1328             velec            = _mm_mul_pd(velec,sw);
1329             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1330
1331             /* Update potential sum for this i atom from the interaction with this j atom. */
1332             velec            = _mm_and_pd(velec,cutoff_mask);
1333             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1334             velecsum         = _mm_add_pd(velecsum,velec);
1335
1336             fscal            = felec;
1337
1338             fscal            = _mm_and_pd(fscal,cutoff_mask);
1339
1340             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1341
1342             /* Calculate temporary vectorial force */
1343             tx               = _mm_mul_pd(fscal,dx21);
1344             ty               = _mm_mul_pd(fscal,dy21);
1345             tz               = _mm_mul_pd(fscal,dz21);
1346
1347             /* Update vectorial force */
1348             fix2             = _mm_add_pd(fix2,tx);
1349             fiy2             = _mm_add_pd(fiy2,ty);
1350             fiz2             = _mm_add_pd(fiz2,tz);
1351
1352             fjx1             = _mm_add_pd(fjx1,tx);
1353             fjy1             = _mm_add_pd(fjy1,ty);
1354             fjz1             = _mm_add_pd(fjz1,tz);
1355
1356             }
1357
1358             /**************************
1359              * CALCULATE INTERACTIONS *
1360              **************************/
1361
1362             if (gmx_mm_any_lt(rsq22,rcutoff2))
1363             {
1364
1365             r22              = _mm_mul_pd(rsq22,rinv22);
1366
1367             /* EWALD ELECTROSTATICS */
1368
1369             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1370             ewrt             = _mm_mul_pd(r22,ewtabscale);
1371             ewitab           = _mm_cvttpd_epi32(ewrt);
1372             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1373             ewitab           = _mm_slli_epi32(ewitab,2);
1374             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1375             ewtabD           = _mm_setzero_pd();
1376             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1377             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1378             ewtabFn          = _mm_setzero_pd();
1379             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1380             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1381             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1382             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1383             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1384
1385             d                = _mm_sub_pd(r22,rswitch);
1386             d                = _mm_max_pd(d,_mm_setzero_pd());
1387             d2               = _mm_mul_pd(d,d);
1388             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1389
1390             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1391
1392             /* Evaluate switch function */
1393             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1394             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1395             velec            = _mm_mul_pd(velec,sw);
1396             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1397
1398             /* Update potential sum for this i atom from the interaction with this j atom. */
1399             velec            = _mm_and_pd(velec,cutoff_mask);
1400             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1401             velecsum         = _mm_add_pd(velecsum,velec);
1402
1403             fscal            = felec;
1404
1405             fscal            = _mm_and_pd(fscal,cutoff_mask);
1406
1407             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1408
1409             /* Calculate temporary vectorial force */
1410             tx               = _mm_mul_pd(fscal,dx22);
1411             ty               = _mm_mul_pd(fscal,dy22);
1412             tz               = _mm_mul_pd(fscal,dz22);
1413
1414             /* Update vectorial force */
1415             fix2             = _mm_add_pd(fix2,tx);
1416             fiy2             = _mm_add_pd(fiy2,ty);
1417             fiz2             = _mm_add_pd(fiz2,tz);
1418
1419             fjx2             = _mm_add_pd(fjx2,tx);
1420             fjy2             = _mm_add_pd(fjy2,ty);
1421             fjz2             = _mm_add_pd(fjz2,tz);
1422
1423             }
1424
1425             /**************************
1426              * CALCULATE INTERACTIONS *
1427              **************************/
1428
1429             if (gmx_mm_any_lt(rsq23,rcutoff2))
1430             {
1431
1432             r23              = _mm_mul_pd(rsq23,rinv23);
1433
1434             /* EWALD ELECTROSTATICS */
1435
1436             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1437             ewrt             = _mm_mul_pd(r23,ewtabscale);
1438             ewitab           = _mm_cvttpd_epi32(ewrt);
1439             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1440             ewitab           = _mm_slli_epi32(ewitab,2);
1441             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1442             ewtabD           = _mm_setzero_pd();
1443             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1444             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1445             ewtabFn          = _mm_setzero_pd();
1446             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1447             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1448             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1449             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1450             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1451
1452             d                = _mm_sub_pd(r23,rswitch);
1453             d                = _mm_max_pd(d,_mm_setzero_pd());
1454             d2               = _mm_mul_pd(d,d);
1455             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1456
1457             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1458
1459             /* Evaluate switch function */
1460             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1461             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
1462             velec            = _mm_mul_pd(velec,sw);
1463             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
1464
1465             /* Update potential sum for this i atom from the interaction with this j atom. */
1466             velec            = _mm_and_pd(velec,cutoff_mask);
1467             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1468             velecsum         = _mm_add_pd(velecsum,velec);
1469
1470             fscal            = felec;
1471
1472             fscal            = _mm_and_pd(fscal,cutoff_mask);
1473
1474             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1475
1476             /* Calculate temporary vectorial force */
1477             tx               = _mm_mul_pd(fscal,dx23);
1478             ty               = _mm_mul_pd(fscal,dy23);
1479             tz               = _mm_mul_pd(fscal,dz23);
1480
1481             /* Update vectorial force */
1482             fix2             = _mm_add_pd(fix2,tx);
1483             fiy2             = _mm_add_pd(fiy2,ty);
1484             fiz2             = _mm_add_pd(fiz2,tz);
1485
1486             fjx3             = _mm_add_pd(fjx3,tx);
1487             fjy3             = _mm_add_pd(fjy3,ty);
1488             fjz3             = _mm_add_pd(fjz3,tz);
1489
1490             }
1491
1492             /**************************
1493              * CALCULATE INTERACTIONS *
1494              **************************/
1495
1496             if (gmx_mm_any_lt(rsq31,rcutoff2))
1497             {
1498
1499             r31              = _mm_mul_pd(rsq31,rinv31);
1500
1501             /* EWALD ELECTROSTATICS */
1502
1503             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1504             ewrt             = _mm_mul_pd(r31,ewtabscale);
1505             ewitab           = _mm_cvttpd_epi32(ewrt);
1506             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1507             ewitab           = _mm_slli_epi32(ewitab,2);
1508             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1509             ewtabD           = _mm_setzero_pd();
1510             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1511             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1512             ewtabFn          = _mm_setzero_pd();
1513             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1514             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1515             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1516             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1517             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1518
1519             d                = _mm_sub_pd(r31,rswitch);
1520             d                = _mm_max_pd(d,_mm_setzero_pd());
1521             d2               = _mm_mul_pd(d,d);
1522             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1523
1524             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1525
1526             /* Evaluate switch function */
1527             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1528             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
1529             velec            = _mm_mul_pd(velec,sw);
1530             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
1531
1532             /* Update potential sum for this i atom from the interaction with this j atom. */
1533             velec            = _mm_and_pd(velec,cutoff_mask);
1534             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1535             velecsum         = _mm_add_pd(velecsum,velec);
1536
1537             fscal            = felec;
1538
1539             fscal            = _mm_and_pd(fscal,cutoff_mask);
1540
1541             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1542
1543             /* Calculate temporary vectorial force */
1544             tx               = _mm_mul_pd(fscal,dx31);
1545             ty               = _mm_mul_pd(fscal,dy31);
1546             tz               = _mm_mul_pd(fscal,dz31);
1547
1548             /* Update vectorial force */
1549             fix3             = _mm_add_pd(fix3,tx);
1550             fiy3             = _mm_add_pd(fiy3,ty);
1551             fiz3             = _mm_add_pd(fiz3,tz);
1552
1553             fjx1             = _mm_add_pd(fjx1,tx);
1554             fjy1             = _mm_add_pd(fjy1,ty);
1555             fjz1             = _mm_add_pd(fjz1,tz);
1556
1557             }
1558
1559             /**************************
1560              * CALCULATE INTERACTIONS *
1561              **************************/
1562
1563             if (gmx_mm_any_lt(rsq32,rcutoff2))
1564             {
1565
1566             r32              = _mm_mul_pd(rsq32,rinv32);
1567
1568             /* EWALD ELECTROSTATICS */
1569
1570             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1571             ewrt             = _mm_mul_pd(r32,ewtabscale);
1572             ewitab           = _mm_cvttpd_epi32(ewrt);
1573             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1574             ewitab           = _mm_slli_epi32(ewitab,2);
1575             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1576             ewtabD           = _mm_setzero_pd();
1577             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1578             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1579             ewtabFn          = _mm_setzero_pd();
1580             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1581             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1582             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1583             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1584             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1585
1586             d                = _mm_sub_pd(r32,rswitch);
1587             d                = _mm_max_pd(d,_mm_setzero_pd());
1588             d2               = _mm_mul_pd(d,d);
1589             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1590
1591             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1592
1593             /* Evaluate switch function */
1594             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1595             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
1596             velec            = _mm_mul_pd(velec,sw);
1597             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
1598
1599             /* Update potential sum for this i atom from the interaction with this j atom. */
1600             velec            = _mm_and_pd(velec,cutoff_mask);
1601             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1602             velecsum         = _mm_add_pd(velecsum,velec);
1603
1604             fscal            = felec;
1605
1606             fscal            = _mm_and_pd(fscal,cutoff_mask);
1607
1608             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1609
1610             /* Calculate temporary vectorial force */
1611             tx               = _mm_mul_pd(fscal,dx32);
1612             ty               = _mm_mul_pd(fscal,dy32);
1613             tz               = _mm_mul_pd(fscal,dz32);
1614
1615             /* Update vectorial force */
1616             fix3             = _mm_add_pd(fix3,tx);
1617             fiy3             = _mm_add_pd(fiy3,ty);
1618             fiz3             = _mm_add_pd(fiz3,tz);
1619
1620             fjx2             = _mm_add_pd(fjx2,tx);
1621             fjy2             = _mm_add_pd(fjy2,ty);
1622             fjz2             = _mm_add_pd(fjz2,tz);
1623
1624             }
1625
1626             /**************************
1627              * CALCULATE INTERACTIONS *
1628              **************************/
1629
1630             if (gmx_mm_any_lt(rsq33,rcutoff2))
1631             {
1632
1633             r33              = _mm_mul_pd(rsq33,rinv33);
1634
1635             /* EWALD ELECTROSTATICS */
1636
1637             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1638             ewrt             = _mm_mul_pd(r33,ewtabscale);
1639             ewitab           = _mm_cvttpd_epi32(ewrt);
1640             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1641             ewitab           = _mm_slli_epi32(ewitab,2);
1642             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1643             ewtabD           = _mm_setzero_pd();
1644             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1645             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1646             ewtabFn          = _mm_setzero_pd();
1647             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1648             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1649             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1650             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1651             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1652
1653             d                = _mm_sub_pd(r33,rswitch);
1654             d                = _mm_max_pd(d,_mm_setzero_pd());
1655             d2               = _mm_mul_pd(d,d);
1656             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1657
1658             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1659
1660             /* Evaluate switch function */
1661             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1662             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
1663             velec            = _mm_mul_pd(velec,sw);
1664             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
1665
1666             /* Update potential sum for this i atom from the interaction with this j atom. */
1667             velec            = _mm_and_pd(velec,cutoff_mask);
1668             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1669             velecsum         = _mm_add_pd(velecsum,velec);
1670
1671             fscal            = felec;
1672
1673             fscal            = _mm_and_pd(fscal,cutoff_mask);
1674
1675             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1676
1677             /* Calculate temporary vectorial force */
1678             tx               = _mm_mul_pd(fscal,dx33);
1679             ty               = _mm_mul_pd(fscal,dy33);
1680             tz               = _mm_mul_pd(fscal,dz33);
1681
1682             /* Update vectorial force */
1683             fix3             = _mm_add_pd(fix3,tx);
1684             fiy3             = _mm_add_pd(fiy3,ty);
1685             fiz3             = _mm_add_pd(fiz3,tz);
1686
1687             fjx3             = _mm_add_pd(fjx3,tx);
1688             fjy3             = _mm_add_pd(fjy3,ty);
1689             fjz3             = _mm_add_pd(fjz3,tz);
1690
1691             }
1692
1693             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1694
1695             /* Inner loop uses 647 flops */
1696         }
1697
1698         /* End of innermost loop */
1699
1700         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1701                                               f+i_coord_offset,fshift+i_shift_offset);
1702
1703         ggid                        = gid[iidx];
1704         /* Update potential energies */
1705         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1706         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1707
1708         /* Increment number of inner iterations */
1709         inneriter                  += j_index_end - j_index_start;
1710
1711         /* Outer loop uses 26 flops */
1712     }
1713
1714     /* Increment number of outer iterations */
1715     outeriter        += nri;
1716
1717     /* Update outer/inner flops */
1718
1719     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*647);
1720 }
1721 /*
1722  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_double
1723  * Electrostatics interaction: Ewald
1724  * VdW interaction:            LennardJones
1725  * Geometry:                   Water4-Water4
1726  * Calculate force/pot:        Force
1727  */
1728 void
1729 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_double
1730                     (t_nblist * gmx_restrict                nlist,
1731                      rvec * gmx_restrict                    xx,
1732                      rvec * gmx_restrict                    ff,
1733                      t_forcerec * gmx_restrict              fr,
1734                      t_mdatoms * gmx_restrict               mdatoms,
1735                      nb_kernel_data_t * gmx_restrict        kernel_data,
1736                      t_nrnb * gmx_restrict                  nrnb)
1737 {
1738     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1739      * just 0 for non-waters.
1740      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1741      * jnr indices corresponding to data put in the four positions in the SIMD register.
1742      */
1743     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1744     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1745     int              jnrA,jnrB;
1746     int              j_coord_offsetA,j_coord_offsetB;
1747     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1748     real             rcutoff_scalar;
1749     real             *shiftvec,*fshift,*x,*f;
1750     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1751     int              vdwioffset0;
1752     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1753     int              vdwioffset1;
1754     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1755     int              vdwioffset2;
1756     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1757     int              vdwioffset3;
1758     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1759     int              vdwjidx0A,vdwjidx0B;
1760     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1761     int              vdwjidx1A,vdwjidx1B;
1762     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1763     int              vdwjidx2A,vdwjidx2B;
1764     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1765     int              vdwjidx3A,vdwjidx3B;
1766     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1767     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1768     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1769     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1770     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1771     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1772     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1773     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1774     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1775     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1776     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1777     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1778     real             *charge;
1779     int              nvdwtype;
1780     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1781     int              *vdwtype;
1782     real             *vdwparam;
1783     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1784     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1785     __m128i          ewitab;
1786     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1787     real             *ewtab;
1788     __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1789     real             rswitch_scalar,d_scalar;
1790     __m128d          dummy_mask,cutoff_mask;
1791     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1792     __m128d          one     = _mm_set1_pd(1.0);
1793     __m128d          two     = _mm_set1_pd(2.0);
1794     x                = xx[0];
1795     f                = ff[0];
1796
1797     nri              = nlist->nri;
1798     iinr             = nlist->iinr;
1799     jindex           = nlist->jindex;
1800     jjnr             = nlist->jjnr;
1801     shiftidx         = nlist->shift;
1802     gid              = nlist->gid;
1803     shiftvec         = fr->shift_vec[0];
1804     fshift           = fr->fshift[0];
1805     facel            = _mm_set1_pd(fr->epsfac);
1806     charge           = mdatoms->chargeA;
1807     nvdwtype         = fr->ntype;
1808     vdwparam         = fr->nbfp;
1809     vdwtype          = mdatoms->typeA;
1810
1811     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
1812     ewtab            = fr->ic->tabq_coul_FDV0;
1813     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
1814     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1815
1816     /* Setup water-specific parameters */
1817     inr              = nlist->iinr[0];
1818     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1819     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1820     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1821     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1822
1823     jq1              = _mm_set1_pd(charge[inr+1]);
1824     jq2              = _mm_set1_pd(charge[inr+2]);
1825     jq3              = _mm_set1_pd(charge[inr+3]);
1826     vdwjidx0A        = 2*vdwtype[inr+0];
1827     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1828     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1829     qq11             = _mm_mul_pd(iq1,jq1);
1830     qq12             = _mm_mul_pd(iq1,jq2);
1831     qq13             = _mm_mul_pd(iq1,jq3);
1832     qq21             = _mm_mul_pd(iq2,jq1);
1833     qq22             = _mm_mul_pd(iq2,jq2);
1834     qq23             = _mm_mul_pd(iq2,jq3);
1835     qq31             = _mm_mul_pd(iq3,jq1);
1836     qq32             = _mm_mul_pd(iq3,jq2);
1837     qq33             = _mm_mul_pd(iq3,jq3);
1838
1839     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1840     rcutoff_scalar   = fr->rcoulomb;
1841     rcutoff          = _mm_set1_pd(rcutoff_scalar);
1842     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
1843
1844     rswitch_scalar   = fr->rcoulomb_switch;
1845     rswitch          = _mm_set1_pd(rswitch_scalar);
1846     /* Setup switch parameters */
1847     d_scalar         = rcutoff_scalar-rswitch_scalar;
1848     d                = _mm_set1_pd(d_scalar);
1849     swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1850     swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1851     swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1852     swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1853     swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1854     swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1855
1856     /* Avoid stupid compiler warnings */
1857     jnrA = jnrB = 0;
1858     j_coord_offsetA = 0;
1859     j_coord_offsetB = 0;
1860
1861     outeriter        = 0;
1862     inneriter        = 0;
1863
1864     /* Start outer loop over neighborlists */
1865     for(iidx=0; iidx<nri; iidx++)
1866     {
1867         /* Load shift vector for this list */
1868         i_shift_offset   = DIM*shiftidx[iidx];
1869
1870         /* Load limits for loop over neighbors */
1871         j_index_start    = jindex[iidx];
1872         j_index_end      = jindex[iidx+1];
1873
1874         /* Get outer coordinate index */
1875         inr              = iinr[iidx];
1876         i_coord_offset   = DIM*inr;
1877
1878         /* Load i particle coords and add shift vector */
1879         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1880                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1881
1882         fix0             = _mm_setzero_pd();
1883         fiy0             = _mm_setzero_pd();
1884         fiz0             = _mm_setzero_pd();
1885         fix1             = _mm_setzero_pd();
1886         fiy1             = _mm_setzero_pd();
1887         fiz1             = _mm_setzero_pd();
1888         fix2             = _mm_setzero_pd();
1889         fiy2             = _mm_setzero_pd();
1890         fiz2             = _mm_setzero_pd();
1891         fix3             = _mm_setzero_pd();
1892         fiy3             = _mm_setzero_pd();
1893         fiz3             = _mm_setzero_pd();
1894
1895         /* Start inner kernel loop */
1896         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1897         {
1898
1899             /* Get j neighbor index, and coordinate index */
1900             jnrA             = jjnr[jidx];
1901             jnrB             = jjnr[jidx+1];
1902             j_coord_offsetA  = DIM*jnrA;
1903             j_coord_offsetB  = DIM*jnrB;
1904             
1905             /* load j atom coordinates */
1906             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1907                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1908                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1909             
1910             /* Calculate displacement vector */
1911             dx00             = _mm_sub_pd(ix0,jx0);
1912             dy00             = _mm_sub_pd(iy0,jy0);
1913             dz00             = _mm_sub_pd(iz0,jz0);
1914             dx11             = _mm_sub_pd(ix1,jx1);
1915             dy11             = _mm_sub_pd(iy1,jy1);
1916             dz11             = _mm_sub_pd(iz1,jz1);
1917             dx12             = _mm_sub_pd(ix1,jx2);
1918             dy12             = _mm_sub_pd(iy1,jy2);
1919             dz12             = _mm_sub_pd(iz1,jz2);
1920             dx13             = _mm_sub_pd(ix1,jx3);
1921             dy13             = _mm_sub_pd(iy1,jy3);
1922             dz13             = _mm_sub_pd(iz1,jz3);
1923             dx21             = _mm_sub_pd(ix2,jx1);
1924             dy21             = _mm_sub_pd(iy2,jy1);
1925             dz21             = _mm_sub_pd(iz2,jz1);
1926             dx22             = _mm_sub_pd(ix2,jx2);
1927             dy22             = _mm_sub_pd(iy2,jy2);
1928             dz22             = _mm_sub_pd(iz2,jz2);
1929             dx23             = _mm_sub_pd(ix2,jx3);
1930             dy23             = _mm_sub_pd(iy2,jy3);
1931             dz23             = _mm_sub_pd(iz2,jz3);
1932             dx31             = _mm_sub_pd(ix3,jx1);
1933             dy31             = _mm_sub_pd(iy3,jy1);
1934             dz31             = _mm_sub_pd(iz3,jz1);
1935             dx32             = _mm_sub_pd(ix3,jx2);
1936             dy32             = _mm_sub_pd(iy3,jy2);
1937             dz32             = _mm_sub_pd(iz3,jz2);
1938             dx33             = _mm_sub_pd(ix3,jx3);
1939             dy33             = _mm_sub_pd(iy3,jy3);
1940             dz33             = _mm_sub_pd(iz3,jz3);
1941
1942             /* Calculate squared distance and things based on it */
1943             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1944             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1945             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1946             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1947             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1948             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1949             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1950             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1951             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1952             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1953
1954             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1955             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1956             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1957             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1958             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1959             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1960             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1961             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1962             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1963             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1964
1965             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1966             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1967             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1968             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1969             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1970             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1971             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1972             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1973             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1974             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1975
1976             fjx0             = _mm_setzero_pd();
1977             fjy0             = _mm_setzero_pd();
1978             fjz0             = _mm_setzero_pd();
1979             fjx1             = _mm_setzero_pd();
1980             fjy1             = _mm_setzero_pd();
1981             fjz1             = _mm_setzero_pd();
1982             fjx2             = _mm_setzero_pd();
1983             fjy2             = _mm_setzero_pd();
1984             fjz2             = _mm_setzero_pd();
1985             fjx3             = _mm_setzero_pd();
1986             fjy3             = _mm_setzero_pd();
1987             fjz3             = _mm_setzero_pd();
1988
1989             /**************************
1990              * CALCULATE INTERACTIONS *
1991              **************************/
1992
1993             if (gmx_mm_any_lt(rsq00,rcutoff2))
1994             {
1995
1996             r00              = _mm_mul_pd(rsq00,rinv00);
1997
1998             /* LENNARD-JONES DISPERSION/REPULSION */
1999
2000             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2001             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
2002             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2003             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
2004             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2005
2006             d                = _mm_sub_pd(r00,rswitch);
2007             d                = _mm_max_pd(d,_mm_setzero_pd());
2008             d2               = _mm_mul_pd(d,d);
2009             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2010
2011             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2012
2013             /* Evaluate switch function */
2014             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2015             fvdw             = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2016             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
2017
2018             fscal            = fvdw;
2019
2020             fscal            = _mm_and_pd(fscal,cutoff_mask);
2021
2022             /* Calculate temporary vectorial force */
2023             tx               = _mm_mul_pd(fscal,dx00);
2024             ty               = _mm_mul_pd(fscal,dy00);
2025             tz               = _mm_mul_pd(fscal,dz00);
2026
2027             /* Update vectorial force */
2028             fix0             = _mm_add_pd(fix0,tx);
2029             fiy0             = _mm_add_pd(fiy0,ty);
2030             fiz0             = _mm_add_pd(fiz0,tz);
2031
2032             fjx0             = _mm_add_pd(fjx0,tx);
2033             fjy0             = _mm_add_pd(fjy0,ty);
2034             fjz0             = _mm_add_pd(fjz0,tz);
2035
2036             }
2037
2038             /**************************
2039              * CALCULATE INTERACTIONS *
2040              **************************/
2041
2042             if (gmx_mm_any_lt(rsq11,rcutoff2))
2043             {
2044
2045             r11              = _mm_mul_pd(rsq11,rinv11);
2046
2047             /* EWALD ELECTROSTATICS */
2048
2049             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2050             ewrt             = _mm_mul_pd(r11,ewtabscale);
2051             ewitab           = _mm_cvttpd_epi32(ewrt);
2052             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2053             ewitab           = _mm_slli_epi32(ewitab,2);
2054             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2055             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2056             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2057             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2058             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2059             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2060             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2061             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2062             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2063             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2064
2065             d                = _mm_sub_pd(r11,rswitch);
2066             d                = _mm_max_pd(d,_mm_setzero_pd());
2067             d2               = _mm_mul_pd(d,d);
2068             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2069
2070             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2071
2072             /* Evaluate switch function */
2073             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2074             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2075             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2076
2077             fscal            = felec;
2078
2079             fscal            = _mm_and_pd(fscal,cutoff_mask);
2080
2081             /* Calculate temporary vectorial force */
2082             tx               = _mm_mul_pd(fscal,dx11);
2083             ty               = _mm_mul_pd(fscal,dy11);
2084             tz               = _mm_mul_pd(fscal,dz11);
2085
2086             /* Update vectorial force */
2087             fix1             = _mm_add_pd(fix1,tx);
2088             fiy1             = _mm_add_pd(fiy1,ty);
2089             fiz1             = _mm_add_pd(fiz1,tz);
2090
2091             fjx1             = _mm_add_pd(fjx1,tx);
2092             fjy1             = _mm_add_pd(fjy1,ty);
2093             fjz1             = _mm_add_pd(fjz1,tz);
2094
2095             }
2096
2097             /**************************
2098              * CALCULATE INTERACTIONS *
2099              **************************/
2100
2101             if (gmx_mm_any_lt(rsq12,rcutoff2))
2102             {
2103
2104             r12              = _mm_mul_pd(rsq12,rinv12);
2105
2106             /* EWALD ELECTROSTATICS */
2107
2108             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2109             ewrt             = _mm_mul_pd(r12,ewtabscale);
2110             ewitab           = _mm_cvttpd_epi32(ewrt);
2111             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2112             ewitab           = _mm_slli_epi32(ewitab,2);
2113             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2114             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2115             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2116             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2117             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2118             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2119             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2120             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2121             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2122             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2123
2124             d                = _mm_sub_pd(r12,rswitch);
2125             d                = _mm_max_pd(d,_mm_setzero_pd());
2126             d2               = _mm_mul_pd(d,d);
2127             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2128
2129             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2130
2131             /* Evaluate switch function */
2132             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2133             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2134             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2135
2136             fscal            = felec;
2137
2138             fscal            = _mm_and_pd(fscal,cutoff_mask);
2139
2140             /* Calculate temporary vectorial force */
2141             tx               = _mm_mul_pd(fscal,dx12);
2142             ty               = _mm_mul_pd(fscal,dy12);
2143             tz               = _mm_mul_pd(fscal,dz12);
2144
2145             /* Update vectorial force */
2146             fix1             = _mm_add_pd(fix1,tx);
2147             fiy1             = _mm_add_pd(fiy1,ty);
2148             fiz1             = _mm_add_pd(fiz1,tz);
2149
2150             fjx2             = _mm_add_pd(fjx2,tx);
2151             fjy2             = _mm_add_pd(fjy2,ty);
2152             fjz2             = _mm_add_pd(fjz2,tz);
2153
2154             }
2155
2156             /**************************
2157              * CALCULATE INTERACTIONS *
2158              **************************/
2159
2160             if (gmx_mm_any_lt(rsq13,rcutoff2))
2161             {
2162
2163             r13              = _mm_mul_pd(rsq13,rinv13);
2164
2165             /* EWALD ELECTROSTATICS */
2166
2167             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2168             ewrt             = _mm_mul_pd(r13,ewtabscale);
2169             ewitab           = _mm_cvttpd_epi32(ewrt);
2170             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2171             ewitab           = _mm_slli_epi32(ewitab,2);
2172             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2173             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2174             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2175             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2176             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2177             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2178             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2179             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2180             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2181             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2182
2183             d                = _mm_sub_pd(r13,rswitch);
2184             d                = _mm_max_pd(d,_mm_setzero_pd());
2185             d2               = _mm_mul_pd(d,d);
2186             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2187
2188             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2189
2190             /* Evaluate switch function */
2191             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2192             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2193             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
2194
2195             fscal            = felec;
2196
2197             fscal            = _mm_and_pd(fscal,cutoff_mask);
2198
2199             /* Calculate temporary vectorial force */
2200             tx               = _mm_mul_pd(fscal,dx13);
2201             ty               = _mm_mul_pd(fscal,dy13);
2202             tz               = _mm_mul_pd(fscal,dz13);
2203
2204             /* Update vectorial force */
2205             fix1             = _mm_add_pd(fix1,tx);
2206             fiy1             = _mm_add_pd(fiy1,ty);
2207             fiz1             = _mm_add_pd(fiz1,tz);
2208
2209             fjx3             = _mm_add_pd(fjx3,tx);
2210             fjy3             = _mm_add_pd(fjy3,ty);
2211             fjz3             = _mm_add_pd(fjz3,tz);
2212
2213             }
2214
2215             /**************************
2216              * CALCULATE INTERACTIONS *
2217              **************************/
2218
2219             if (gmx_mm_any_lt(rsq21,rcutoff2))
2220             {
2221
2222             r21              = _mm_mul_pd(rsq21,rinv21);
2223
2224             /* EWALD ELECTROSTATICS */
2225
2226             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2227             ewrt             = _mm_mul_pd(r21,ewtabscale);
2228             ewitab           = _mm_cvttpd_epi32(ewrt);
2229             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2230             ewitab           = _mm_slli_epi32(ewitab,2);
2231             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2232             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2233             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2234             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2235             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2236             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2237             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2238             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2239             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2240             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2241
2242             d                = _mm_sub_pd(r21,rswitch);
2243             d                = _mm_max_pd(d,_mm_setzero_pd());
2244             d2               = _mm_mul_pd(d,d);
2245             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2246
2247             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2248
2249             /* Evaluate switch function */
2250             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2251             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2252             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2253
2254             fscal            = felec;
2255
2256             fscal            = _mm_and_pd(fscal,cutoff_mask);
2257
2258             /* Calculate temporary vectorial force */
2259             tx               = _mm_mul_pd(fscal,dx21);
2260             ty               = _mm_mul_pd(fscal,dy21);
2261             tz               = _mm_mul_pd(fscal,dz21);
2262
2263             /* Update vectorial force */
2264             fix2             = _mm_add_pd(fix2,tx);
2265             fiy2             = _mm_add_pd(fiy2,ty);
2266             fiz2             = _mm_add_pd(fiz2,tz);
2267
2268             fjx1             = _mm_add_pd(fjx1,tx);
2269             fjy1             = _mm_add_pd(fjy1,ty);
2270             fjz1             = _mm_add_pd(fjz1,tz);
2271
2272             }
2273
2274             /**************************
2275              * CALCULATE INTERACTIONS *
2276              **************************/
2277
2278             if (gmx_mm_any_lt(rsq22,rcutoff2))
2279             {
2280
2281             r22              = _mm_mul_pd(rsq22,rinv22);
2282
2283             /* EWALD ELECTROSTATICS */
2284
2285             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2286             ewrt             = _mm_mul_pd(r22,ewtabscale);
2287             ewitab           = _mm_cvttpd_epi32(ewrt);
2288             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2289             ewitab           = _mm_slli_epi32(ewitab,2);
2290             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2291             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2292             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2293             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2294             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2295             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2296             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2297             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2298             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2299             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2300
2301             d                = _mm_sub_pd(r22,rswitch);
2302             d                = _mm_max_pd(d,_mm_setzero_pd());
2303             d2               = _mm_mul_pd(d,d);
2304             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2305
2306             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2307
2308             /* Evaluate switch function */
2309             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2310             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2311             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2312
2313             fscal            = felec;
2314
2315             fscal            = _mm_and_pd(fscal,cutoff_mask);
2316
2317             /* Calculate temporary vectorial force */
2318             tx               = _mm_mul_pd(fscal,dx22);
2319             ty               = _mm_mul_pd(fscal,dy22);
2320             tz               = _mm_mul_pd(fscal,dz22);
2321
2322             /* Update vectorial force */
2323             fix2             = _mm_add_pd(fix2,tx);
2324             fiy2             = _mm_add_pd(fiy2,ty);
2325             fiz2             = _mm_add_pd(fiz2,tz);
2326
2327             fjx2             = _mm_add_pd(fjx2,tx);
2328             fjy2             = _mm_add_pd(fjy2,ty);
2329             fjz2             = _mm_add_pd(fjz2,tz);
2330
2331             }
2332
2333             /**************************
2334              * CALCULATE INTERACTIONS *
2335              **************************/
2336
2337             if (gmx_mm_any_lt(rsq23,rcutoff2))
2338             {
2339
2340             r23              = _mm_mul_pd(rsq23,rinv23);
2341
2342             /* EWALD ELECTROSTATICS */
2343
2344             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2345             ewrt             = _mm_mul_pd(r23,ewtabscale);
2346             ewitab           = _mm_cvttpd_epi32(ewrt);
2347             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2348             ewitab           = _mm_slli_epi32(ewitab,2);
2349             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2350             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2351             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2352             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2353             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2354             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2355             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2356             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2357             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
2358             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2359
2360             d                = _mm_sub_pd(r23,rswitch);
2361             d                = _mm_max_pd(d,_mm_setzero_pd());
2362             d2               = _mm_mul_pd(d,d);
2363             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2364
2365             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2366
2367             /* Evaluate switch function */
2368             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2369             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
2370             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
2371
2372             fscal            = felec;
2373
2374             fscal            = _mm_and_pd(fscal,cutoff_mask);
2375
2376             /* Calculate temporary vectorial force */
2377             tx               = _mm_mul_pd(fscal,dx23);
2378             ty               = _mm_mul_pd(fscal,dy23);
2379             tz               = _mm_mul_pd(fscal,dz23);
2380
2381             /* Update vectorial force */
2382             fix2             = _mm_add_pd(fix2,tx);
2383             fiy2             = _mm_add_pd(fiy2,ty);
2384             fiz2             = _mm_add_pd(fiz2,tz);
2385
2386             fjx3             = _mm_add_pd(fjx3,tx);
2387             fjy3             = _mm_add_pd(fjy3,ty);
2388             fjz3             = _mm_add_pd(fjz3,tz);
2389
2390             }
2391
2392             /**************************
2393              * CALCULATE INTERACTIONS *
2394              **************************/
2395
2396             if (gmx_mm_any_lt(rsq31,rcutoff2))
2397             {
2398
2399             r31              = _mm_mul_pd(rsq31,rinv31);
2400
2401             /* EWALD ELECTROSTATICS */
2402
2403             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2404             ewrt             = _mm_mul_pd(r31,ewtabscale);
2405             ewitab           = _mm_cvttpd_epi32(ewrt);
2406             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2407             ewitab           = _mm_slli_epi32(ewitab,2);
2408             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2409             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2410             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2411             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2412             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2413             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2414             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2415             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2416             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
2417             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2418
2419             d                = _mm_sub_pd(r31,rswitch);
2420             d                = _mm_max_pd(d,_mm_setzero_pd());
2421             d2               = _mm_mul_pd(d,d);
2422             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2423
2424             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2425
2426             /* Evaluate switch function */
2427             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2428             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
2429             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
2430
2431             fscal            = felec;
2432
2433             fscal            = _mm_and_pd(fscal,cutoff_mask);
2434
2435             /* Calculate temporary vectorial force */
2436             tx               = _mm_mul_pd(fscal,dx31);
2437             ty               = _mm_mul_pd(fscal,dy31);
2438             tz               = _mm_mul_pd(fscal,dz31);
2439
2440             /* Update vectorial force */
2441             fix3             = _mm_add_pd(fix3,tx);
2442             fiy3             = _mm_add_pd(fiy3,ty);
2443             fiz3             = _mm_add_pd(fiz3,tz);
2444
2445             fjx1             = _mm_add_pd(fjx1,tx);
2446             fjy1             = _mm_add_pd(fjy1,ty);
2447             fjz1             = _mm_add_pd(fjz1,tz);
2448
2449             }
2450
2451             /**************************
2452              * CALCULATE INTERACTIONS *
2453              **************************/
2454
2455             if (gmx_mm_any_lt(rsq32,rcutoff2))
2456             {
2457
2458             r32              = _mm_mul_pd(rsq32,rinv32);
2459
2460             /* EWALD ELECTROSTATICS */
2461
2462             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2463             ewrt             = _mm_mul_pd(r32,ewtabscale);
2464             ewitab           = _mm_cvttpd_epi32(ewrt);
2465             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2466             ewitab           = _mm_slli_epi32(ewitab,2);
2467             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2468             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2469             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2470             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2471             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2472             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2473             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2474             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2475             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
2476             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2477
2478             d                = _mm_sub_pd(r32,rswitch);
2479             d                = _mm_max_pd(d,_mm_setzero_pd());
2480             d2               = _mm_mul_pd(d,d);
2481             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2482
2483             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2484
2485             /* Evaluate switch function */
2486             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2487             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
2488             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
2489
2490             fscal            = felec;
2491
2492             fscal            = _mm_and_pd(fscal,cutoff_mask);
2493
2494             /* Calculate temporary vectorial force */
2495             tx               = _mm_mul_pd(fscal,dx32);
2496             ty               = _mm_mul_pd(fscal,dy32);
2497             tz               = _mm_mul_pd(fscal,dz32);
2498
2499             /* Update vectorial force */
2500             fix3             = _mm_add_pd(fix3,tx);
2501             fiy3             = _mm_add_pd(fiy3,ty);
2502             fiz3             = _mm_add_pd(fiz3,tz);
2503
2504             fjx2             = _mm_add_pd(fjx2,tx);
2505             fjy2             = _mm_add_pd(fjy2,ty);
2506             fjz2             = _mm_add_pd(fjz2,tz);
2507
2508             }
2509
2510             /**************************
2511              * CALCULATE INTERACTIONS *
2512              **************************/
2513
2514             if (gmx_mm_any_lt(rsq33,rcutoff2))
2515             {
2516
2517             r33              = _mm_mul_pd(rsq33,rinv33);
2518
2519             /* EWALD ELECTROSTATICS */
2520
2521             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2522             ewrt             = _mm_mul_pd(r33,ewtabscale);
2523             ewitab           = _mm_cvttpd_epi32(ewrt);
2524             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2525             ewitab           = _mm_slli_epi32(ewitab,2);
2526             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2527             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2528             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2529             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2530             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2531             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2532             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2533             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2534             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
2535             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2536
2537             d                = _mm_sub_pd(r33,rswitch);
2538             d                = _mm_max_pd(d,_mm_setzero_pd());
2539             d2               = _mm_mul_pd(d,d);
2540             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2541
2542             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2543
2544             /* Evaluate switch function */
2545             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2546             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
2547             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
2548
2549             fscal            = felec;
2550
2551             fscal            = _mm_and_pd(fscal,cutoff_mask);
2552
2553             /* Calculate temporary vectorial force */
2554             tx               = _mm_mul_pd(fscal,dx33);
2555             ty               = _mm_mul_pd(fscal,dy33);
2556             tz               = _mm_mul_pd(fscal,dz33);
2557
2558             /* Update vectorial force */
2559             fix3             = _mm_add_pd(fix3,tx);
2560             fiy3             = _mm_add_pd(fiy3,ty);
2561             fiz3             = _mm_add_pd(fiz3,tz);
2562
2563             fjx3             = _mm_add_pd(fjx3,tx);
2564             fjy3             = _mm_add_pd(fjy3,ty);
2565             fjz3             = _mm_add_pd(fjz3,tz);
2566
2567             }
2568
2569             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2570
2571             /* Inner loop uses 617 flops */
2572         }
2573
2574         if(jidx<j_index_end)
2575         {
2576
2577             jnrA             = jjnr[jidx];
2578             j_coord_offsetA  = DIM*jnrA;
2579
2580             /* load j atom coordinates */
2581             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2582                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2583                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2584             
2585             /* Calculate displacement vector */
2586             dx00             = _mm_sub_pd(ix0,jx0);
2587             dy00             = _mm_sub_pd(iy0,jy0);
2588             dz00             = _mm_sub_pd(iz0,jz0);
2589             dx11             = _mm_sub_pd(ix1,jx1);
2590             dy11             = _mm_sub_pd(iy1,jy1);
2591             dz11             = _mm_sub_pd(iz1,jz1);
2592             dx12             = _mm_sub_pd(ix1,jx2);
2593             dy12             = _mm_sub_pd(iy1,jy2);
2594             dz12             = _mm_sub_pd(iz1,jz2);
2595             dx13             = _mm_sub_pd(ix1,jx3);
2596             dy13             = _mm_sub_pd(iy1,jy3);
2597             dz13             = _mm_sub_pd(iz1,jz3);
2598             dx21             = _mm_sub_pd(ix2,jx1);
2599             dy21             = _mm_sub_pd(iy2,jy1);
2600             dz21             = _mm_sub_pd(iz2,jz1);
2601             dx22             = _mm_sub_pd(ix2,jx2);
2602             dy22             = _mm_sub_pd(iy2,jy2);
2603             dz22             = _mm_sub_pd(iz2,jz2);
2604             dx23             = _mm_sub_pd(ix2,jx3);
2605             dy23             = _mm_sub_pd(iy2,jy3);
2606             dz23             = _mm_sub_pd(iz2,jz3);
2607             dx31             = _mm_sub_pd(ix3,jx1);
2608             dy31             = _mm_sub_pd(iy3,jy1);
2609             dz31             = _mm_sub_pd(iz3,jz1);
2610             dx32             = _mm_sub_pd(ix3,jx2);
2611             dy32             = _mm_sub_pd(iy3,jy2);
2612             dz32             = _mm_sub_pd(iz3,jz2);
2613             dx33             = _mm_sub_pd(ix3,jx3);
2614             dy33             = _mm_sub_pd(iy3,jy3);
2615             dz33             = _mm_sub_pd(iz3,jz3);
2616
2617             /* Calculate squared distance and things based on it */
2618             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2619             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2620             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2621             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2622             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2623             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2624             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2625             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2626             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2627             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2628
2629             rinv00           = gmx_mm_invsqrt_pd(rsq00);
2630             rinv11           = gmx_mm_invsqrt_pd(rsq11);
2631             rinv12           = gmx_mm_invsqrt_pd(rsq12);
2632             rinv13           = gmx_mm_invsqrt_pd(rsq13);
2633             rinv21           = gmx_mm_invsqrt_pd(rsq21);
2634             rinv22           = gmx_mm_invsqrt_pd(rsq22);
2635             rinv23           = gmx_mm_invsqrt_pd(rsq23);
2636             rinv31           = gmx_mm_invsqrt_pd(rsq31);
2637             rinv32           = gmx_mm_invsqrt_pd(rsq32);
2638             rinv33           = gmx_mm_invsqrt_pd(rsq33);
2639
2640             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
2641             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
2642             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
2643             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
2644             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
2645             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
2646             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
2647             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
2648             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
2649             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
2650
2651             fjx0             = _mm_setzero_pd();
2652             fjy0             = _mm_setzero_pd();
2653             fjz0             = _mm_setzero_pd();
2654             fjx1             = _mm_setzero_pd();
2655             fjy1             = _mm_setzero_pd();
2656             fjz1             = _mm_setzero_pd();
2657             fjx2             = _mm_setzero_pd();
2658             fjy2             = _mm_setzero_pd();
2659             fjz2             = _mm_setzero_pd();
2660             fjx3             = _mm_setzero_pd();
2661             fjy3             = _mm_setzero_pd();
2662             fjz3             = _mm_setzero_pd();
2663
2664             /**************************
2665              * CALCULATE INTERACTIONS *
2666              **************************/
2667
2668             if (gmx_mm_any_lt(rsq00,rcutoff2))
2669             {
2670
2671             r00              = _mm_mul_pd(rsq00,rinv00);
2672
2673             /* LENNARD-JONES DISPERSION/REPULSION */
2674
2675             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2676             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
2677             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2678             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
2679             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2680
2681             d                = _mm_sub_pd(r00,rswitch);
2682             d                = _mm_max_pd(d,_mm_setzero_pd());
2683             d2               = _mm_mul_pd(d,d);
2684             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2685
2686             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2687
2688             /* Evaluate switch function */
2689             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2690             fvdw             = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2691             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
2692
2693             fscal            = fvdw;
2694
2695             fscal            = _mm_and_pd(fscal,cutoff_mask);
2696
2697             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2698
2699             /* Calculate temporary vectorial force */
2700             tx               = _mm_mul_pd(fscal,dx00);
2701             ty               = _mm_mul_pd(fscal,dy00);
2702             tz               = _mm_mul_pd(fscal,dz00);
2703
2704             /* Update vectorial force */
2705             fix0             = _mm_add_pd(fix0,tx);
2706             fiy0             = _mm_add_pd(fiy0,ty);
2707             fiz0             = _mm_add_pd(fiz0,tz);
2708
2709             fjx0             = _mm_add_pd(fjx0,tx);
2710             fjy0             = _mm_add_pd(fjy0,ty);
2711             fjz0             = _mm_add_pd(fjz0,tz);
2712
2713             }
2714
2715             /**************************
2716              * CALCULATE INTERACTIONS *
2717              **************************/
2718
2719             if (gmx_mm_any_lt(rsq11,rcutoff2))
2720             {
2721
2722             r11              = _mm_mul_pd(rsq11,rinv11);
2723
2724             /* EWALD ELECTROSTATICS */
2725
2726             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2727             ewrt             = _mm_mul_pd(r11,ewtabscale);
2728             ewitab           = _mm_cvttpd_epi32(ewrt);
2729             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2730             ewitab           = _mm_slli_epi32(ewitab,2);
2731             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2732             ewtabD           = _mm_setzero_pd();
2733             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2734             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2735             ewtabFn          = _mm_setzero_pd();
2736             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2737             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2738             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2739             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2740             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2741
2742             d                = _mm_sub_pd(r11,rswitch);
2743             d                = _mm_max_pd(d,_mm_setzero_pd());
2744             d2               = _mm_mul_pd(d,d);
2745             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2746
2747             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2748
2749             /* Evaluate switch function */
2750             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2751             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2752             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2753
2754             fscal            = felec;
2755
2756             fscal            = _mm_and_pd(fscal,cutoff_mask);
2757
2758             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2759
2760             /* Calculate temporary vectorial force */
2761             tx               = _mm_mul_pd(fscal,dx11);
2762             ty               = _mm_mul_pd(fscal,dy11);
2763             tz               = _mm_mul_pd(fscal,dz11);
2764
2765             /* Update vectorial force */
2766             fix1             = _mm_add_pd(fix1,tx);
2767             fiy1             = _mm_add_pd(fiy1,ty);
2768             fiz1             = _mm_add_pd(fiz1,tz);
2769
2770             fjx1             = _mm_add_pd(fjx1,tx);
2771             fjy1             = _mm_add_pd(fjy1,ty);
2772             fjz1             = _mm_add_pd(fjz1,tz);
2773
2774             }
2775
2776             /**************************
2777              * CALCULATE INTERACTIONS *
2778              **************************/
2779
2780             if (gmx_mm_any_lt(rsq12,rcutoff2))
2781             {
2782
2783             r12              = _mm_mul_pd(rsq12,rinv12);
2784
2785             /* EWALD ELECTROSTATICS */
2786
2787             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2788             ewrt             = _mm_mul_pd(r12,ewtabscale);
2789             ewitab           = _mm_cvttpd_epi32(ewrt);
2790             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2791             ewitab           = _mm_slli_epi32(ewitab,2);
2792             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2793             ewtabD           = _mm_setzero_pd();
2794             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2795             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2796             ewtabFn          = _mm_setzero_pd();
2797             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2798             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2799             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2800             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2801             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2802
2803             d                = _mm_sub_pd(r12,rswitch);
2804             d                = _mm_max_pd(d,_mm_setzero_pd());
2805             d2               = _mm_mul_pd(d,d);
2806             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2807
2808             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2809
2810             /* Evaluate switch function */
2811             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2812             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2813             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2814
2815             fscal            = felec;
2816
2817             fscal            = _mm_and_pd(fscal,cutoff_mask);
2818
2819             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2820
2821             /* Calculate temporary vectorial force */
2822             tx               = _mm_mul_pd(fscal,dx12);
2823             ty               = _mm_mul_pd(fscal,dy12);
2824             tz               = _mm_mul_pd(fscal,dz12);
2825
2826             /* Update vectorial force */
2827             fix1             = _mm_add_pd(fix1,tx);
2828             fiy1             = _mm_add_pd(fiy1,ty);
2829             fiz1             = _mm_add_pd(fiz1,tz);
2830
2831             fjx2             = _mm_add_pd(fjx2,tx);
2832             fjy2             = _mm_add_pd(fjy2,ty);
2833             fjz2             = _mm_add_pd(fjz2,tz);
2834
2835             }
2836
2837             /**************************
2838              * CALCULATE INTERACTIONS *
2839              **************************/
2840
2841             if (gmx_mm_any_lt(rsq13,rcutoff2))
2842             {
2843
2844             r13              = _mm_mul_pd(rsq13,rinv13);
2845
2846             /* EWALD ELECTROSTATICS */
2847
2848             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2849             ewrt             = _mm_mul_pd(r13,ewtabscale);
2850             ewitab           = _mm_cvttpd_epi32(ewrt);
2851             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2852             ewitab           = _mm_slli_epi32(ewitab,2);
2853             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2854             ewtabD           = _mm_setzero_pd();
2855             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2856             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2857             ewtabFn          = _mm_setzero_pd();
2858             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2859             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2860             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2861             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2862             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2863
2864             d                = _mm_sub_pd(r13,rswitch);
2865             d                = _mm_max_pd(d,_mm_setzero_pd());
2866             d2               = _mm_mul_pd(d,d);
2867             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2868
2869             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2870
2871             /* Evaluate switch function */
2872             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2873             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2874             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
2875
2876             fscal            = felec;
2877
2878             fscal            = _mm_and_pd(fscal,cutoff_mask);
2879
2880             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2881
2882             /* Calculate temporary vectorial force */
2883             tx               = _mm_mul_pd(fscal,dx13);
2884             ty               = _mm_mul_pd(fscal,dy13);
2885             tz               = _mm_mul_pd(fscal,dz13);
2886
2887             /* Update vectorial force */
2888             fix1             = _mm_add_pd(fix1,tx);
2889             fiy1             = _mm_add_pd(fiy1,ty);
2890             fiz1             = _mm_add_pd(fiz1,tz);
2891
2892             fjx3             = _mm_add_pd(fjx3,tx);
2893             fjy3             = _mm_add_pd(fjy3,ty);
2894             fjz3             = _mm_add_pd(fjz3,tz);
2895
2896             }
2897
2898             /**************************
2899              * CALCULATE INTERACTIONS *
2900              **************************/
2901
2902             if (gmx_mm_any_lt(rsq21,rcutoff2))
2903             {
2904
2905             r21              = _mm_mul_pd(rsq21,rinv21);
2906
2907             /* EWALD ELECTROSTATICS */
2908
2909             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2910             ewrt             = _mm_mul_pd(r21,ewtabscale);
2911             ewitab           = _mm_cvttpd_epi32(ewrt);
2912             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2913             ewitab           = _mm_slli_epi32(ewitab,2);
2914             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2915             ewtabD           = _mm_setzero_pd();
2916             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2917             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2918             ewtabFn          = _mm_setzero_pd();
2919             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2920             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2921             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2922             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2923             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2924
2925             d                = _mm_sub_pd(r21,rswitch);
2926             d                = _mm_max_pd(d,_mm_setzero_pd());
2927             d2               = _mm_mul_pd(d,d);
2928             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2929
2930             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2931
2932             /* Evaluate switch function */
2933             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2934             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2935             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2936
2937             fscal            = felec;
2938
2939             fscal            = _mm_and_pd(fscal,cutoff_mask);
2940
2941             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2942
2943             /* Calculate temporary vectorial force */
2944             tx               = _mm_mul_pd(fscal,dx21);
2945             ty               = _mm_mul_pd(fscal,dy21);
2946             tz               = _mm_mul_pd(fscal,dz21);
2947
2948             /* Update vectorial force */
2949             fix2             = _mm_add_pd(fix2,tx);
2950             fiy2             = _mm_add_pd(fiy2,ty);
2951             fiz2             = _mm_add_pd(fiz2,tz);
2952
2953             fjx1             = _mm_add_pd(fjx1,tx);
2954             fjy1             = _mm_add_pd(fjy1,ty);
2955             fjz1             = _mm_add_pd(fjz1,tz);
2956
2957             }
2958
2959             /**************************
2960              * CALCULATE INTERACTIONS *
2961              **************************/
2962
2963             if (gmx_mm_any_lt(rsq22,rcutoff2))
2964             {
2965
2966             r22              = _mm_mul_pd(rsq22,rinv22);
2967
2968             /* EWALD ELECTROSTATICS */
2969
2970             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2971             ewrt             = _mm_mul_pd(r22,ewtabscale);
2972             ewitab           = _mm_cvttpd_epi32(ewrt);
2973             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2974             ewitab           = _mm_slli_epi32(ewitab,2);
2975             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2976             ewtabD           = _mm_setzero_pd();
2977             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2978             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2979             ewtabFn          = _mm_setzero_pd();
2980             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2981             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2982             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2983             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2984             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2985
2986             d                = _mm_sub_pd(r22,rswitch);
2987             d                = _mm_max_pd(d,_mm_setzero_pd());
2988             d2               = _mm_mul_pd(d,d);
2989             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2990
2991             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2992
2993             /* Evaluate switch function */
2994             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2995             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2996             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2997
2998             fscal            = felec;
2999
3000             fscal            = _mm_and_pd(fscal,cutoff_mask);
3001
3002             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3003
3004             /* Calculate temporary vectorial force */
3005             tx               = _mm_mul_pd(fscal,dx22);
3006             ty               = _mm_mul_pd(fscal,dy22);
3007             tz               = _mm_mul_pd(fscal,dz22);
3008
3009             /* Update vectorial force */
3010             fix2             = _mm_add_pd(fix2,tx);
3011             fiy2             = _mm_add_pd(fiy2,ty);
3012             fiz2             = _mm_add_pd(fiz2,tz);
3013
3014             fjx2             = _mm_add_pd(fjx2,tx);
3015             fjy2             = _mm_add_pd(fjy2,ty);
3016             fjz2             = _mm_add_pd(fjz2,tz);
3017
3018             }
3019
3020             /**************************
3021              * CALCULATE INTERACTIONS *
3022              **************************/
3023
3024             if (gmx_mm_any_lt(rsq23,rcutoff2))
3025             {
3026
3027             r23              = _mm_mul_pd(rsq23,rinv23);
3028
3029             /* EWALD ELECTROSTATICS */
3030
3031             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3032             ewrt             = _mm_mul_pd(r23,ewtabscale);
3033             ewitab           = _mm_cvttpd_epi32(ewrt);
3034             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
3035             ewitab           = _mm_slli_epi32(ewitab,2);
3036             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3037             ewtabD           = _mm_setzero_pd();
3038             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3039             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3040             ewtabFn          = _mm_setzero_pd();
3041             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3042             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3043             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3044             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
3045             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
3046
3047             d                = _mm_sub_pd(r23,rswitch);
3048             d                = _mm_max_pd(d,_mm_setzero_pd());
3049             d2               = _mm_mul_pd(d,d);
3050             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3051
3052             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3053
3054             /* Evaluate switch function */
3055             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3056             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
3057             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
3058
3059             fscal            = felec;
3060
3061             fscal            = _mm_and_pd(fscal,cutoff_mask);
3062
3063             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3064
3065             /* Calculate temporary vectorial force */
3066             tx               = _mm_mul_pd(fscal,dx23);
3067             ty               = _mm_mul_pd(fscal,dy23);
3068             tz               = _mm_mul_pd(fscal,dz23);
3069
3070             /* Update vectorial force */
3071             fix2             = _mm_add_pd(fix2,tx);
3072             fiy2             = _mm_add_pd(fiy2,ty);
3073             fiz2             = _mm_add_pd(fiz2,tz);
3074
3075             fjx3             = _mm_add_pd(fjx3,tx);
3076             fjy3             = _mm_add_pd(fjy3,ty);
3077             fjz3             = _mm_add_pd(fjz3,tz);
3078
3079             }
3080
3081             /**************************
3082              * CALCULATE INTERACTIONS *
3083              **************************/
3084
3085             if (gmx_mm_any_lt(rsq31,rcutoff2))
3086             {
3087
3088             r31              = _mm_mul_pd(rsq31,rinv31);
3089
3090             /* EWALD ELECTROSTATICS */
3091
3092             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3093             ewrt             = _mm_mul_pd(r31,ewtabscale);
3094             ewitab           = _mm_cvttpd_epi32(ewrt);
3095             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
3096             ewitab           = _mm_slli_epi32(ewitab,2);
3097             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3098             ewtabD           = _mm_setzero_pd();
3099             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3100             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3101             ewtabFn          = _mm_setzero_pd();
3102             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3103             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3104             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3105             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
3106             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
3107
3108             d                = _mm_sub_pd(r31,rswitch);
3109             d                = _mm_max_pd(d,_mm_setzero_pd());
3110             d2               = _mm_mul_pd(d,d);
3111             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3112
3113             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3114
3115             /* Evaluate switch function */
3116             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3117             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
3118             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
3119
3120             fscal            = felec;
3121
3122             fscal            = _mm_and_pd(fscal,cutoff_mask);
3123
3124             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3125
3126             /* Calculate temporary vectorial force */
3127             tx               = _mm_mul_pd(fscal,dx31);
3128             ty               = _mm_mul_pd(fscal,dy31);
3129             tz               = _mm_mul_pd(fscal,dz31);
3130
3131             /* Update vectorial force */
3132             fix3             = _mm_add_pd(fix3,tx);
3133             fiy3             = _mm_add_pd(fiy3,ty);
3134             fiz3             = _mm_add_pd(fiz3,tz);
3135
3136             fjx1             = _mm_add_pd(fjx1,tx);
3137             fjy1             = _mm_add_pd(fjy1,ty);
3138             fjz1             = _mm_add_pd(fjz1,tz);
3139
3140             }
3141
3142             /**************************
3143              * CALCULATE INTERACTIONS *
3144              **************************/
3145
3146             if (gmx_mm_any_lt(rsq32,rcutoff2))
3147             {
3148
3149             r32              = _mm_mul_pd(rsq32,rinv32);
3150
3151             /* EWALD ELECTROSTATICS */
3152
3153             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3154             ewrt             = _mm_mul_pd(r32,ewtabscale);
3155             ewitab           = _mm_cvttpd_epi32(ewrt);
3156             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
3157             ewitab           = _mm_slli_epi32(ewitab,2);
3158             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3159             ewtabD           = _mm_setzero_pd();
3160             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3161             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3162             ewtabFn          = _mm_setzero_pd();
3163             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3164             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3165             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3166             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
3167             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
3168
3169             d                = _mm_sub_pd(r32,rswitch);
3170             d                = _mm_max_pd(d,_mm_setzero_pd());
3171             d2               = _mm_mul_pd(d,d);
3172             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3173
3174             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3175
3176             /* Evaluate switch function */
3177             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3178             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
3179             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
3180
3181             fscal            = felec;
3182
3183             fscal            = _mm_and_pd(fscal,cutoff_mask);
3184
3185             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3186
3187             /* Calculate temporary vectorial force */
3188             tx               = _mm_mul_pd(fscal,dx32);
3189             ty               = _mm_mul_pd(fscal,dy32);
3190             tz               = _mm_mul_pd(fscal,dz32);
3191
3192             /* Update vectorial force */
3193             fix3             = _mm_add_pd(fix3,tx);
3194             fiy3             = _mm_add_pd(fiy3,ty);
3195             fiz3             = _mm_add_pd(fiz3,tz);
3196
3197             fjx2             = _mm_add_pd(fjx2,tx);
3198             fjy2             = _mm_add_pd(fjy2,ty);
3199             fjz2             = _mm_add_pd(fjz2,tz);
3200
3201             }
3202
3203             /**************************
3204              * CALCULATE INTERACTIONS *
3205              **************************/
3206
3207             if (gmx_mm_any_lt(rsq33,rcutoff2))
3208             {
3209
3210             r33              = _mm_mul_pd(rsq33,rinv33);
3211
3212             /* EWALD ELECTROSTATICS */
3213
3214             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3215             ewrt             = _mm_mul_pd(r33,ewtabscale);
3216             ewitab           = _mm_cvttpd_epi32(ewrt);
3217             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
3218             ewitab           = _mm_slli_epi32(ewitab,2);
3219             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3220             ewtabD           = _mm_setzero_pd();
3221             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3222             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3223             ewtabFn          = _mm_setzero_pd();
3224             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3225             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3226             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3227             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
3228             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
3229
3230             d                = _mm_sub_pd(r33,rswitch);
3231             d                = _mm_max_pd(d,_mm_setzero_pd());
3232             d2               = _mm_mul_pd(d,d);
3233             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3234
3235             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3236
3237             /* Evaluate switch function */
3238             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3239             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
3240             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
3241
3242             fscal            = felec;
3243
3244             fscal            = _mm_and_pd(fscal,cutoff_mask);
3245
3246             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3247
3248             /* Calculate temporary vectorial force */
3249             tx               = _mm_mul_pd(fscal,dx33);
3250             ty               = _mm_mul_pd(fscal,dy33);
3251             tz               = _mm_mul_pd(fscal,dz33);
3252
3253             /* Update vectorial force */
3254             fix3             = _mm_add_pd(fix3,tx);
3255             fiy3             = _mm_add_pd(fiy3,ty);
3256             fiz3             = _mm_add_pd(fiz3,tz);
3257
3258             fjx3             = _mm_add_pd(fjx3,tx);
3259             fjy3             = _mm_add_pd(fjy3,ty);
3260             fjz3             = _mm_add_pd(fjz3,tz);
3261
3262             }
3263
3264             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3265
3266             /* Inner loop uses 617 flops */
3267         }
3268
3269         /* End of innermost loop */
3270
3271         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3272                                               f+i_coord_offset,fshift+i_shift_offset);
3273
3274         /* Increment number of inner iterations */
3275         inneriter                  += j_index_end - j_index_start;
3276
3277         /* Outer loop uses 24 flops */
3278     }
3279
3280     /* Increment number of outer iterations */
3281     outeriter        += nri;
3282
3283     /* Update outer/inner flops */
3284
3285     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*617);
3286 }