96e56f2781c6bf377175601c62f6fa8d45db74a8
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_double / nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sse4_1_double.c
1 /*
2  * Note: this file was generated by the Gromacs sse4_1_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse4_1_double.h"
34 #include "kernelutil_x86_sse4_1_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_double
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            None
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB;
61     int              j_coord_offsetA,j_coord_offsetB;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset1;
67     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
68     int              vdwioffset2;
69     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
70     int              vdwioffset3;
71     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
72     int              vdwjidx1A,vdwjidx1B;
73     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
74     int              vdwjidx2A,vdwjidx2B;
75     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
76     int              vdwjidx3A,vdwjidx3B;
77     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
78     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
79     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
80     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
81     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
82     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
83     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
84     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
85     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
86     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
87     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
88     real             *charge;
89     __m128i          ewitab;
90     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
91     real             *ewtab;
92     __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
93     real             rswitch_scalar,d_scalar;
94     __m128d          dummy_mask,cutoff_mask;
95     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
96     __m128d          one     = _mm_set1_pd(1.0);
97     __m128d          two     = _mm_set1_pd(2.0);
98     x                = xx[0];
99     f                = ff[0];
100
101     nri              = nlist->nri;
102     iinr             = nlist->iinr;
103     jindex           = nlist->jindex;
104     jjnr             = nlist->jjnr;
105     shiftidx         = nlist->shift;
106     gid              = nlist->gid;
107     shiftvec         = fr->shift_vec[0];
108     fshift           = fr->fshift[0];
109     facel            = _mm_set1_pd(fr->epsfac);
110     charge           = mdatoms->chargeA;
111
112     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
113     ewtab            = fr->ic->tabq_coul_FDV0;
114     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
115     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
116
117     /* Setup water-specific parameters */
118     inr              = nlist->iinr[0];
119     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
120     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
121     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
122
123     jq1              = _mm_set1_pd(charge[inr+1]);
124     jq2              = _mm_set1_pd(charge[inr+2]);
125     jq3              = _mm_set1_pd(charge[inr+3]);
126     qq11             = _mm_mul_pd(iq1,jq1);
127     qq12             = _mm_mul_pd(iq1,jq2);
128     qq13             = _mm_mul_pd(iq1,jq3);
129     qq21             = _mm_mul_pd(iq2,jq1);
130     qq22             = _mm_mul_pd(iq2,jq2);
131     qq23             = _mm_mul_pd(iq2,jq3);
132     qq31             = _mm_mul_pd(iq3,jq1);
133     qq32             = _mm_mul_pd(iq3,jq2);
134     qq33             = _mm_mul_pd(iq3,jq3);
135
136     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
137     rcutoff_scalar   = fr->rcoulomb;
138     rcutoff          = _mm_set1_pd(rcutoff_scalar);
139     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
140
141     rswitch_scalar   = fr->rcoulomb_switch;
142     rswitch          = _mm_set1_pd(rswitch_scalar);
143     /* Setup switch parameters */
144     d_scalar         = rcutoff_scalar-rswitch_scalar;
145     d                = _mm_set1_pd(d_scalar);
146     swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
147     swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
148     swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
149     swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
150     swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
151     swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
152
153     /* Avoid stupid compiler warnings */
154     jnrA = jnrB = 0;
155     j_coord_offsetA = 0;
156     j_coord_offsetB = 0;
157
158     outeriter        = 0;
159     inneriter        = 0;
160
161     /* Start outer loop over neighborlists */
162     for(iidx=0; iidx<nri; iidx++)
163     {
164         /* Load shift vector for this list */
165         i_shift_offset   = DIM*shiftidx[iidx];
166
167         /* Load limits for loop over neighbors */
168         j_index_start    = jindex[iidx];
169         j_index_end      = jindex[iidx+1];
170
171         /* Get outer coordinate index */
172         inr              = iinr[iidx];
173         i_coord_offset   = DIM*inr;
174
175         /* Load i particle coords and add shift vector */
176         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
177                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
178
179         fix1             = _mm_setzero_pd();
180         fiy1             = _mm_setzero_pd();
181         fiz1             = _mm_setzero_pd();
182         fix2             = _mm_setzero_pd();
183         fiy2             = _mm_setzero_pd();
184         fiz2             = _mm_setzero_pd();
185         fix3             = _mm_setzero_pd();
186         fiy3             = _mm_setzero_pd();
187         fiz3             = _mm_setzero_pd();
188
189         /* Reset potential sums */
190         velecsum         = _mm_setzero_pd();
191
192         /* Start inner kernel loop */
193         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
194         {
195
196             /* Get j neighbor index, and coordinate index */
197             jnrA             = jjnr[jidx];
198             jnrB             = jjnr[jidx+1];
199             j_coord_offsetA  = DIM*jnrA;
200             j_coord_offsetB  = DIM*jnrB;
201
202             /* load j atom coordinates */
203             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
204                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
205
206             /* Calculate displacement vector */
207             dx11             = _mm_sub_pd(ix1,jx1);
208             dy11             = _mm_sub_pd(iy1,jy1);
209             dz11             = _mm_sub_pd(iz1,jz1);
210             dx12             = _mm_sub_pd(ix1,jx2);
211             dy12             = _mm_sub_pd(iy1,jy2);
212             dz12             = _mm_sub_pd(iz1,jz2);
213             dx13             = _mm_sub_pd(ix1,jx3);
214             dy13             = _mm_sub_pd(iy1,jy3);
215             dz13             = _mm_sub_pd(iz1,jz3);
216             dx21             = _mm_sub_pd(ix2,jx1);
217             dy21             = _mm_sub_pd(iy2,jy1);
218             dz21             = _mm_sub_pd(iz2,jz1);
219             dx22             = _mm_sub_pd(ix2,jx2);
220             dy22             = _mm_sub_pd(iy2,jy2);
221             dz22             = _mm_sub_pd(iz2,jz2);
222             dx23             = _mm_sub_pd(ix2,jx3);
223             dy23             = _mm_sub_pd(iy2,jy3);
224             dz23             = _mm_sub_pd(iz2,jz3);
225             dx31             = _mm_sub_pd(ix3,jx1);
226             dy31             = _mm_sub_pd(iy3,jy1);
227             dz31             = _mm_sub_pd(iz3,jz1);
228             dx32             = _mm_sub_pd(ix3,jx2);
229             dy32             = _mm_sub_pd(iy3,jy2);
230             dz32             = _mm_sub_pd(iz3,jz2);
231             dx33             = _mm_sub_pd(ix3,jx3);
232             dy33             = _mm_sub_pd(iy3,jy3);
233             dz33             = _mm_sub_pd(iz3,jz3);
234
235             /* Calculate squared distance and things based on it */
236             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
237             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
238             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
239             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
240             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
241             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
242             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
243             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
244             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
245
246             rinv11           = gmx_mm_invsqrt_pd(rsq11);
247             rinv12           = gmx_mm_invsqrt_pd(rsq12);
248             rinv13           = gmx_mm_invsqrt_pd(rsq13);
249             rinv21           = gmx_mm_invsqrt_pd(rsq21);
250             rinv22           = gmx_mm_invsqrt_pd(rsq22);
251             rinv23           = gmx_mm_invsqrt_pd(rsq23);
252             rinv31           = gmx_mm_invsqrt_pd(rsq31);
253             rinv32           = gmx_mm_invsqrt_pd(rsq32);
254             rinv33           = gmx_mm_invsqrt_pd(rsq33);
255
256             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
257             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
258             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
259             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
260             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
261             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
262             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
263             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
264             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
265
266             fjx1             = _mm_setzero_pd();
267             fjy1             = _mm_setzero_pd();
268             fjz1             = _mm_setzero_pd();
269             fjx2             = _mm_setzero_pd();
270             fjy2             = _mm_setzero_pd();
271             fjz2             = _mm_setzero_pd();
272             fjx3             = _mm_setzero_pd();
273             fjy3             = _mm_setzero_pd();
274             fjz3             = _mm_setzero_pd();
275
276             /**************************
277              * CALCULATE INTERACTIONS *
278              **************************/
279
280             if (gmx_mm_any_lt(rsq11,rcutoff2))
281             {
282
283             r11              = _mm_mul_pd(rsq11,rinv11);
284
285             /* EWALD ELECTROSTATICS */
286
287             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
288             ewrt             = _mm_mul_pd(r11,ewtabscale);
289             ewitab           = _mm_cvttpd_epi32(ewrt);
290             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
291             ewitab           = _mm_slli_epi32(ewitab,2);
292             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
293             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
294             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
295             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
296             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
297             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
298             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
299             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
300             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
301             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
302
303             d                = _mm_sub_pd(r11,rswitch);
304             d                = _mm_max_pd(d,_mm_setzero_pd());
305             d2               = _mm_mul_pd(d,d);
306             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
307
308             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
309
310             /* Evaluate switch function */
311             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
312             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
313             velec            = _mm_mul_pd(velec,sw);
314             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
315
316             /* Update potential sum for this i atom from the interaction with this j atom. */
317             velec            = _mm_and_pd(velec,cutoff_mask);
318             velecsum         = _mm_add_pd(velecsum,velec);
319
320             fscal            = felec;
321
322             fscal            = _mm_and_pd(fscal,cutoff_mask);
323
324             /* Calculate temporary vectorial force */
325             tx               = _mm_mul_pd(fscal,dx11);
326             ty               = _mm_mul_pd(fscal,dy11);
327             tz               = _mm_mul_pd(fscal,dz11);
328
329             /* Update vectorial force */
330             fix1             = _mm_add_pd(fix1,tx);
331             fiy1             = _mm_add_pd(fiy1,ty);
332             fiz1             = _mm_add_pd(fiz1,tz);
333
334             fjx1             = _mm_add_pd(fjx1,tx);
335             fjy1             = _mm_add_pd(fjy1,ty);
336             fjz1             = _mm_add_pd(fjz1,tz);
337
338             }
339
340             /**************************
341              * CALCULATE INTERACTIONS *
342              **************************/
343
344             if (gmx_mm_any_lt(rsq12,rcutoff2))
345             {
346
347             r12              = _mm_mul_pd(rsq12,rinv12);
348
349             /* EWALD ELECTROSTATICS */
350
351             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
352             ewrt             = _mm_mul_pd(r12,ewtabscale);
353             ewitab           = _mm_cvttpd_epi32(ewrt);
354             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
355             ewitab           = _mm_slli_epi32(ewitab,2);
356             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
357             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
358             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
359             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
360             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
361             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
362             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
363             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
364             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
365             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
366
367             d                = _mm_sub_pd(r12,rswitch);
368             d                = _mm_max_pd(d,_mm_setzero_pd());
369             d2               = _mm_mul_pd(d,d);
370             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
371
372             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
373
374             /* Evaluate switch function */
375             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
376             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
377             velec            = _mm_mul_pd(velec,sw);
378             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
379
380             /* Update potential sum for this i atom from the interaction with this j atom. */
381             velec            = _mm_and_pd(velec,cutoff_mask);
382             velecsum         = _mm_add_pd(velecsum,velec);
383
384             fscal            = felec;
385
386             fscal            = _mm_and_pd(fscal,cutoff_mask);
387
388             /* Calculate temporary vectorial force */
389             tx               = _mm_mul_pd(fscal,dx12);
390             ty               = _mm_mul_pd(fscal,dy12);
391             tz               = _mm_mul_pd(fscal,dz12);
392
393             /* Update vectorial force */
394             fix1             = _mm_add_pd(fix1,tx);
395             fiy1             = _mm_add_pd(fiy1,ty);
396             fiz1             = _mm_add_pd(fiz1,tz);
397
398             fjx2             = _mm_add_pd(fjx2,tx);
399             fjy2             = _mm_add_pd(fjy2,ty);
400             fjz2             = _mm_add_pd(fjz2,tz);
401
402             }
403
404             /**************************
405              * CALCULATE INTERACTIONS *
406              **************************/
407
408             if (gmx_mm_any_lt(rsq13,rcutoff2))
409             {
410
411             r13              = _mm_mul_pd(rsq13,rinv13);
412
413             /* EWALD ELECTROSTATICS */
414
415             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
416             ewrt             = _mm_mul_pd(r13,ewtabscale);
417             ewitab           = _mm_cvttpd_epi32(ewrt);
418             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
419             ewitab           = _mm_slli_epi32(ewitab,2);
420             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
421             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
422             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
423             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
424             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
425             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
426             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
427             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
428             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
429             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
430
431             d                = _mm_sub_pd(r13,rswitch);
432             d                = _mm_max_pd(d,_mm_setzero_pd());
433             d2               = _mm_mul_pd(d,d);
434             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
435
436             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
437
438             /* Evaluate switch function */
439             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
440             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
441             velec            = _mm_mul_pd(velec,sw);
442             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
443
444             /* Update potential sum for this i atom from the interaction with this j atom. */
445             velec            = _mm_and_pd(velec,cutoff_mask);
446             velecsum         = _mm_add_pd(velecsum,velec);
447
448             fscal            = felec;
449
450             fscal            = _mm_and_pd(fscal,cutoff_mask);
451
452             /* Calculate temporary vectorial force */
453             tx               = _mm_mul_pd(fscal,dx13);
454             ty               = _mm_mul_pd(fscal,dy13);
455             tz               = _mm_mul_pd(fscal,dz13);
456
457             /* Update vectorial force */
458             fix1             = _mm_add_pd(fix1,tx);
459             fiy1             = _mm_add_pd(fiy1,ty);
460             fiz1             = _mm_add_pd(fiz1,tz);
461
462             fjx3             = _mm_add_pd(fjx3,tx);
463             fjy3             = _mm_add_pd(fjy3,ty);
464             fjz3             = _mm_add_pd(fjz3,tz);
465
466             }
467
468             /**************************
469              * CALCULATE INTERACTIONS *
470              **************************/
471
472             if (gmx_mm_any_lt(rsq21,rcutoff2))
473             {
474
475             r21              = _mm_mul_pd(rsq21,rinv21);
476
477             /* EWALD ELECTROSTATICS */
478
479             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
480             ewrt             = _mm_mul_pd(r21,ewtabscale);
481             ewitab           = _mm_cvttpd_epi32(ewrt);
482             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
483             ewitab           = _mm_slli_epi32(ewitab,2);
484             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
485             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
486             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
487             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
488             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
489             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
490             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
491             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
492             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
493             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
494
495             d                = _mm_sub_pd(r21,rswitch);
496             d                = _mm_max_pd(d,_mm_setzero_pd());
497             d2               = _mm_mul_pd(d,d);
498             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
499
500             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
501
502             /* Evaluate switch function */
503             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
504             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
505             velec            = _mm_mul_pd(velec,sw);
506             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
507
508             /* Update potential sum for this i atom from the interaction with this j atom. */
509             velec            = _mm_and_pd(velec,cutoff_mask);
510             velecsum         = _mm_add_pd(velecsum,velec);
511
512             fscal            = felec;
513
514             fscal            = _mm_and_pd(fscal,cutoff_mask);
515
516             /* Calculate temporary vectorial force */
517             tx               = _mm_mul_pd(fscal,dx21);
518             ty               = _mm_mul_pd(fscal,dy21);
519             tz               = _mm_mul_pd(fscal,dz21);
520
521             /* Update vectorial force */
522             fix2             = _mm_add_pd(fix2,tx);
523             fiy2             = _mm_add_pd(fiy2,ty);
524             fiz2             = _mm_add_pd(fiz2,tz);
525
526             fjx1             = _mm_add_pd(fjx1,tx);
527             fjy1             = _mm_add_pd(fjy1,ty);
528             fjz1             = _mm_add_pd(fjz1,tz);
529
530             }
531
532             /**************************
533              * CALCULATE INTERACTIONS *
534              **************************/
535
536             if (gmx_mm_any_lt(rsq22,rcutoff2))
537             {
538
539             r22              = _mm_mul_pd(rsq22,rinv22);
540
541             /* EWALD ELECTROSTATICS */
542
543             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
544             ewrt             = _mm_mul_pd(r22,ewtabscale);
545             ewitab           = _mm_cvttpd_epi32(ewrt);
546             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
547             ewitab           = _mm_slli_epi32(ewitab,2);
548             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
549             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
550             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
551             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
552             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
553             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
554             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
555             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
556             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
557             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
558
559             d                = _mm_sub_pd(r22,rswitch);
560             d                = _mm_max_pd(d,_mm_setzero_pd());
561             d2               = _mm_mul_pd(d,d);
562             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
563
564             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
565
566             /* Evaluate switch function */
567             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
568             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
569             velec            = _mm_mul_pd(velec,sw);
570             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
571
572             /* Update potential sum for this i atom from the interaction with this j atom. */
573             velec            = _mm_and_pd(velec,cutoff_mask);
574             velecsum         = _mm_add_pd(velecsum,velec);
575
576             fscal            = felec;
577
578             fscal            = _mm_and_pd(fscal,cutoff_mask);
579
580             /* Calculate temporary vectorial force */
581             tx               = _mm_mul_pd(fscal,dx22);
582             ty               = _mm_mul_pd(fscal,dy22);
583             tz               = _mm_mul_pd(fscal,dz22);
584
585             /* Update vectorial force */
586             fix2             = _mm_add_pd(fix2,tx);
587             fiy2             = _mm_add_pd(fiy2,ty);
588             fiz2             = _mm_add_pd(fiz2,tz);
589
590             fjx2             = _mm_add_pd(fjx2,tx);
591             fjy2             = _mm_add_pd(fjy2,ty);
592             fjz2             = _mm_add_pd(fjz2,tz);
593
594             }
595
596             /**************************
597              * CALCULATE INTERACTIONS *
598              **************************/
599
600             if (gmx_mm_any_lt(rsq23,rcutoff2))
601             {
602
603             r23              = _mm_mul_pd(rsq23,rinv23);
604
605             /* EWALD ELECTROSTATICS */
606
607             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
608             ewrt             = _mm_mul_pd(r23,ewtabscale);
609             ewitab           = _mm_cvttpd_epi32(ewrt);
610             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
611             ewitab           = _mm_slli_epi32(ewitab,2);
612             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
613             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
614             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
615             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
616             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
617             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
618             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
619             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
620             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
621             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
622
623             d                = _mm_sub_pd(r23,rswitch);
624             d                = _mm_max_pd(d,_mm_setzero_pd());
625             d2               = _mm_mul_pd(d,d);
626             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
627
628             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
629
630             /* Evaluate switch function */
631             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
632             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
633             velec            = _mm_mul_pd(velec,sw);
634             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
635
636             /* Update potential sum for this i atom from the interaction with this j atom. */
637             velec            = _mm_and_pd(velec,cutoff_mask);
638             velecsum         = _mm_add_pd(velecsum,velec);
639
640             fscal            = felec;
641
642             fscal            = _mm_and_pd(fscal,cutoff_mask);
643
644             /* Calculate temporary vectorial force */
645             tx               = _mm_mul_pd(fscal,dx23);
646             ty               = _mm_mul_pd(fscal,dy23);
647             tz               = _mm_mul_pd(fscal,dz23);
648
649             /* Update vectorial force */
650             fix2             = _mm_add_pd(fix2,tx);
651             fiy2             = _mm_add_pd(fiy2,ty);
652             fiz2             = _mm_add_pd(fiz2,tz);
653
654             fjx3             = _mm_add_pd(fjx3,tx);
655             fjy3             = _mm_add_pd(fjy3,ty);
656             fjz3             = _mm_add_pd(fjz3,tz);
657
658             }
659
660             /**************************
661              * CALCULATE INTERACTIONS *
662              **************************/
663
664             if (gmx_mm_any_lt(rsq31,rcutoff2))
665             {
666
667             r31              = _mm_mul_pd(rsq31,rinv31);
668
669             /* EWALD ELECTROSTATICS */
670
671             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
672             ewrt             = _mm_mul_pd(r31,ewtabscale);
673             ewitab           = _mm_cvttpd_epi32(ewrt);
674             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
675             ewitab           = _mm_slli_epi32(ewitab,2);
676             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
677             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
678             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
679             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
680             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
681             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
682             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
683             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
684             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
685             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
686
687             d                = _mm_sub_pd(r31,rswitch);
688             d                = _mm_max_pd(d,_mm_setzero_pd());
689             d2               = _mm_mul_pd(d,d);
690             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
691
692             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
693
694             /* Evaluate switch function */
695             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
696             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
697             velec            = _mm_mul_pd(velec,sw);
698             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
699
700             /* Update potential sum for this i atom from the interaction with this j atom. */
701             velec            = _mm_and_pd(velec,cutoff_mask);
702             velecsum         = _mm_add_pd(velecsum,velec);
703
704             fscal            = felec;
705
706             fscal            = _mm_and_pd(fscal,cutoff_mask);
707
708             /* Calculate temporary vectorial force */
709             tx               = _mm_mul_pd(fscal,dx31);
710             ty               = _mm_mul_pd(fscal,dy31);
711             tz               = _mm_mul_pd(fscal,dz31);
712
713             /* Update vectorial force */
714             fix3             = _mm_add_pd(fix3,tx);
715             fiy3             = _mm_add_pd(fiy3,ty);
716             fiz3             = _mm_add_pd(fiz3,tz);
717
718             fjx1             = _mm_add_pd(fjx1,tx);
719             fjy1             = _mm_add_pd(fjy1,ty);
720             fjz1             = _mm_add_pd(fjz1,tz);
721
722             }
723
724             /**************************
725              * CALCULATE INTERACTIONS *
726              **************************/
727
728             if (gmx_mm_any_lt(rsq32,rcutoff2))
729             {
730
731             r32              = _mm_mul_pd(rsq32,rinv32);
732
733             /* EWALD ELECTROSTATICS */
734
735             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
736             ewrt             = _mm_mul_pd(r32,ewtabscale);
737             ewitab           = _mm_cvttpd_epi32(ewrt);
738             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
739             ewitab           = _mm_slli_epi32(ewitab,2);
740             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
741             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
742             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
743             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
744             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
745             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
746             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
747             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
748             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
749             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
750
751             d                = _mm_sub_pd(r32,rswitch);
752             d                = _mm_max_pd(d,_mm_setzero_pd());
753             d2               = _mm_mul_pd(d,d);
754             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
755
756             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
757
758             /* Evaluate switch function */
759             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
760             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
761             velec            = _mm_mul_pd(velec,sw);
762             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
763
764             /* Update potential sum for this i atom from the interaction with this j atom. */
765             velec            = _mm_and_pd(velec,cutoff_mask);
766             velecsum         = _mm_add_pd(velecsum,velec);
767
768             fscal            = felec;
769
770             fscal            = _mm_and_pd(fscal,cutoff_mask);
771
772             /* Calculate temporary vectorial force */
773             tx               = _mm_mul_pd(fscal,dx32);
774             ty               = _mm_mul_pd(fscal,dy32);
775             tz               = _mm_mul_pd(fscal,dz32);
776
777             /* Update vectorial force */
778             fix3             = _mm_add_pd(fix3,tx);
779             fiy3             = _mm_add_pd(fiy3,ty);
780             fiz3             = _mm_add_pd(fiz3,tz);
781
782             fjx2             = _mm_add_pd(fjx2,tx);
783             fjy2             = _mm_add_pd(fjy2,ty);
784             fjz2             = _mm_add_pd(fjz2,tz);
785
786             }
787
788             /**************************
789              * CALCULATE INTERACTIONS *
790              **************************/
791
792             if (gmx_mm_any_lt(rsq33,rcutoff2))
793             {
794
795             r33              = _mm_mul_pd(rsq33,rinv33);
796
797             /* EWALD ELECTROSTATICS */
798
799             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
800             ewrt             = _mm_mul_pd(r33,ewtabscale);
801             ewitab           = _mm_cvttpd_epi32(ewrt);
802             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
803             ewitab           = _mm_slli_epi32(ewitab,2);
804             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
805             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
806             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
807             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
808             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
809             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
810             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
811             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
812             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
813             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
814
815             d                = _mm_sub_pd(r33,rswitch);
816             d                = _mm_max_pd(d,_mm_setzero_pd());
817             d2               = _mm_mul_pd(d,d);
818             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
819
820             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
821
822             /* Evaluate switch function */
823             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
824             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
825             velec            = _mm_mul_pd(velec,sw);
826             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
827
828             /* Update potential sum for this i atom from the interaction with this j atom. */
829             velec            = _mm_and_pd(velec,cutoff_mask);
830             velecsum         = _mm_add_pd(velecsum,velec);
831
832             fscal            = felec;
833
834             fscal            = _mm_and_pd(fscal,cutoff_mask);
835
836             /* Calculate temporary vectorial force */
837             tx               = _mm_mul_pd(fscal,dx33);
838             ty               = _mm_mul_pd(fscal,dy33);
839             tz               = _mm_mul_pd(fscal,dz33);
840
841             /* Update vectorial force */
842             fix3             = _mm_add_pd(fix3,tx);
843             fiy3             = _mm_add_pd(fiy3,ty);
844             fiz3             = _mm_add_pd(fiz3,tz);
845
846             fjx3             = _mm_add_pd(fjx3,tx);
847             fjy3             = _mm_add_pd(fjy3,ty);
848             fjz3             = _mm_add_pd(fjz3,tz);
849
850             }
851
852             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
853
854             /* Inner loop uses 585 flops */
855         }
856
857         if(jidx<j_index_end)
858         {
859
860             jnrA             = jjnr[jidx];
861             j_coord_offsetA  = DIM*jnrA;
862
863             /* load j atom coordinates */
864             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
865                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
866
867             /* Calculate displacement vector */
868             dx11             = _mm_sub_pd(ix1,jx1);
869             dy11             = _mm_sub_pd(iy1,jy1);
870             dz11             = _mm_sub_pd(iz1,jz1);
871             dx12             = _mm_sub_pd(ix1,jx2);
872             dy12             = _mm_sub_pd(iy1,jy2);
873             dz12             = _mm_sub_pd(iz1,jz2);
874             dx13             = _mm_sub_pd(ix1,jx3);
875             dy13             = _mm_sub_pd(iy1,jy3);
876             dz13             = _mm_sub_pd(iz1,jz3);
877             dx21             = _mm_sub_pd(ix2,jx1);
878             dy21             = _mm_sub_pd(iy2,jy1);
879             dz21             = _mm_sub_pd(iz2,jz1);
880             dx22             = _mm_sub_pd(ix2,jx2);
881             dy22             = _mm_sub_pd(iy2,jy2);
882             dz22             = _mm_sub_pd(iz2,jz2);
883             dx23             = _mm_sub_pd(ix2,jx3);
884             dy23             = _mm_sub_pd(iy2,jy3);
885             dz23             = _mm_sub_pd(iz2,jz3);
886             dx31             = _mm_sub_pd(ix3,jx1);
887             dy31             = _mm_sub_pd(iy3,jy1);
888             dz31             = _mm_sub_pd(iz3,jz1);
889             dx32             = _mm_sub_pd(ix3,jx2);
890             dy32             = _mm_sub_pd(iy3,jy2);
891             dz32             = _mm_sub_pd(iz3,jz2);
892             dx33             = _mm_sub_pd(ix3,jx3);
893             dy33             = _mm_sub_pd(iy3,jy3);
894             dz33             = _mm_sub_pd(iz3,jz3);
895
896             /* Calculate squared distance and things based on it */
897             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
898             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
899             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
900             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
901             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
902             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
903             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
904             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
905             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
906
907             rinv11           = gmx_mm_invsqrt_pd(rsq11);
908             rinv12           = gmx_mm_invsqrt_pd(rsq12);
909             rinv13           = gmx_mm_invsqrt_pd(rsq13);
910             rinv21           = gmx_mm_invsqrt_pd(rsq21);
911             rinv22           = gmx_mm_invsqrt_pd(rsq22);
912             rinv23           = gmx_mm_invsqrt_pd(rsq23);
913             rinv31           = gmx_mm_invsqrt_pd(rsq31);
914             rinv32           = gmx_mm_invsqrt_pd(rsq32);
915             rinv33           = gmx_mm_invsqrt_pd(rsq33);
916
917             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
918             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
919             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
920             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
921             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
922             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
923             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
924             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
925             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
926
927             fjx1             = _mm_setzero_pd();
928             fjy1             = _mm_setzero_pd();
929             fjz1             = _mm_setzero_pd();
930             fjx2             = _mm_setzero_pd();
931             fjy2             = _mm_setzero_pd();
932             fjz2             = _mm_setzero_pd();
933             fjx3             = _mm_setzero_pd();
934             fjy3             = _mm_setzero_pd();
935             fjz3             = _mm_setzero_pd();
936
937             /**************************
938              * CALCULATE INTERACTIONS *
939              **************************/
940
941             if (gmx_mm_any_lt(rsq11,rcutoff2))
942             {
943
944             r11              = _mm_mul_pd(rsq11,rinv11);
945
946             /* EWALD ELECTROSTATICS */
947
948             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
949             ewrt             = _mm_mul_pd(r11,ewtabscale);
950             ewitab           = _mm_cvttpd_epi32(ewrt);
951             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
952             ewitab           = _mm_slli_epi32(ewitab,2);
953             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
954             ewtabD           = _mm_setzero_pd();
955             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
956             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
957             ewtabFn          = _mm_setzero_pd();
958             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
959             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
960             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
961             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
962             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
963
964             d                = _mm_sub_pd(r11,rswitch);
965             d                = _mm_max_pd(d,_mm_setzero_pd());
966             d2               = _mm_mul_pd(d,d);
967             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
968
969             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
970
971             /* Evaluate switch function */
972             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
973             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
974             velec            = _mm_mul_pd(velec,sw);
975             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
976
977             /* Update potential sum for this i atom from the interaction with this j atom. */
978             velec            = _mm_and_pd(velec,cutoff_mask);
979             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
980             velecsum         = _mm_add_pd(velecsum,velec);
981
982             fscal            = felec;
983
984             fscal            = _mm_and_pd(fscal,cutoff_mask);
985
986             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
987
988             /* Calculate temporary vectorial force */
989             tx               = _mm_mul_pd(fscal,dx11);
990             ty               = _mm_mul_pd(fscal,dy11);
991             tz               = _mm_mul_pd(fscal,dz11);
992
993             /* Update vectorial force */
994             fix1             = _mm_add_pd(fix1,tx);
995             fiy1             = _mm_add_pd(fiy1,ty);
996             fiz1             = _mm_add_pd(fiz1,tz);
997
998             fjx1             = _mm_add_pd(fjx1,tx);
999             fjy1             = _mm_add_pd(fjy1,ty);
1000             fjz1             = _mm_add_pd(fjz1,tz);
1001
1002             }
1003
1004             /**************************
1005              * CALCULATE INTERACTIONS *
1006              **************************/
1007
1008             if (gmx_mm_any_lt(rsq12,rcutoff2))
1009             {
1010
1011             r12              = _mm_mul_pd(rsq12,rinv12);
1012
1013             /* EWALD ELECTROSTATICS */
1014
1015             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1016             ewrt             = _mm_mul_pd(r12,ewtabscale);
1017             ewitab           = _mm_cvttpd_epi32(ewrt);
1018             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1019             ewitab           = _mm_slli_epi32(ewitab,2);
1020             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1021             ewtabD           = _mm_setzero_pd();
1022             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1023             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1024             ewtabFn          = _mm_setzero_pd();
1025             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1026             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1027             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1028             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1029             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1030
1031             d                = _mm_sub_pd(r12,rswitch);
1032             d                = _mm_max_pd(d,_mm_setzero_pd());
1033             d2               = _mm_mul_pd(d,d);
1034             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1035
1036             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1037
1038             /* Evaluate switch function */
1039             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1040             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1041             velec            = _mm_mul_pd(velec,sw);
1042             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1043
1044             /* Update potential sum for this i atom from the interaction with this j atom. */
1045             velec            = _mm_and_pd(velec,cutoff_mask);
1046             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1047             velecsum         = _mm_add_pd(velecsum,velec);
1048
1049             fscal            = felec;
1050
1051             fscal            = _mm_and_pd(fscal,cutoff_mask);
1052
1053             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1054
1055             /* Calculate temporary vectorial force */
1056             tx               = _mm_mul_pd(fscal,dx12);
1057             ty               = _mm_mul_pd(fscal,dy12);
1058             tz               = _mm_mul_pd(fscal,dz12);
1059
1060             /* Update vectorial force */
1061             fix1             = _mm_add_pd(fix1,tx);
1062             fiy1             = _mm_add_pd(fiy1,ty);
1063             fiz1             = _mm_add_pd(fiz1,tz);
1064
1065             fjx2             = _mm_add_pd(fjx2,tx);
1066             fjy2             = _mm_add_pd(fjy2,ty);
1067             fjz2             = _mm_add_pd(fjz2,tz);
1068
1069             }
1070
1071             /**************************
1072              * CALCULATE INTERACTIONS *
1073              **************************/
1074
1075             if (gmx_mm_any_lt(rsq13,rcutoff2))
1076             {
1077
1078             r13              = _mm_mul_pd(rsq13,rinv13);
1079
1080             /* EWALD ELECTROSTATICS */
1081
1082             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1083             ewrt             = _mm_mul_pd(r13,ewtabscale);
1084             ewitab           = _mm_cvttpd_epi32(ewrt);
1085             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1086             ewitab           = _mm_slli_epi32(ewitab,2);
1087             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1088             ewtabD           = _mm_setzero_pd();
1089             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1090             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1091             ewtabFn          = _mm_setzero_pd();
1092             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1093             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1094             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1095             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1096             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1097
1098             d                = _mm_sub_pd(r13,rswitch);
1099             d                = _mm_max_pd(d,_mm_setzero_pd());
1100             d2               = _mm_mul_pd(d,d);
1101             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1102
1103             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1104
1105             /* Evaluate switch function */
1106             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1107             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
1108             velec            = _mm_mul_pd(velec,sw);
1109             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
1110
1111             /* Update potential sum for this i atom from the interaction with this j atom. */
1112             velec            = _mm_and_pd(velec,cutoff_mask);
1113             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1114             velecsum         = _mm_add_pd(velecsum,velec);
1115
1116             fscal            = felec;
1117
1118             fscal            = _mm_and_pd(fscal,cutoff_mask);
1119
1120             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1121
1122             /* Calculate temporary vectorial force */
1123             tx               = _mm_mul_pd(fscal,dx13);
1124             ty               = _mm_mul_pd(fscal,dy13);
1125             tz               = _mm_mul_pd(fscal,dz13);
1126
1127             /* Update vectorial force */
1128             fix1             = _mm_add_pd(fix1,tx);
1129             fiy1             = _mm_add_pd(fiy1,ty);
1130             fiz1             = _mm_add_pd(fiz1,tz);
1131
1132             fjx3             = _mm_add_pd(fjx3,tx);
1133             fjy3             = _mm_add_pd(fjy3,ty);
1134             fjz3             = _mm_add_pd(fjz3,tz);
1135
1136             }
1137
1138             /**************************
1139              * CALCULATE INTERACTIONS *
1140              **************************/
1141
1142             if (gmx_mm_any_lt(rsq21,rcutoff2))
1143             {
1144
1145             r21              = _mm_mul_pd(rsq21,rinv21);
1146
1147             /* EWALD ELECTROSTATICS */
1148
1149             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1150             ewrt             = _mm_mul_pd(r21,ewtabscale);
1151             ewitab           = _mm_cvttpd_epi32(ewrt);
1152             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1153             ewitab           = _mm_slli_epi32(ewitab,2);
1154             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1155             ewtabD           = _mm_setzero_pd();
1156             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1157             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1158             ewtabFn          = _mm_setzero_pd();
1159             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1160             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1161             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1162             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1163             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1164
1165             d                = _mm_sub_pd(r21,rswitch);
1166             d                = _mm_max_pd(d,_mm_setzero_pd());
1167             d2               = _mm_mul_pd(d,d);
1168             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1169
1170             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1171
1172             /* Evaluate switch function */
1173             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1174             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1175             velec            = _mm_mul_pd(velec,sw);
1176             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1177
1178             /* Update potential sum for this i atom from the interaction with this j atom. */
1179             velec            = _mm_and_pd(velec,cutoff_mask);
1180             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1181             velecsum         = _mm_add_pd(velecsum,velec);
1182
1183             fscal            = felec;
1184
1185             fscal            = _mm_and_pd(fscal,cutoff_mask);
1186
1187             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1188
1189             /* Calculate temporary vectorial force */
1190             tx               = _mm_mul_pd(fscal,dx21);
1191             ty               = _mm_mul_pd(fscal,dy21);
1192             tz               = _mm_mul_pd(fscal,dz21);
1193
1194             /* Update vectorial force */
1195             fix2             = _mm_add_pd(fix2,tx);
1196             fiy2             = _mm_add_pd(fiy2,ty);
1197             fiz2             = _mm_add_pd(fiz2,tz);
1198
1199             fjx1             = _mm_add_pd(fjx1,tx);
1200             fjy1             = _mm_add_pd(fjy1,ty);
1201             fjz1             = _mm_add_pd(fjz1,tz);
1202
1203             }
1204
1205             /**************************
1206              * CALCULATE INTERACTIONS *
1207              **************************/
1208
1209             if (gmx_mm_any_lt(rsq22,rcutoff2))
1210             {
1211
1212             r22              = _mm_mul_pd(rsq22,rinv22);
1213
1214             /* EWALD ELECTROSTATICS */
1215
1216             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1217             ewrt             = _mm_mul_pd(r22,ewtabscale);
1218             ewitab           = _mm_cvttpd_epi32(ewrt);
1219             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1220             ewitab           = _mm_slli_epi32(ewitab,2);
1221             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1222             ewtabD           = _mm_setzero_pd();
1223             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1224             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1225             ewtabFn          = _mm_setzero_pd();
1226             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1227             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1228             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1229             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1230             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1231
1232             d                = _mm_sub_pd(r22,rswitch);
1233             d                = _mm_max_pd(d,_mm_setzero_pd());
1234             d2               = _mm_mul_pd(d,d);
1235             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1236
1237             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1238
1239             /* Evaluate switch function */
1240             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1241             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1242             velec            = _mm_mul_pd(velec,sw);
1243             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1244
1245             /* Update potential sum for this i atom from the interaction with this j atom. */
1246             velec            = _mm_and_pd(velec,cutoff_mask);
1247             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1248             velecsum         = _mm_add_pd(velecsum,velec);
1249
1250             fscal            = felec;
1251
1252             fscal            = _mm_and_pd(fscal,cutoff_mask);
1253
1254             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1255
1256             /* Calculate temporary vectorial force */
1257             tx               = _mm_mul_pd(fscal,dx22);
1258             ty               = _mm_mul_pd(fscal,dy22);
1259             tz               = _mm_mul_pd(fscal,dz22);
1260
1261             /* Update vectorial force */
1262             fix2             = _mm_add_pd(fix2,tx);
1263             fiy2             = _mm_add_pd(fiy2,ty);
1264             fiz2             = _mm_add_pd(fiz2,tz);
1265
1266             fjx2             = _mm_add_pd(fjx2,tx);
1267             fjy2             = _mm_add_pd(fjy2,ty);
1268             fjz2             = _mm_add_pd(fjz2,tz);
1269
1270             }
1271
1272             /**************************
1273              * CALCULATE INTERACTIONS *
1274              **************************/
1275
1276             if (gmx_mm_any_lt(rsq23,rcutoff2))
1277             {
1278
1279             r23              = _mm_mul_pd(rsq23,rinv23);
1280
1281             /* EWALD ELECTROSTATICS */
1282
1283             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1284             ewrt             = _mm_mul_pd(r23,ewtabscale);
1285             ewitab           = _mm_cvttpd_epi32(ewrt);
1286             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1287             ewitab           = _mm_slli_epi32(ewitab,2);
1288             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1289             ewtabD           = _mm_setzero_pd();
1290             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1291             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1292             ewtabFn          = _mm_setzero_pd();
1293             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1294             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1295             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1296             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1297             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1298
1299             d                = _mm_sub_pd(r23,rswitch);
1300             d                = _mm_max_pd(d,_mm_setzero_pd());
1301             d2               = _mm_mul_pd(d,d);
1302             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1303
1304             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1305
1306             /* Evaluate switch function */
1307             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1308             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
1309             velec            = _mm_mul_pd(velec,sw);
1310             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
1311
1312             /* Update potential sum for this i atom from the interaction with this j atom. */
1313             velec            = _mm_and_pd(velec,cutoff_mask);
1314             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1315             velecsum         = _mm_add_pd(velecsum,velec);
1316
1317             fscal            = felec;
1318
1319             fscal            = _mm_and_pd(fscal,cutoff_mask);
1320
1321             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1322
1323             /* Calculate temporary vectorial force */
1324             tx               = _mm_mul_pd(fscal,dx23);
1325             ty               = _mm_mul_pd(fscal,dy23);
1326             tz               = _mm_mul_pd(fscal,dz23);
1327
1328             /* Update vectorial force */
1329             fix2             = _mm_add_pd(fix2,tx);
1330             fiy2             = _mm_add_pd(fiy2,ty);
1331             fiz2             = _mm_add_pd(fiz2,tz);
1332
1333             fjx3             = _mm_add_pd(fjx3,tx);
1334             fjy3             = _mm_add_pd(fjy3,ty);
1335             fjz3             = _mm_add_pd(fjz3,tz);
1336
1337             }
1338
1339             /**************************
1340              * CALCULATE INTERACTIONS *
1341              **************************/
1342
1343             if (gmx_mm_any_lt(rsq31,rcutoff2))
1344             {
1345
1346             r31              = _mm_mul_pd(rsq31,rinv31);
1347
1348             /* EWALD ELECTROSTATICS */
1349
1350             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1351             ewrt             = _mm_mul_pd(r31,ewtabscale);
1352             ewitab           = _mm_cvttpd_epi32(ewrt);
1353             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1354             ewitab           = _mm_slli_epi32(ewitab,2);
1355             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1356             ewtabD           = _mm_setzero_pd();
1357             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1358             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1359             ewtabFn          = _mm_setzero_pd();
1360             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1361             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1362             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1363             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1364             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1365
1366             d                = _mm_sub_pd(r31,rswitch);
1367             d                = _mm_max_pd(d,_mm_setzero_pd());
1368             d2               = _mm_mul_pd(d,d);
1369             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1370
1371             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1372
1373             /* Evaluate switch function */
1374             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1375             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
1376             velec            = _mm_mul_pd(velec,sw);
1377             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
1378
1379             /* Update potential sum for this i atom from the interaction with this j atom. */
1380             velec            = _mm_and_pd(velec,cutoff_mask);
1381             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1382             velecsum         = _mm_add_pd(velecsum,velec);
1383
1384             fscal            = felec;
1385
1386             fscal            = _mm_and_pd(fscal,cutoff_mask);
1387
1388             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1389
1390             /* Calculate temporary vectorial force */
1391             tx               = _mm_mul_pd(fscal,dx31);
1392             ty               = _mm_mul_pd(fscal,dy31);
1393             tz               = _mm_mul_pd(fscal,dz31);
1394
1395             /* Update vectorial force */
1396             fix3             = _mm_add_pd(fix3,tx);
1397             fiy3             = _mm_add_pd(fiy3,ty);
1398             fiz3             = _mm_add_pd(fiz3,tz);
1399
1400             fjx1             = _mm_add_pd(fjx1,tx);
1401             fjy1             = _mm_add_pd(fjy1,ty);
1402             fjz1             = _mm_add_pd(fjz1,tz);
1403
1404             }
1405
1406             /**************************
1407              * CALCULATE INTERACTIONS *
1408              **************************/
1409
1410             if (gmx_mm_any_lt(rsq32,rcutoff2))
1411             {
1412
1413             r32              = _mm_mul_pd(rsq32,rinv32);
1414
1415             /* EWALD ELECTROSTATICS */
1416
1417             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1418             ewrt             = _mm_mul_pd(r32,ewtabscale);
1419             ewitab           = _mm_cvttpd_epi32(ewrt);
1420             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1421             ewitab           = _mm_slli_epi32(ewitab,2);
1422             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1423             ewtabD           = _mm_setzero_pd();
1424             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1425             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1426             ewtabFn          = _mm_setzero_pd();
1427             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1428             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1429             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1430             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1431             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1432
1433             d                = _mm_sub_pd(r32,rswitch);
1434             d                = _mm_max_pd(d,_mm_setzero_pd());
1435             d2               = _mm_mul_pd(d,d);
1436             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1437
1438             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1439
1440             /* Evaluate switch function */
1441             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1442             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
1443             velec            = _mm_mul_pd(velec,sw);
1444             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
1445
1446             /* Update potential sum for this i atom from the interaction with this j atom. */
1447             velec            = _mm_and_pd(velec,cutoff_mask);
1448             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1449             velecsum         = _mm_add_pd(velecsum,velec);
1450
1451             fscal            = felec;
1452
1453             fscal            = _mm_and_pd(fscal,cutoff_mask);
1454
1455             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1456
1457             /* Calculate temporary vectorial force */
1458             tx               = _mm_mul_pd(fscal,dx32);
1459             ty               = _mm_mul_pd(fscal,dy32);
1460             tz               = _mm_mul_pd(fscal,dz32);
1461
1462             /* Update vectorial force */
1463             fix3             = _mm_add_pd(fix3,tx);
1464             fiy3             = _mm_add_pd(fiy3,ty);
1465             fiz3             = _mm_add_pd(fiz3,tz);
1466
1467             fjx2             = _mm_add_pd(fjx2,tx);
1468             fjy2             = _mm_add_pd(fjy2,ty);
1469             fjz2             = _mm_add_pd(fjz2,tz);
1470
1471             }
1472
1473             /**************************
1474              * CALCULATE INTERACTIONS *
1475              **************************/
1476
1477             if (gmx_mm_any_lt(rsq33,rcutoff2))
1478             {
1479
1480             r33              = _mm_mul_pd(rsq33,rinv33);
1481
1482             /* EWALD ELECTROSTATICS */
1483
1484             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1485             ewrt             = _mm_mul_pd(r33,ewtabscale);
1486             ewitab           = _mm_cvttpd_epi32(ewrt);
1487             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1488             ewitab           = _mm_slli_epi32(ewitab,2);
1489             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1490             ewtabD           = _mm_setzero_pd();
1491             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1492             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1493             ewtabFn          = _mm_setzero_pd();
1494             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1495             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1496             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1497             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1498             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1499
1500             d                = _mm_sub_pd(r33,rswitch);
1501             d                = _mm_max_pd(d,_mm_setzero_pd());
1502             d2               = _mm_mul_pd(d,d);
1503             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1504
1505             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1506
1507             /* Evaluate switch function */
1508             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1509             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
1510             velec            = _mm_mul_pd(velec,sw);
1511             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
1512
1513             /* Update potential sum for this i atom from the interaction with this j atom. */
1514             velec            = _mm_and_pd(velec,cutoff_mask);
1515             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1516             velecsum         = _mm_add_pd(velecsum,velec);
1517
1518             fscal            = felec;
1519
1520             fscal            = _mm_and_pd(fscal,cutoff_mask);
1521
1522             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1523
1524             /* Calculate temporary vectorial force */
1525             tx               = _mm_mul_pd(fscal,dx33);
1526             ty               = _mm_mul_pd(fscal,dy33);
1527             tz               = _mm_mul_pd(fscal,dz33);
1528
1529             /* Update vectorial force */
1530             fix3             = _mm_add_pd(fix3,tx);
1531             fiy3             = _mm_add_pd(fiy3,ty);
1532             fiz3             = _mm_add_pd(fiz3,tz);
1533
1534             fjx3             = _mm_add_pd(fjx3,tx);
1535             fjy3             = _mm_add_pd(fjy3,ty);
1536             fjz3             = _mm_add_pd(fjz3,tz);
1537
1538             }
1539
1540             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1541
1542             /* Inner loop uses 585 flops */
1543         }
1544
1545         /* End of innermost loop */
1546
1547         gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1548                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1549
1550         ggid                        = gid[iidx];
1551         /* Update potential energies */
1552         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1553
1554         /* Increment number of inner iterations */
1555         inneriter                  += j_index_end - j_index_start;
1556
1557         /* Outer loop uses 19 flops */
1558     }
1559
1560     /* Increment number of outer iterations */
1561     outeriter        += nri;
1562
1563     /* Update outer/inner flops */
1564
1565     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*585);
1566 }
1567 /*
1568  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_double
1569  * Electrostatics interaction: Ewald
1570  * VdW interaction:            None
1571  * Geometry:                   Water4-Water4
1572  * Calculate force/pot:        Force
1573  */
1574 void
1575 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_double
1576                     (t_nblist * gmx_restrict                nlist,
1577                      rvec * gmx_restrict                    xx,
1578                      rvec * gmx_restrict                    ff,
1579                      t_forcerec * gmx_restrict              fr,
1580                      t_mdatoms * gmx_restrict               mdatoms,
1581                      nb_kernel_data_t * gmx_restrict        kernel_data,
1582                      t_nrnb * gmx_restrict                  nrnb)
1583 {
1584     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1585      * just 0 for non-waters.
1586      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1587      * jnr indices corresponding to data put in the four positions in the SIMD register.
1588      */
1589     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1590     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1591     int              jnrA,jnrB;
1592     int              j_coord_offsetA,j_coord_offsetB;
1593     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1594     real             rcutoff_scalar;
1595     real             *shiftvec,*fshift,*x,*f;
1596     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1597     int              vdwioffset1;
1598     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1599     int              vdwioffset2;
1600     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1601     int              vdwioffset3;
1602     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1603     int              vdwjidx1A,vdwjidx1B;
1604     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1605     int              vdwjidx2A,vdwjidx2B;
1606     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1607     int              vdwjidx3A,vdwjidx3B;
1608     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1609     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1610     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1611     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1612     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1613     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1614     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1615     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1616     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1617     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1618     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1619     real             *charge;
1620     __m128i          ewitab;
1621     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1622     real             *ewtab;
1623     __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1624     real             rswitch_scalar,d_scalar;
1625     __m128d          dummy_mask,cutoff_mask;
1626     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1627     __m128d          one     = _mm_set1_pd(1.0);
1628     __m128d          two     = _mm_set1_pd(2.0);
1629     x                = xx[0];
1630     f                = ff[0];
1631
1632     nri              = nlist->nri;
1633     iinr             = nlist->iinr;
1634     jindex           = nlist->jindex;
1635     jjnr             = nlist->jjnr;
1636     shiftidx         = nlist->shift;
1637     gid              = nlist->gid;
1638     shiftvec         = fr->shift_vec[0];
1639     fshift           = fr->fshift[0];
1640     facel            = _mm_set1_pd(fr->epsfac);
1641     charge           = mdatoms->chargeA;
1642
1643     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
1644     ewtab            = fr->ic->tabq_coul_FDV0;
1645     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
1646     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1647
1648     /* Setup water-specific parameters */
1649     inr              = nlist->iinr[0];
1650     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1651     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1652     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1653
1654     jq1              = _mm_set1_pd(charge[inr+1]);
1655     jq2              = _mm_set1_pd(charge[inr+2]);
1656     jq3              = _mm_set1_pd(charge[inr+3]);
1657     qq11             = _mm_mul_pd(iq1,jq1);
1658     qq12             = _mm_mul_pd(iq1,jq2);
1659     qq13             = _mm_mul_pd(iq1,jq3);
1660     qq21             = _mm_mul_pd(iq2,jq1);
1661     qq22             = _mm_mul_pd(iq2,jq2);
1662     qq23             = _mm_mul_pd(iq2,jq3);
1663     qq31             = _mm_mul_pd(iq3,jq1);
1664     qq32             = _mm_mul_pd(iq3,jq2);
1665     qq33             = _mm_mul_pd(iq3,jq3);
1666
1667     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1668     rcutoff_scalar   = fr->rcoulomb;
1669     rcutoff          = _mm_set1_pd(rcutoff_scalar);
1670     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
1671
1672     rswitch_scalar   = fr->rcoulomb_switch;
1673     rswitch          = _mm_set1_pd(rswitch_scalar);
1674     /* Setup switch parameters */
1675     d_scalar         = rcutoff_scalar-rswitch_scalar;
1676     d                = _mm_set1_pd(d_scalar);
1677     swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1678     swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1679     swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1680     swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1681     swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1682     swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1683
1684     /* Avoid stupid compiler warnings */
1685     jnrA = jnrB = 0;
1686     j_coord_offsetA = 0;
1687     j_coord_offsetB = 0;
1688
1689     outeriter        = 0;
1690     inneriter        = 0;
1691
1692     /* Start outer loop over neighborlists */
1693     for(iidx=0; iidx<nri; iidx++)
1694     {
1695         /* Load shift vector for this list */
1696         i_shift_offset   = DIM*shiftidx[iidx];
1697
1698         /* Load limits for loop over neighbors */
1699         j_index_start    = jindex[iidx];
1700         j_index_end      = jindex[iidx+1];
1701
1702         /* Get outer coordinate index */
1703         inr              = iinr[iidx];
1704         i_coord_offset   = DIM*inr;
1705
1706         /* Load i particle coords and add shift vector */
1707         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1708                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1709
1710         fix1             = _mm_setzero_pd();
1711         fiy1             = _mm_setzero_pd();
1712         fiz1             = _mm_setzero_pd();
1713         fix2             = _mm_setzero_pd();
1714         fiy2             = _mm_setzero_pd();
1715         fiz2             = _mm_setzero_pd();
1716         fix3             = _mm_setzero_pd();
1717         fiy3             = _mm_setzero_pd();
1718         fiz3             = _mm_setzero_pd();
1719
1720         /* Start inner kernel loop */
1721         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1722         {
1723
1724             /* Get j neighbor index, and coordinate index */
1725             jnrA             = jjnr[jidx];
1726             jnrB             = jjnr[jidx+1];
1727             j_coord_offsetA  = DIM*jnrA;
1728             j_coord_offsetB  = DIM*jnrB;
1729
1730             /* load j atom coordinates */
1731             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1732                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1733
1734             /* Calculate displacement vector */
1735             dx11             = _mm_sub_pd(ix1,jx1);
1736             dy11             = _mm_sub_pd(iy1,jy1);
1737             dz11             = _mm_sub_pd(iz1,jz1);
1738             dx12             = _mm_sub_pd(ix1,jx2);
1739             dy12             = _mm_sub_pd(iy1,jy2);
1740             dz12             = _mm_sub_pd(iz1,jz2);
1741             dx13             = _mm_sub_pd(ix1,jx3);
1742             dy13             = _mm_sub_pd(iy1,jy3);
1743             dz13             = _mm_sub_pd(iz1,jz3);
1744             dx21             = _mm_sub_pd(ix2,jx1);
1745             dy21             = _mm_sub_pd(iy2,jy1);
1746             dz21             = _mm_sub_pd(iz2,jz1);
1747             dx22             = _mm_sub_pd(ix2,jx2);
1748             dy22             = _mm_sub_pd(iy2,jy2);
1749             dz22             = _mm_sub_pd(iz2,jz2);
1750             dx23             = _mm_sub_pd(ix2,jx3);
1751             dy23             = _mm_sub_pd(iy2,jy3);
1752             dz23             = _mm_sub_pd(iz2,jz3);
1753             dx31             = _mm_sub_pd(ix3,jx1);
1754             dy31             = _mm_sub_pd(iy3,jy1);
1755             dz31             = _mm_sub_pd(iz3,jz1);
1756             dx32             = _mm_sub_pd(ix3,jx2);
1757             dy32             = _mm_sub_pd(iy3,jy2);
1758             dz32             = _mm_sub_pd(iz3,jz2);
1759             dx33             = _mm_sub_pd(ix3,jx3);
1760             dy33             = _mm_sub_pd(iy3,jy3);
1761             dz33             = _mm_sub_pd(iz3,jz3);
1762
1763             /* Calculate squared distance and things based on it */
1764             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1765             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1766             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1767             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1768             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1769             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1770             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1771             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1772             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1773
1774             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1775             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1776             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1777             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1778             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1779             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1780             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1781             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1782             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1783
1784             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1785             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1786             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1787             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1788             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1789             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1790             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1791             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1792             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1793
1794             fjx1             = _mm_setzero_pd();
1795             fjy1             = _mm_setzero_pd();
1796             fjz1             = _mm_setzero_pd();
1797             fjx2             = _mm_setzero_pd();
1798             fjy2             = _mm_setzero_pd();
1799             fjz2             = _mm_setzero_pd();
1800             fjx3             = _mm_setzero_pd();
1801             fjy3             = _mm_setzero_pd();
1802             fjz3             = _mm_setzero_pd();
1803
1804             /**************************
1805              * CALCULATE INTERACTIONS *
1806              **************************/
1807
1808             if (gmx_mm_any_lt(rsq11,rcutoff2))
1809             {
1810
1811             r11              = _mm_mul_pd(rsq11,rinv11);
1812
1813             /* EWALD ELECTROSTATICS */
1814
1815             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1816             ewrt             = _mm_mul_pd(r11,ewtabscale);
1817             ewitab           = _mm_cvttpd_epi32(ewrt);
1818             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1819             ewitab           = _mm_slli_epi32(ewitab,2);
1820             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1821             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1822             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1823             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1824             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1825             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1826             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1827             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1828             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1829             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1830
1831             d                = _mm_sub_pd(r11,rswitch);
1832             d                = _mm_max_pd(d,_mm_setzero_pd());
1833             d2               = _mm_mul_pd(d,d);
1834             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1835
1836             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1837
1838             /* Evaluate switch function */
1839             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1840             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1841             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1842
1843             fscal            = felec;
1844
1845             fscal            = _mm_and_pd(fscal,cutoff_mask);
1846
1847             /* Calculate temporary vectorial force */
1848             tx               = _mm_mul_pd(fscal,dx11);
1849             ty               = _mm_mul_pd(fscal,dy11);
1850             tz               = _mm_mul_pd(fscal,dz11);
1851
1852             /* Update vectorial force */
1853             fix1             = _mm_add_pd(fix1,tx);
1854             fiy1             = _mm_add_pd(fiy1,ty);
1855             fiz1             = _mm_add_pd(fiz1,tz);
1856
1857             fjx1             = _mm_add_pd(fjx1,tx);
1858             fjy1             = _mm_add_pd(fjy1,ty);
1859             fjz1             = _mm_add_pd(fjz1,tz);
1860
1861             }
1862
1863             /**************************
1864              * CALCULATE INTERACTIONS *
1865              **************************/
1866
1867             if (gmx_mm_any_lt(rsq12,rcutoff2))
1868             {
1869
1870             r12              = _mm_mul_pd(rsq12,rinv12);
1871
1872             /* EWALD ELECTROSTATICS */
1873
1874             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1875             ewrt             = _mm_mul_pd(r12,ewtabscale);
1876             ewitab           = _mm_cvttpd_epi32(ewrt);
1877             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1878             ewitab           = _mm_slli_epi32(ewitab,2);
1879             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1880             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1881             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1882             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1883             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1884             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1885             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1886             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1887             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1888             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1889
1890             d                = _mm_sub_pd(r12,rswitch);
1891             d                = _mm_max_pd(d,_mm_setzero_pd());
1892             d2               = _mm_mul_pd(d,d);
1893             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1894
1895             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1896
1897             /* Evaluate switch function */
1898             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1899             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1900             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1901
1902             fscal            = felec;
1903
1904             fscal            = _mm_and_pd(fscal,cutoff_mask);
1905
1906             /* Calculate temporary vectorial force */
1907             tx               = _mm_mul_pd(fscal,dx12);
1908             ty               = _mm_mul_pd(fscal,dy12);
1909             tz               = _mm_mul_pd(fscal,dz12);
1910
1911             /* Update vectorial force */
1912             fix1             = _mm_add_pd(fix1,tx);
1913             fiy1             = _mm_add_pd(fiy1,ty);
1914             fiz1             = _mm_add_pd(fiz1,tz);
1915
1916             fjx2             = _mm_add_pd(fjx2,tx);
1917             fjy2             = _mm_add_pd(fjy2,ty);
1918             fjz2             = _mm_add_pd(fjz2,tz);
1919
1920             }
1921
1922             /**************************
1923              * CALCULATE INTERACTIONS *
1924              **************************/
1925
1926             if (gmx_mm_any_lt(rsq13,rcutoff2))
1927             {
1928
1929             r13              = _mm_mul_pd(rsq13,rinv13);
1930
1931             /* EWALD ELECTROSTATICS */
1932
1933             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1934             ewrt             = _mm_mul_pd(r13,ewtabscale);
1935             ewitab           = _mm_cvttpd_epi32(ewrt);
1936             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1937             ewitab           = _mm_slli_epi32(ewitab,2);
1938             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1939             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1940             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1941             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1942             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1943             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1944             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1945             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1946             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1947             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1948
1949             d                = _mm_sub_pd(r13,rswitch);
1950             d                = _mm_max_pd(d,_mm_setzero_pd());
1951             d2               = _mm_mul_pd(d,d);
1952             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1953
1954             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1955
1956             /* Evaluate switch function */
1957             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1958             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
1959             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
1960
1961             fscal            = felec;
1962
1963             fscal            = _mm_and_pd(fscal,cutoff_mask);
1964
1965             /* Calculate temporary vectorial force */
1966             tx               = _mm_mul_pd(fscal,dx13);
1967             ty               = _mm_mul_pd(fscal,dy13);
1968             tz               = _mm_mul_pd(fscal,dz13);
1969
1970             /* Update vectorial force */
1971             fix1             = _mm_add_pd(fix1,tx);
1972             fiy1             = _mm_add_pd(fiy1,ty);
1973             fiz1             = _mm_add_pd(fiz1,tz);
1974
1975             fjx3             = _mm_add_pd(fjx3,tx);
1976             fjy3             = _mm_add_pd(fjy3,ty);
1977             fjz3             = _mm_add_pd(fjz3,tz);
1978
1979             }
1980
1981             /**************************
1982              * CALCULATE INTERACTIONS *
1983              **************************/
1984
1985             if (gmx_mm_any_lt(rsq21,rcutoff2))
1986             {
1987
1988             r21              = _mm_mul_pd(rsq21,rinv21);
1989
1990             /* EWALD ELECTROSTATICS */
1991
1992             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1993             ewrt             = _mm_mul_pd(r21,ewtabscale);
1994             ewitab           = _mm_cvttpd_epi32(ewrt);
1995             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1996             ewitab           = _mm_slli_epi32(ewitab,2);
1997             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1998             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1999             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2000             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2001             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2002             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2003             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2004             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2005             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2006             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2007
2008             d                = _mm_sub_pd(r21,rswitch);
2009             d                = _mm_max_pd(d,_mm_setzero_pd());
2010             d2               = _mm_mul_pd(d,d);
2011             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2012
2013             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2014
2015             /* Evaluate switch function */
2016             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2017             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2018             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2019
2020             fscal            = felec;
2021
2022             fscal            = _mm_and_pd(fscal,cutoff_mask);
2023
2024             /* Calculate temporary vectorial force */
2025             tx               = _mm_mul_pd(fscal,dx21);
2026             ty               = _mm_mul_pd(fscal,dy21);
2027             tz               = _mm_mul_pd(fscal,dz21);
2028
2029             /* Update vectorial force */
2030             fix2             = _mm_add_pd(fix2,tx);
2031             fiy2             = _mm_add_pd(fiy2,ty);
2032             fiz2             = _mm_add_pd(fiz2,tz);
2033
2034             fjx1             = _mm_add_pd(fjx1,tx);
2035             fjy1             = _mm_add_pd(fjy1,ty);
2036             fjz1             = _mm_add_pd(fjz1,tz);
2037
2038             }
2039
2040             /**************************
2041              * CALCULATE INTERACTIONS *
2042              **************************/
2043
2044             if (gmx_mm_any_lt(rsq22,rcutoff2))
2045             {
2046
2047             r22              = _mm_mul_pd(rsq22,rinv22);
2048
2049             /* EWALD ELECTROSTATICS */
2050
2051             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2052             ewrt             = _mm_mul_pd(r22,ewtabscale);
2053             ewitab           = _mm_cvttpd_epi32(ewrt);
2054             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2055             ewitab           = _mm_slli_epi32(ewitab,2);
2056             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2057             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2058             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2059             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2060             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2061             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2062             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2063             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2064             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2065             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2066
2067             d                = _mm_sub_pd(r22,rswitch);
2068             d                = _mm_max_pd(d,_mm_setzero_pd());
2069             d2               = _mm_mul_pd(d,d);
2070             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2071
2072             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2073
2074             /* Evaluate switch function */
2075             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2076             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2077             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2078
2079             fscal            = felec;
2080
2081             fscal            = _mm_and_pd(fscal,cutoff_mask);
2082
2083             /* Calculate temporary vectorial force */
2084             tx               = _mm_mul_pd(fscal,dx22);
2085             ty               = _mm_mul_pd(fscal,dy22);
2086             tz               = _mm_mul_pd(fscal,dz22);
2087
2088             /* Update vectorial force */
2089             fix2             = _mm_add_pd(fix2,tx);
2090             fiy2             = _mm_add_pd(fiy2,ty);
2091             fiz2             = _mm_add_pd(fiz2,tz);
2092
2093             fjx2             = _mm_add_pd(fjx2,tx);
2094             fjy2             = _mm_add_pd(fjy2,ty);
2095             fjz2             = _mm_add_pd(fjz2,tz);
2096
2097             }
2098
2099             /**************************
2100              * CALCULATE INTERACTIONS *
2101              **************************/
2102
2103             if (gmx_mm_any_lt(rsq23,rcutoff2))
2104             {
2105
2106             r23              = _mm_mul_pd(rsq23,rinv23);
2107
2108             /* EWALD ELECTROSTATICS */
2109
2110             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2111             ewrt             = _mm_mul_pd(r23,ewtabscale);
2112             ewitab           = _mm_cvttpd_epi32(ewrt);
2113             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2114             ewitab           = _mm_slli_epi32(ewitab,2);
2115             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2116             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2117             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2118             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2119             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2120             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2121             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2122             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2123             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
2124             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2125
2126             d                = _mm_sub_pd(r23,rswitch);
2127             d                = _mm_max_pd(d,_mm_setzero_pd());
2128             d2               = _mm_mul_pd(d,d);
2129             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2130
2131             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2132
2133             /* Evaluate switch function */
2134             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2135             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
2136             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
2137
2138             fscal            = felec;
2139
2140             fscal            = _mm_and_pd(fscal,cutoff_mask);
2141
2142             /* Calculate temporary vectorial force */
2143             tx               = _mm_mul_pd(fscal,dx23);
2144             ty               = _mm_mul_pd(fscal,dy23);
2145             tz               = _mm_mul_pd(fscal,dz23);
2146
2147             /* Update vectorial force */
2148             fix2             = _mm_add_pd(fix2,tx);
2149             fiy2             = _mm_add_pd(fiy2,ty);
2150             fiz2             = _mm_add_pd(fiz2,tz);
2151
2152             fjx3             = _mm_add_pd(fjx3,tx);
2153             fjy3             = _mm_add_pd(fjy3,ty);
2154             fjz3             = _mm_add_pd(fjz3,tz);
2155
2156             }
2157
2158             /**************************
2159              * CALCULATE INTERACTIONS *
2160              **************************/
2161
2162             if (gmx_mm_any_lt(rsq31,rcutoff2))
2163             {
2164
2165             r31              = _mm_mul_pd(rsq31,rinv31);
2166
2167             /* EWALD ELECTROSTATICS */
2168
2169             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2170             ewrt             = _mm_mul_pd(r31,ewtabscale);
2171             ewitab           = _mm_cvttpd_epi32(ewrt);
2172             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2173             ewitab           = _mm_slli_epi32(ewitab,2);
2174             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2175             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2176             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2177             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2178             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2179             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2180             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2181             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2182             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
2183             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2184
2185             d                = _mm_sub_pd(r31,rswitch);
2186             d                = _mm_max_pd(d,_mm_setzero_pd());
2187             d2               = _mm_mul_pd(d,d);
2188             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2189
2190             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2191
2192             /* Evaluate switch function */
2193             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2194             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
2195             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
2196
2197             fscal            = felec;
2198
2199             fscal            = _mm_and_pd(fscal,cutoff_mask);
2200
2201             /* Calculate temporary vectorial force */
2202             tx               = _mm_mul_pd(fscal,dx31);
2203             ty               = _mm_mul_pd(fscal,dy31);
2204             tz               = _mm_mul_pd(fscal,dz31);
2205
2206             /* Update vectorial force */
2207             fix3             = _mm_add_pd(fix3,tx);
2208             fiy3             = _mm_add_pd(fiy3,ty);
2209             fiz3             = _mm_add_pd(fiz3,tz);
2210
2211             fjx1             = _mm_add_pd(fjx1,tx);
2212             fjy1             = _mm_add_pd(fjy1,ty);
2213             fjz1             = _mm_add_pd(fjz1,tz);
2214
2215             }
2216
2217             /**************************
2218              * CALCULATE INTERACTIONS *
2219              **************************/
2220
2221             if (gmx_mm_any_lt(rsq32,rcutoff2))
2222             {
2223
2224             r32              = _mm_mul_pd(rsq32,rinv32);
2225
2226             /* EWALD ELECTROSTATICS */
2227
2228             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2229             ewrt             = _mm_mul_pd(r32,ewtabscale);
2230             ewitab           = _mm_cvttpd_epi32(ewrt);
2231             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2232             ewitab           = _mm_slli_epi32(ewitab,2);
2233             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2234             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2235             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2236             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2237             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2238             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2239             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2240             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2241             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
2242             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2243
2244             d                = _mm_sub_pd(r32,rswitch);
2245             d                = _mm_max_pd(d,_mm_setzero_pd());
2246             d2               = _mm_mul_pd(d,d);
2247             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2248
2249             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2250
2251             /* Evaluate switch function */
2252             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2253             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
2254             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
2255
2256             fscal            = felec;
2257
2258             fscal            = _mm_and_pd(fscal,cutoff_mask);
2259
2260             /* Calculate temporary vectorial force */
2261             tx               = _mm_mul_pd(fscal,dx32);
2262             ty               = _mm_mul_pd(fscal,dy32);
2263             tz               = _mm_mul_pd(fscal,dz32);
2264
2265             /* Update vectorial force */
2266             fix3             = _mm_add_pd(fix3,tx);
2267             fiy3             = _mm_add_pd(fiy3,ty);
2268             fiz3             = _mm_add_pd(fiz3,tz);
2269
2270             fjx2             = _mm_add_pd(fjx2,tx);
2271             fjy2             = _mm_add_pd(fjy2,ty);
2272             fjz2             = _mm_add_pd(fjz2,tz);
2273
2274             }
2275
2276             /**************************
2277              * CALCULATE INTERACTIONS *
2278              **************************/
2279
2280             if (gmx_mm_any_lt(rsq33,rcutoff2))
2281             {
2282
2283             r33              = _mm_mul_pd(rsq33,rinv33);
2284
2285             /* EWALD ELECTROSTATICS */
2286
2287             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2288             ewrt             = _mm_mul_pd(r33,ewtabscale);
2289             ewitab           = _mm_cvttpd_epi32(ewrt);
2290             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2291             ewitab           = _mm_slli_epi32(ewitab,2);
2292             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2293             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2294             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2295             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2296             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2297             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2298             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2299             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2300             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
2301             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2302
2303             d                = _mm_sub_pd(r33,rswitch);
2304             d                = _mm_max_pd(d,_mm_setzero_pd());
2305             d2               = _mm_mul_pd(d,d);
2306             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2307
2308             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2309
2310             /* Evaluate switch function */
2311             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2312             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
2313             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
2314
2315             fscal            = felec;
2316
2317             fscal            = _mm_and_pd(fscal,cutoff_mask);
2318
2319             /* Calculate temporary vectorial force */
2320             tx               = _mm_mul_pd(fscal,dx33);
2321             ty               = _mm_mul_pd(fscal,dy33);
2322             tz               = _mm_mul_pd(fscal,dz33);
2323
2324             /* Update vectorial force */
2325             fix3             = _mm_add_pd(fix3,tx);
2326             fiy3             = _mm_add_pd(fiy3,ty);
2327             fiz3             = _mm_add_pd(fiz3,tz);
2328
2329             fjx3             = _mm_add_pd(fjx3,tx);
2330             fjy3             = _mm_add_pd(fjy3,ty);
2331             fjz3             = _mm_add_pd(fjz3,tz);
2332
2333             }
2334
2335             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2336
2337             /* Inner loop uses 558 flops */
2338         }
2339
2340         if(jidx<j_index_end)
2341         {
2342
2343             jnrA             = jjnr[jidx];
2344             j_coord_offsetA  = DIM*jnrA;
2345
2346             /* load j atom coordinates */
2347             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
2348                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2349
2350             /* Calculate displacement vector */
2351             dx11             = _mm_sub_pd(ix1,jx1);
2352             dy11             = _mm_sub_pd(iy1,jy1);
2353             dz11             = _mm_sub_pd(iz1,jz1);
2354             dx12             = _mm_sub_pd(ix1,jx2);
2355             dy12             = _mm_sub_pd(iy1,jy2);
2356             dz12             = _mm_sub_pd(iz1,jz2);
2357             dx13             = _mm_sub_pd(ix1,jx3);
2358             dy13             = _mm_sub_pd(iy1,jy3);
2359             dz13             = _mm_sub_pd(iz1,jz3);
2360             dx21             = _mm_sub_pd(ix2,jx1);
2361             dy21             = _mm_sub_pd(iy2,jy1);
2362             dz21             = _mm_sub_pd(iz2,jz1);
2363             dx22             = _mm_sub_pd(ix2,jx2);
2364             dy22             = _mm_sub_pd(iy2,jy2);
2365             dz22             = _mm_sub_pd(iz2,jz2);
2366             dx23             = _mm_sub_pd(ix2,jx3);
2367             dy23             = _mm_sub_pd(iy2,jy3);
2368             dz23             = _mm_sub_pd(iz2,jz3);
2369             dx31             = _mm_sub_pd(ix3,jx1);
2370             dy31             = _mm_sub_pd(iy3,jy1);
2371             dz31             = _mm_sub_pd(iz3,jz1);
2372             dx32             = _mm_sub_pd(ix3,jx2);
2373             dy32             = _mm_sub_pd(iy3,jy2);
2374             dz32             = _mm_sub_pd(iz3,jz2);
2375             dx33             = _mm_sub_pd(ix3,jx3);
2376             dy33             = _mm_sub_pd(iy3,jy3);
2377             dz33             = _mm_sub_pd(iz3,jz3);
2378
2379             /* Calculate squared distance and things based on it */
2380             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2381             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2382             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2383             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2384             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2385             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2386             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2387             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2388             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2389
2390             rinv11           = gmx_mm_invsqrt_pd(rsq11);
2391             rinv12           = gmx_mm_invsqrt_pd(rsq12);
2392             rinv13           = gmx_mm_invsqrt_pd(rsq13);
2393             rinv21           = gmx_mm_invsqrt_pd(rsq21);
2394             rinv22           = gmx_mm_invsqrt_pd(rsq22);
2395             rinv23           = gmx_mm_invsqrt_pd(rsq23);
2396             rinv31           = gmx_mm_invsqrt_pd(rsq31);
2397             rinv32           = gmx_mm_invsqrt_pd(rsq32);
2398             rinv33           = gmx_mm_invsqrt_pd(rsq33);
2399
2400             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
2401             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
2402             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
2403             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
2404             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
2405             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
2406             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
2407             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
2408             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
2409
2410             fjx1             = _mm_setzero_pd();
2411             fjy1             = _mm_setzero_pd();
2412             fjz1             = _mm_setzero_pd();
2413             fjx2             = _mm_setzero_pd();
2414             fjy2             = _mm_setzero_pd();
2415             fjz2             = _mm_setzero_pd();
2416             fjx3             = _mm_setzero_pd();
2417             fjy3             = _mm_setzero_pd();
2418             fjz3             = _mm_setzero_pd();
2419
2420             /**************************
2421              * CALCULATE INTERACTIONS *
2422              **************************/
2423
2424             if (gmx_mm_any_lt(rsq11,rcutoff2))
2425             {
2426
2427             r11              = _mm_mul_pd(rsq11,rinv11);
2428
2429             /* EWALD ELECTROSTATICS */
2430
2431             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2432             ewrt             = _mm_mul_pd(r11,ewtabscale);
2433             ewitab           = _mm_cvttpd_epi32(ewrt);
2434             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2435             ewitab           = _mm_slli_epi32(ewitab,2);
2436             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2437             ewtabD           = _mm_setzero_pd();
2438             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2439             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2440             ewtabFn          = _mm_setzero_pd();
2441             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2442             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2443             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2444             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2445             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2446
2447             d                = _mm_sub_pd(r11,rswitch);
2448             d                = _mm_max_pd(d,_mm_setzero_pd());
2449             d2               = _mm_mul_pd(d,d);
2450             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2451
2452             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2453
2454             /* Evaluate switch function */
2455             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2456             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2457             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2458
2459             fscal            = felec;
2460
2461             fscal            = _mm_and_pd(fscal,cutoff_mask);
2462
2463             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2464
2465             /* Calculate temporary vectorial force */
2466             tx               = _mm_mul_pd(fscal,dx11);
2467             ty               = _mm_mul_pd(fscal,dy11);
2468             tz               = _mm_mul_pd(fscal,dz11);
2469
2470             /* Update vectorial force */
2471             fix1             = _mm_add_pd(fix1,tx);
2472             fiy1             = _mm_add_pd(fiy1,ty);
2473             fiz1             = _mm_add_pd(fiz1,tz);
2474
2475             fjx1             = _mm_add_pd(fjx1,tx);
2476             fjy1             = _mm_add_pd(fjy1,ty);
2477             fjz1             = _mm_add_pd(fjz1,tz);
2478
2479             }
2480
2481             /**************************
2482              * CALCULATE INTERACTIONS *
2483              **************************/
2484
2485             if (gmx_mm_any_lt(rsq12,rcutoff2))
2486             {
2487
2488             r12              = _mm_mul_pd(rsq12,rinv12);
2489
2490             /* EWALD ELECTROSTATICS */
2491
2492             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2493             ewrt             = _mm_mul_pd(r12,ewtabscale);
2494             ewitab           = _mm_cvttpd_epi32(ewrt);
2495             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2496             ewitab           = _mm_slli_epi32(ewitab,2);
2497             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2498             ewtabD           = _mm_setzero_pd();
2499             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2500             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2501             ewtabFn          = _mm_setzero_pd();
2502             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2503             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2504             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2505             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2506             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2507
2508             d                = _mm_sub_pd(r12,rswitch);
2509             d                = _mm_max_pd(d,_mm_setzero_pd());
2510             d2               = _mm_mul_pd(d,d);
2511             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2512
2513             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2514
2515             /* Evaluate switch function */
2516             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2517             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2518             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2519
2520             fscal            = felec;
2521
2522             fscal            = _mm_and_pd(fscal,cutoff_mask);
2523
2524             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2525
2526             /* Calculate temporary vectorial force */
2527             tx               = _mm_mul_pd(fscal,dx12);
2528             ty               = _mm_mul_pd(fscal,dy12);
2529             tz               = _mm_mul_pd(fscal,dz12);
2530
2531             /* Update vectorial force */
2532             fix1             = _mm_add_pd(fix1,tx);
2533             fiy1             = _mm_add_pd(fiy1,ty);
2534             fiz1             = _mm_add_pd(fiz1,tz);
2535
2536             fjx2             = _mm_add_pd(fjx2,tx);
2537             fjy2             = _mm_add_pd(fjy2,ty);
2538             fjz2             = _mm_add_pd(fjz2,tz);
2539
2540             }
2541
2542             /**************************
2543              * CALCULATE INTERACTIONS *
2544              **************************/
2545
2546             if (gmx_mm_any_lt(rsq13,rcutoff2))
2547             {
2548
2549             r13              = _mm_mul_pd(rsq13,rinv13);
2550
2551             /* EWALD ELECTROSTATICS */
2552
2553             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2554             ewrt             = _mm_mul_pd(r13,ewtabscale);
2555             ewitab           = _mm_cvttpd_epi32(ewrt);
2556             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2557             ewitab           = _mm_slli_epi32(ewitab,2);
2558             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2559             ewtabD           = _mm_setzero_pd();
2560             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2561             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2562             ewtabFn          = _mm_setzero_pd();
2563             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2564             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2565             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2566             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2567             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2568
2569             d                = _mm_sub_pd(r13,rswitch);
2570             d                = _mm_max_pd(d,_mm_setzero_pd());
2571             d2               = _mm_mul_pd(d,d);
2572             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2573
2574             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2575
2576             /* Evaluate switch function */
2577             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2578             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2579             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
2580
2581             fscal            = felec;
2582
2583             fscal            = _mm_and_pd(fscal,cutoff_mask);
2584
2585             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2586
2587             /* Calculate temporary vectorial force */
2588             tx               = _mm_mul_pd(fscal,dx13);
2589             ty               = _mm_mul_pd(fscal,dy13);
2590             tz               = _mm_mul_pd(fscal,dz13);
2591
2592             /* Update vectorial force */
2593             fix1             = _mm_add_pd(fix1,tx);
2594             fiy1             = _mm_add_pd(fiy1,ty);
2595             fiz1             = _mm_add_pd(fiz1,tz);
2596
2597             fjx3             = _mm_add_pd(fjx3,tx);
2598             fjy3             = _mm_add_pd(fjy3,ty);
2599             fjz3             = _mm_add_pd(fjz3,tz);
2600
2601             }
2602
2603             /**************************
2604              * CALCULATE INTERACTIONS *
2605              **************************/
2606
2607             if (gmx_mm_any_lt(rsq21,rcutoff2))
2608             {
2609
2610             r21              = _mm_mul_pd(rsq21,rinv21);
2611
2612             /* EWALD ELECTROSTATICS */
2613
2614             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2615             ewrt             = _mm_mul_pd(r21,ewtabscale);
2616             ewitab           = _mm_cvttpd_epi32(ewrt);
2617             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2618             ewitab           = _mm_slli_epi32(ewitab,2);
2619             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2620             ewtabD           = _mm_setzero_pd();
2621             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2622             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2623             ewtabFn          = _mm_setzero_pd();
2624             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2625             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2626             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2627             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2628             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2629
2630             d                = _mm_sub_pd(r21,rswitch);
2631             d                = _mm_max_pd(d,_mm_setzero_pd());
2632             d2               = _mm_mul_pd(d,d);
2633             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2634
2635             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2636
2637             /* Evaluate switch function */
2638             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2639             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2640             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2641
2642             fscal            = felec;
2643
2644             fscal            = _mm_and_pd(fscal,cutoff_mask);
2645
2646             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2647
2648             /* Calculate temporary vectorial force */
2649             tx               = _mm_mul_pd(fscal,dx21);
2650             ty               = _mm_mul_pd(fscal,dy21);
2651             tz               = _mm_mul_pd(fscal,dz21);
2652
2653             /* Update vectorial force */
2654             fix2             = _mm_add_pd(fix2,tx);
2655             fiy2             = _mm_add_pd(fiy2,ty);
2656             fiz2             = _mm_add_pd(fiz2,tz);
2657
2658             fjx1             = _mm_add_pd(fjx1,tx);
2659             fjy1             = _mm_add_pd(fjy1,ty);
2660             fjz1             = _mm_add_pd(fjz1,tz);
2661
2662             }
2663
2664             /**************************
2665              * CALCULATE INTERACTIONS *
2666              **************************/
2667
2668             if (gmx_mm_any_lt(rsq22,rcutoff2))
2669             {
2670
2671             r22              = _mm_mul_pd(rsq22,rinv22);
2672
2673             /* EWALD ELECTROSTATICS */
2674
2675             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2676             ewrt             = _mm_mul_pd(r22,ewtabscale);
2677             ewitab           = _mm_cvttpd_epi32(ewrt);
2678             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2679             ewitab           = _mm_slli_epi32(ewitab,2);
2680             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2681             ewtabD           = _mm_setzero_pd();
2682             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2683             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2684             ewtabFn          = _mm_setzero_pd();
2685             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2686             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2687             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2688             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2689             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2690
2691             d                = _mm_sub_pd(r22,rswitch);
2692             d                = _mm_max_pd(d,_mm_setzero_pd());
2693             d2               = _mm_mul_pd(d,d);
2694             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2695
2696             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2697
2698             /* Evaluate switch function */
2699             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2700             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2701             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2702
2703             fscal            = felec;
2704
2705             fscal            = _mm_and_pd(fscal,cutoff_mask);
2706
2707             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2708
2709             /* Calculate temporary vectorial force */
2710             tx               = _mm_mul_pd(fscal,dx22);
2711             ty               = _mm_mul_pd(fscal,dy22);
2712             tz               = _mm_mul_pd(fscal,dz22);
2713
2714             /* Update vectorial force */
2715             fix2             = _mm_add_pd(fix2,tx);
2716             fiy2             = _mm_add_pd(fiy2,ty);
2717             fiz2             = _mm_add_pd(fiz2,tz);
2718
2719             fjx2             = _mm_add_pd(fjx2,tx);
2720             fjy2             = _mm_add_pd(fjy2,ty);
2721             fjz2             = _mm_add_pd(fjz2,tz);
2722
2723             }
2724
2725             /**************************
2726              * CALCULATE INTERACTIONS *
2727              **************************/
2728
2729             if (gmx_mm_any_lt(rsq23,rcutoff2))
2730             {
2731
2732             r23              = _mm_mul_pd(rsq23,rinv23);
2733
2734             /* EWALD ELECTROSTATICS */
2735
2736             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2737             ewrt             = _mm_mul_pd(r23,ewtabscale);
2738             ewitab           = _mm_cvttpd_epi32(ewrt);
2739             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2740             ewitab           = _mm_slli_epi32(ewitab,2);
2741             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2742             ewtabD           = _mm_setzero_pd();
2743             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2744             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2745             ewtabFn          = _mm_setzero_pd();
2746             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2747             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2748             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2749             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
2750             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2751
2752             d                = _mm_sub_pd(r23,rswitch);
2753             d                = _mm_max_pd(d,_mm_setzero_pd());
2754             d2               = _mm_mul_pd(d,d);
2755             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2756
2757             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2758
2759             /* Evaluate switch function */
2760             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2761             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
2762             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
2763
2764             fscal            = felec;
2765
2766             fscal            = _mm_and_pd(fscal,cutoff_mask);
2767
2768             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2769
2770             /* Calculate temporary vectorial force */
2771             tx               = _mm_mul_pd(fscal,dx23);
2772             ty               = _mm_mul_pd(fscal,dy23);
2773             tz               = _mm_mul_pd(fscal,dz23);
2774
2775             /* Update vectorial force */
2776             fix2             = _mm_add_pd(fix2,tx);
2777             fiy2             = _mm_add_pd(fiy2,ty);
2778             fiz2             = _mm_add_pd(fiz2,tz);
2779
2780             fjx3             = _mm_add_pd(fjx3,tx);
2781             fjy3             = _mm_add_pd(fjy3,ty);
2782             fjz3             = _mm_add_pd(fjz3,tz);
2783
2784             }
2785
2786             /**************************
2787              * CALCULATE INTERACTIONS *
2788              **************************/
2789
2790             if (gmx_mm_any_lt(rsq31,rcutoff2))
2791             {
2792
2793             r31              = _mm_mul_pd(rsq31,rinv31);
2794
2795             /* EWALD ELECTROSTATICS */
2796
2797             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2798             ewrt             = _mm_mul_pd(r31,ewtabscale);
2799             ewitab           = _mm_cvttpd_epi32(ewrt);
2800             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2801             ewitab           = _mm_slli_epi32(ewitab,2);
2802             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2803             ewtabD           = _mm_setzero_pd();
2804             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2805             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2806             ewtabFn          = _mm_setzero_pd();
2807             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2808             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2809             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2810             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
2811             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2812
2813             d                = _mm_sub_pd(r31,rswitch);
2814             d                = _mm_max_pd(d,_mm_setzero_pd());
2815             d2               = _mm_mul_pd(d,d);
2816             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2817
2818             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2819
2820             /* Evaluate switch function */
2821             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2822             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
2823             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
2824
2825             fscal            = felec;
2826
2827             fscal            = _mm_and_pd(fscal,cutoff_mask);
2828
2829             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2830
2831             /* Calculate temporary vectorial force */
2832             tx               = _mm_mul_pd(fscal,dx31);
2833             ty               = _mm_mul_pd(fscal,dy31);
2834             tz               = _mm_mul_pd(fscal,dz31);
2835
2836             /* Update vectorial force */
2837             fix3             = _mm_add_pd(fix3,tx);
2838             fiy3             = _mm_add_pd(fiy3,ty);
2839             fiz3             = _mm_add_pd(fiz3,tz);
2840
2841             fjx1             = _mm_add_pd(fjx1,tx);
2842             fjy1             = _mm_add_pd(fjy1,ty);
2843             fjz1             = _mm_add_pd(fjz1,tz);
2844
2845             }
2846
2847             /**************************
2848              * CALCULATE INTERACTIONS *
2849              **************************/
2850
2851             if (gmx_mm_any_lt(rsq32,rcutoff2))
2852             {
2853
2854             r32              = _mm_mul_pd(rsq32,rinv32);
2855
2856             /* EWALD ELECTROSTATICS */
2857
2858             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2859             ewrt             = _mm_mul_pd(r32,ewtabscale);
2860             ewitab           = _mm_cvttpd_epi32(ewrt);
2861             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2862             ewitab           = _mm_slli_epi32(ewitab,2);
2863             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2864             ewtabD           = _mm_setzero_pd();
2865             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2866             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2867             ewtabFn          = _mm_setzero_pd();
2868             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2869             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2870             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2871             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
2872             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2873
2874             d                = _mm_sub_pd(r32,rswitch);
2875             d                = _mm_max_pd(d,_mm_setzero_pd());
2876             d2               = _mm_mul_pd(d,d);
2877             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2878
2879             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2880
2881             /* Evaluate switch function */
2882             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2883             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
2884             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
2885
2886             fscal            = felec;
2887
2888             fscal            = _mm_and_pd(fscal,cutoff_mask);
2889
2890             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2891
2892             /* Calculate temporary vectorial force */
2893             tx               = _mm_mul_pd(fscal,dx32);
2894             ty               = _mm_mul_pd(fscal,dy32);
2895             tz               = _mm_mul_pd(fscal,dz32);
2896
2897             /* Update vectorial force */
2898             fix3             = _mm_add_pd(fix3,tx);
2899             fiy3             = _mm_add_pd(fiy3,ty);
2900             fiz3             = _mm_add_pd(fiz3,tz);
2901
2902             fjx2             = _mm_add_pd(fjx2,tx);
2903             fjy2             = _mm_add_pd(fjy2,ty);
2904             fjz2             = _mm_add_pd(fjz2,tz);
2905
2906             }
2907
2908             /**************************
2909              * CALCULATE INTERACTIONS *
2910              **************************/
2911
2912             if (gmx_mm_any_lt(rsq33,rcutoff2))
2913             {
2914
2915             r33              = _mm_mul_pd(rsq33,rinv33);
2916
2917             /* EWALD ELECTROSTATICS */
2918
2919             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2920             ewrt             = _mm_mul_pd(r33,ewtabscale);
2921             ewitab           = _mm_cvttpd_epi32(ewrt);
2922             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2923             ewitab           = _mm_slli_epi32(ewitab,2);
2924             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2925             ewtabD           = _mm_setzero_pd();
2926             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2927             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2928             ewtabFn          = _mm_setzero_pd();
2929             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2930             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2931             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2932             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
2933             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2934
2935             d                = _mm_sub_pd(r33,rswitch);
2936             d                = _mm_max_pd(d,_mm_setzero_pd());
2937             d2               = _mm_mul_pd(d,d);
2938             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2939
2940             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2941
2942             /* Evaluate switch function */
2943             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2944             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
2945             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
2946
2947             fscal            = felec;
2948
2949             fscal            = _mm_and_pd(fscal,cutoff_mask);
2950
2951             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2952
2953             /* Calculate temporary vectorial force */
2954             tx               = _mm_mul_pd(fscal,dx33);
2955             ty               = _mm_mul_pd(fscal,dy33);
2956             tz               = _mm_mul_pd(fscal,dz33);
2957
2958             /* Update vectorial force */
2959             fix3             = _mm_add_pd(fix3,tx);
2960             fiy3             = _mm_add_pd(fiy3,ty);
2961             fiz3             = _mm_add_pd(fiz3,tz);
2962
2963             fjx3             = _mm_add_pd(fjx3,tx);
2964             fjy3             = _mm_add_pd(fjy3,ty);
2965             fjz3             = _mm_add_pd(fjz3,tz);
2966
2967             }
2968
2969             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2970
2971             /* Inner loop uses 558 flops */
2972         }
2973
2974         /* End of innermost loop */
2975
2976         gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2977                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
2978
2979         /* Increment number of inner iterations */
2980         inneriter                  += j_index_end - j_index_start;
2981
2982         /* Outer loop uses 18 flops */
2983     }
2984
2985     /* Increment number of outer iterations */
2986     outeriter        += nri;
2987
2988     /* Update outer/inner flops */
2989
2990     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*558);
2991 }