made errors during GPU detection non-fatal
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecEwSw_VdwNone_GeomW3W3_avx_128_fma_double.c
1 /*
2  * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_128_fma_double
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            None
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_128_fma_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB;
61     int              j_coord_offsetA,j_coord_offsetB;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwjidx0A,vdwjidx0B;
73     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74     int              vdwjidx1A,vdwjidx1B;
75     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76     int              vdwjidx2A,vdwjidx2B;
77     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
88     real             *charge;
89     __m128i          ewitab;
90     __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
91     real             *ewtab;
92     __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
93     real             rswitch_scalar,d_scalar;
94     __m128d          dummy_mask,cutoff_mask;
95     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
96     __m128d          one     = _mm_set1_pd(1.0);
97     __m128d          two     = _mm_set1_pd(2.0);
98     x                = xx[0];
99     f                = ff[0];
100
101     nri              = nlist->nri;
102     iinr             = nlist->iinr;
103     jindex           = nlist->jindex;
104     jjnr             = nlist->jjnr;
105     shiftidx         = nlist->shift;
106     gid              = nlist->gid;
107     shiftvec         = fr->shift_vec[0];
108     fshift           = fr->fshift[0];
109     facel            = _mm_set1_pd(fr->epsfac);
110     charge           = mdatoms->chargeA;
111
112     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
113     ewtab            = fr->ic->tabq_coul_FDV0;
114     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
115     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
116
117     /* Setup water-specific parameters */
118     inr              = nlist->iinr[0];
119     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
120     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
121     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
122
123     jq0              = _mm_set1_pd(charge[inr+0]);
124     jq1              = _mm_set1_pd(charge[inr+1]);
125     jq2              = _mm_set1_pd(charge[inr+2]);
126     qq00             = _mm_mul_pd(iq0,jq0);
127     qq01             = _mm_mul_pd(iq0,jq1);
128     qq02             = _mm_mul_pd(iq0,jq2);
129     qq10             = _mm_mul_pd(iq1,jq0);
130     qq11             = _mm_mul_pd(iq1,jq1);
131     qq12             = _mm_mul_pd(iq1,jq2);
132     qq20             = _mm_mul_pd(iq2,jq0);
133     qq21             = _mm_mul_pd(iq2,jq1);
134     qq22             = _mm_mul_pd(iq2,jq2);
135
136     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
137     rcutoff_scalar   = fr->rcoulomb;
138     rcutoff          = _mm_set1_pd(rcutoff_scalar);
139     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
140
141     rswitch_scalar   = fr->rcoulomb_switch;
142     rswitch          = _mm_set1_pd(rswitch_scalar);
143     /* Setup switch parameters */
144     d_scalar         = rcutoff_scalar-rswitch_scalar;
145     d                = _mm_set1_pd(d_scalar);
146     swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
147     swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
148     swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
149     swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
150     swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
151     swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
152
153     /* Avoid stupid compiler warnings */
154     jnrA = jnrB = 0;
155     j_coord_offsetA = 0;
156     j_coord_offsetB = 0;
157
158     outeriter        = 0;
159     inneriter        = 0;
160
161     /* Start outer loop over neighborlists */
162     for(iidx=0; iidx<nri; iidx++)
163     {
164         /* Load shift vector for this list */
165         i_shift_offset   = DIM*shiftidx[iidx];
166
167         /* Load limits for loop over neighbors */
168         j_index_start    = jindex[iidx];
169         j_index_end      = jindex[iidx+1];
170
171         /* Get outer coordinate index */
172         inr              = iinr[iidx];
173         i_coord_offset   = DIM*inr;
174
175         /* Load i particle coords and add shift vector */
176         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
177                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
178
179         fix0             = _mm_setzero_pd();
180         fiy0             = _mm_setzero_pd();
181         fiz0             = _mm_setzero_pd();
182         fix1             = _mm_setzero_pd();
183         fiy1             = _mm_setzero_pd();
184         fiz1             = _mm_setzero_pd();
185         fix2             = _mm_setzero_pd();
186         fiy2             = _mm_setzero_pd();
187         fiz2             = _mm_setzero_pd();
188
189         /* Reset potential sums */
190         velecsum         = _mm_setzero_pd();
191
192         /* Start inner kernel loop */
193         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
194         {
195
196             /* Get j neighbor index, and coordinate index */
197             jnrA             = jjnr[jidx];
198             jnrB             = jjnr[jidx+1];
199             j_coord_offsetA  = DIM*jnrA;
200             j_coord_offsetB  = DIM*jnrB;
201
202             /* load j atom coordinates */
203             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
204                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
205
206             /* Calculate displacement vector */
207             dx00             = _mm_sub_pd(ix0,jx0);
208             dy00             = _mm_sub_pd(iy0,jy0);
209             dz00             = _mm_sub_pd(iz0,jz0);
210             dx01             = _mm_sub_pd(ix0,jx1);
211             dy01             = _mm_sub_pd(iy0,jy1);
212             dz01             = _mm_sub_pd(iz0,jz1);
213             dx02             = _mm_sub_pd(ix0,jx2);
214             dy02             = _mm_sub_pd(iy0,jy2);
215             dz02             = _mm_sub_pd(iz0,jz2);
216             dx10             = _mm_sub_pd(ix1,jx0);
217             dy10             = _mm_sub_pd(iy1,jy0);
218             dz10             = _mm_sub_pd(iz1,jz0);
219             dx11             = _mm_sub_pd(ix1,jx1);
220             dy11             = _mm_sub_pd(iy1,jy1);
221             dz11             = _mm_sub_pd(iz1,jz1);
222             dx12             = _mm_sub_pd(ix1,jx2);
223             dy12             = _mm_sub_pd(iy1,jy2);
224             dz12             = _mm_sub_pd(iz1,jz2);
225             dx20             = _mm_sub_pd(ix2,jx0);
226             dy20             = _mm_sub_pd(iy2,jy0);
227             dz20             = _mm_sub_pd(iz2,jz0);
228             dx21             = _mm_sub_pd(ix2,jx1);
229             dy21             = _mm_sub_pd(iy2,jy1);
230             dz21             = _mm_sub_pd(iz2,jz1);
231             dx22             = _mm_sub_pd(ix2,jx2);
232             dy22             = _mm_sub_pd(iy2,jy2);
233             dz22             = _mm_sub_pd(iz2,jz2);
234
235             /* Calculate squared distance and things based on it */
236             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
237             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
238             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
239             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
240             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
241             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
242             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
243             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
244             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
245
246             rinv00           = gmx_mm_invsqrt_pd(rsq00);
247             rinv01           = gmx_mm_invsqrt_pd(rsq01);
248             rinv02           = gmx_mm_invsqrt_pd(rsq02);
249             rinv10           = gmx_mm_invsqrt_pd(rsq10);
250             rinv11           = gmx_mm_invsqrt_pd(rsq11);
251             rinv12           = gmx_mm_invsqrt_pd(rsq12);
252             rinv20           = gmx_mm_invsqrt_pd(rsq20);
253             rinv21           = gmx_mm_invsqrt_pd(rsq21);
254             rinv22           = gmx_mm_invsqrt_pd(rsq22);
255
256             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
257             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
258             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
259             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
260             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
261             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
262             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
263             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
264             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
265
266             fjx0             = _mm_setzero_pd();
267             fjy0             = _mm_setzero_pd();
268             fjz0             = _mm_setzero_pd();
269             fjx1             = _mm_setzero_pd();
270             fjy1             = _mm_setzero_pd();
271             fjz1             = _mm_setzero_pd();
272             fjx2             = _mm_setzero_pd();
273             fjy2             = _mm_setzero_pd();
274             fjz2             = _mm_setzero_pd();
275
276             /**************************
277              * CALCULATE INTERACTIONS *
278              **************************/
279
280             if (gmx_mm_any_lt(rsq00,rcutoff2))
281             {
282
283             r00              = _mm_mul_pd(rsq00,rinv00);
284
285             /* EWALD ELECTROSTATICS */
286
287             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
288             ewrt             = _mm_mul_pd(r00,ewtabscale);
289             ewitab           = _mm_cvttpd_epi32(ewrt);
290 #ifdef __XOP__
291             eweps            = _mm_frcz_pd(ewrt);
292 #else
293             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
294 #endif
295             twoeweps         = _mm_add_pd(eweps,eweps);
296             ewitab           = _mm_slli_epi32(ewitab,2);
297             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
298             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
299             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
300             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
301             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
302             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
303             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
304             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
305             velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
306             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
307
308             d                = _mm_sub_pd(r00,rswitch);
309             d                = _mm_max_pd(d,_mm_setzero_pd());
310             d2               = _mm_mul_pd(d,d);
311             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
312
313             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
314
315             /* Evaluate switch function */
316             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
317             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
318             velec            = _mm_mul_pd(velec,sw);
319             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
320
321             /* Update potential sum for this i atom from the interaction with this j atom. */
322             velec            = _mm_and_pd(velec,cutoff_mask);
323             velecsum         = _mm_add_pd(velecsum,velec);
324
325             fscal            = felec;
326
327             fscal            = _mm_and_pd(fscal,cutoff_mask);
328
329             /* Update vectorial force */
330             fix0             = _mm_macc_pd(dx00,fscal,fix0);
331             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
332             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
333             
334             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
335             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
336             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
337
338             }
339
340             /**************************
341              * CALCULATE INTERACTIONS *
342              **************************/
343
344             if (gmx_mm_any_lt(rsq01,rcutoff2))
345             {
346
347             r01              = _mm_mul_pd(rsq01,rinv01);
348
349             /* EWALD ELECTROSTATICS */
350
351             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
352             ewrt             = _mm_mul_pd(r01,ewtabscale);
353             ewitab           = _mm_cvttpd_epi32(ewrt);
354 #ifdef __XOP__
355             eweps            = _mm_frcz_pd(ewrt);
356 #else
357             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
358 #endif
359             twoeweps         = _mm_add_pd(eweps,eweps);
360             ewitab           = _mm_slli_epi32(ewitab,2);
361             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
362             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
363             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
364             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
365             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
366             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
367             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
368             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
369             velec            = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
370             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
371
372             d                = _mm_sub_pd(r01,rswitch);
373             d                = _mm_max_pd(d,_mm_setzero_pd());
374             d2               = _mm_mul_pd(d,d);
375             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
376
377             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
378
379             /* Evaluate switch function */
380             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
381             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
382             velec            = _mm_mul_pd(velec,sw);
383             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
384
385             /* Update potential sum for this i atom from the interaction with this j atom. */
386             velec            = _mm_and_pd(velec,cutoff_mask);
387             velecsum         = _mm_add_pd(velecsum,velec);
388
389             fscal            = felec;
390
391             fscal            = _mm_and_pd(fscal,cutoff_mask);
392
393             /* Update vectorial force */
394             fix0             = _mm_macc_pd(dx01,fscal,fix0);
395             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
396             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
397             
398             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
399             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
400             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
401
402             }
403
404             /**************************
405              * CALCULATE INTERACTIONS *
406              **************************/
407
408             if (gmx_mm_any_lt(rsq02,rcutoff2))
409             {
410
411             r02              = _mm_mul_pd(rsq02,rinv02);
412
413             /* EWALD ELECTROSTATICS */
414
415             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
416             ewrt             = _mm_mul_pd(r02,ewtabscale);
417             ewitab           = _mm_cvttpd_epi32(ewrt);
418 #ifdef __XOP__
419             eweps            = _mm_frcz_pd(ewrt);
420 #else
421             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
422 #endif
423             twoeweps         = _mm_add_pd(eweps,eweps);
424             ewitab           = _mm_slli_epi32(ewitab,2);
425             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
426             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
427             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
428             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
429             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
430             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
431             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
432             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
433             velec            = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
434             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
435
436             d                = _mm_sub_pd(r02,rswitch);
437             d                = _mm_max_pd(d,_mm_setzero_pd());
438             d2               = _mm_mul_pd(d,d);
439             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
440
441             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
442
443             /* Evaluate switch function */
444             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
445             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
446             velec            = _mm_mul_pd(velec,sw);
447             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
448
449             /* Update potential sum for this i atom from the interaction with this j atom. */
450             velec            = _mm_and_pd(velec,cutoff_mask);
451             velecsum         = _mm_add_pd(velecsum,velec);
452
453             fscal            = felec;
454
455             fscal            = _mm_and_pd(fscal,cutoff_mask);
456
457             /* Update vectorial force */
458             fix0             = _mm_macc_pd(dx02,fscal,fix0);
459             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
460             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
461             
462             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
463             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
464             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
465
466             }
467
468             /**************************
469              * CALCULATE INTERACTIONS *
470              **************************/
471
472             if (gmx_mm_any_lt(rsq10,rcutoff2))
473             {
474
475             r10              = _mm_mul_pd(rsq10,rinv10);
476
477             /* EWALD ELECTROSTATICS */
478
479             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
480             ewrt             = _mm_mul_pd(r10,ewtabscale);
481             ewitab           = _mm_cvttpd_epi32(ewrt);
482 #ifdef __XOP__
483             eweps            = _mm_frcz_pd(ewrt);
484 #else
485             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
486 #endif
487             twoeweps         = _mm_add_pd(eweps,eweps);
488             ewitab           = _mm_slli_epi32(ewitab,2);
489             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
490             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
491             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
492             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
493             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
494             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
495             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
496             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
497             velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
498             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
499
500             d                = _mm_sub_pd(r10,rswitch);
501             d                = _mm_max_pd(d,_mm_setzero_pd());
502             d2               = _mm_mul_pd(d,d);
503             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
504
505             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
506
507             /* Evaluate switch function */
508             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
509             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
510             velec            = _mm_mul_pd(velec,sw);
511             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
512
513             /* Update potential sum for this i atom from the interaction with this j atom. */
514             velec            = _mm_and_pd(velec,cutoff_mask);
515             velecsum         = _mm_add_pd(velecsum,velec);
516
517             fscal            = felec;
518
519             fscal            = _mm_and_pd(fscal,cutoff_mask);
520
521             /* Update vectorial force */
522             fix1             = _mm_macc_pd(dx10,fscal,fix1);
523             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
524             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
525             
526             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
527             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
528             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
529
530             }
531
532             /**************************
533              * CALCULATE INTERACTIONS *
534              **************************/
535
536             if (gmx_mm_any_lt(rsq11,rcutoff2))
537             {
538
539             r11              = _mm_mul_pd(rsq11,rinv11);
540
541             /* EWALD ELECTROSTATICS */
542
543             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
544             ewrt             = _mm_mul_pd(r11,ewtabscale);
545             ewitab           = _mm_cvttpd_epi32(ewrt);
546 #ifdef __XOP__
547             eweps            = _mm_frcz_pd(ewrt);
548 #else
549             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
550 #endif
551             twoeweps         = _mm_add_pd(eweps,eweps);
552             ewitab           = _mm_slli_epi32(ewitab,2);
553             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
554             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
555             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
556             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
557             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
558             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
559             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
560             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
561             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
562             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
563
564             d                = _mm_sub_pd(r11,rswitch);
565             d                = _mm_max_pd(d,_mm_setzero_pd());
566             d2               = _mm_mul_pd(d,d);
567             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
568
569             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
570
571             /* Evaluate switch function */
572             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
573             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
574             velec            = _mm_mul_pd(velec,sw);
575             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
576
577             /* Update potential sum for this i atom from the interaction with this j atom. */
578             velec            = _mm_and_pd(velec,cutoff_mask);
579             velecsum         = _mm_add_pd(velecsum,velec);
580
581             fscal            = felec;
582
583             fscal            = _mm_and_pd(fscal,cutoff_mask);
584
585             /* Update vectorial force */
586             fix1             = _mm_macc_pd(dx11,fscal,fix1);
587             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
588             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
589             
590             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
591             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
592             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
593
594             }
595
596             /**************************
597              * CALCULATE INTERACTIONS *
598              **************************/
599
600             if (gmx_mm_any_lt(rsq12,rcutoff2))
601             {
602
603             r12              = _mm_mul_pd(rsq12,rinv12);
604
605             /* EWALD ELECTROSTATICS */
606
607             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
608             ewrt             = _mm_mul_pd(r12,ewtabscale);
609             ewitab           = _mm_cvttpd_epi32(ewrt);
610 #ifdef __XOP__
611             eweps            = _mm_frcz_pd(ewrt);
612 #else
613             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
614 #endif
615             twoeweps         = _mm_add_pd(eweps,eweps);
616             ewitab           = _mm_slli_epi32(ewitab,2);
617             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
618             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
619             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
620             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
621             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
622             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
623             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
624             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
625             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
626             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
627
628             d                = _mm_sub_pd(r12,rswitch);
629             d                = _mm_max_pd(d,_mm_setzero_pd());
630             d2               = _mm_mul_pd(d,d);
631             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
632
633             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
634
635             /* Evaluate switch function */
636             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
637             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
638             velec            = _mm_mul_pd(velec,sw);
639             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
640
641             /* Update potential sum for this i atom from the interaction with this j atom. */
642             velec            = _mm_and_pd(velec,cutoff_mask);
643             velecsum         = _mm_add_pd(velecsum,velec);
644
645             fscal            = felec;
646
647             fscal            = _mm_and_pd(fscal,cutoff_mask);
648
649             /* Update vectorial force */
650             fix1             = _mm_macc_pd(dx12,fscal,fix1);
651             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
652             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
653             
654             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
655             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
656             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
657
658             }
659
660             /**************************
661              * CALCULATE INTERACTIONS *
662              **************************/
663
664             if (gmx_mm_any_lt(rsq20,rcutoff2))
665             {
666
667             r20              = _mm_mul_pd(rsq20,rinv20);
668
669             /* EWALD ELECTROSTATICS */
670
671             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
672             ewrt             = _mm_mul_pd(r20,ewtabscale);
673             ewitab           = _mm_cvttpd_epi32(ewrt);
674 #ifdef __XOP__
675             eweps            = _mm_frcz_pd(ewrt);
676 #else
677             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
678 #endif
679             twoeweps         = _mm_add_pd(eweps,eweps);
680             ewitab           = _mm_slli_epi32(ewitab,2);
681             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
682             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
683             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
684             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
685             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
686             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
687             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
688             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
689             velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
690             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
691
692             d                = _mm_sub_pd(r20,rswitch);
693             d                = _mm_max_pd(d,_mm_setzero_pd());
694             d2               = _mm_mul_pd(d,d);
695             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
696
697             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
698
699             /* Evaluate switch function */
700             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
701             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
702             velec            = _mm_mul_pd(velec,sw);
703             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
704
705             /* Update potential sum for this i atom from the interaction with this j atom. */
706             velec            = _mm_and_pd(velec,cutoff_mask);
707             velecsum         = _mm_add_pd(velecsum,velec);
708
709             fscal            = felec;
710
711             fscal            = _mm_and_pd(fscal,cutoff_mask);
712
713             /* Update vectorial force */
714             fix2             = _mm_macc_pd(dx20,fscal,fix2);
715             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
716             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
717             
718             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
719             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
720             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
721
722             }
723
724             /**************************
725              * CALCULATE INTERACTIONS *
726              **************************/
727
728             if (gmx_mm_any_lt(rsq21,rcutoff2))
729             {
730
731             r21              = _mm_mul_pd(rsq21,rinv21);
732
733             /* EWALD ELECTROSTATICS */
734
735             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
736             ewrt             = _mm_mul_pd(r21,ewtabscale);
737             ewitab           = _mm_cvttpd_epi32(ewrt);
738 #ifdef __XOP__
739             eweps            = _mm_frcz_pd(ewrt);
740 #else
741             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
742 #endif
743             twoeweps         = _mm_add_pd(eweps,eweps);
744             ewitab           = _mm_slli_epi32(ewitab,2);
745             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
746             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
747             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
748             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
749             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
750             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
751             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
752             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
753             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
754             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
755
756             d                = _mm_sub_pd(r21,rswitch);
757             d                = _mm_max_pd(d,_mm_setzero_pd());
758             d2               = _mm_mul_pd(d,d);
759             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
760
761             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
762
763             /* Evaluate switch function */
764             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
765             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
766             velec            = _mm_mul_pd(velec,sw);
767             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
768
769             /* Update potential sum for this i atom from the interaction with this j atom. */
770             velec            = _mm_and_pd(velec,cutoff_mask);
771             velecsum         = _mm_add_pd(velecsum,velec);
772
773             fscal            = felec;
774
775             fscal            = _mm_and_pd(fscal,cutoff_mask);
776
777             /* Update vectorial force */
778             fix2             = _mm_macc_pd(dx21,fscal,fix2);
779             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
780             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
781             
782             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
783             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
784             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
785
786             }
787
788             /**************************
789              * CALCULATE INTERACTIONS *
790              **************************/
791
792             if (gmx_mm_any_lt(rsq22,rcutoff2))
793             {
794
795             r22              = _mm_mul_pd(rsq22,rinv22);
796
797             /* EWALD ELECTROSTATICS */
798
799             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
800             ewrt             = _mm_mul_pd(r22,ewtabscale);
801             ewitab           = _mm_cvttpd_epi32(ewrt);
802 #ifdef __XOP__
803             eweps            = _mm_frcz_pd(ewrt);
804 #else
805             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
806 #endif
807             twoeweps         = _mm_add_pd(eweps,eweps);
808             ewitab           = _mm_slli_epi32(ewitab,2);
809             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
810             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
811             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
812             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
813             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
814             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
815             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
816             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
817             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
818             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
819
820             d                = _mm_sub_pd(r22,rswitch);
821             d                = _mm_max_pd(d,_mm_setzero_pd());
822             d2               = _mm_mul_pd(d,d);
823             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
824
825             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
826
827             /* Evaluate switch function */
828             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
829             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
830             velec            = _mm_mul_pd(velec,sw);
831             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
832
833             /* Update potential sum for this i atom from the interaction with this j atom. */
834             velec            = _mm_and_pd(velec,cutoff_mask);
835             velecsum         = _mm_add_pd(velecsum,velec);
836
837             fscal            = felec;
838
839             fscal            = _mm_and_pd(fscal,cutoff_mask);
840
841             /* Update vectorial force */
842             fix2             = _mm_macc_pd(dx22,fscal,fix2);
843             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
844             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
845             
846             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
847             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
848             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
849
850             }
851
852             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
853
854             /* Inner loop uses 612 flops */
855         }
856
857         if(jidx<j_index_end)
858         {
859
860             jnrA             = jjnr[jidx];
861             j_coord_offsetA  = DIM*jnrA;
862
863             /* load j atom coordinates */
864             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
865                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
866
867             /* Calculate displacement vector */
868             dx00             = _mm_sub_pd(ix0,jx0);
869             dy00             = _mm_sub_pd(iy0,jy0);
870             dz00             = _mm_sub_pd(iz0,jz0);
871             dx01             = _mm_sub_pd(ix0,jx1);
872             dy01             = _mm_sub_pd(iy0,jy1);
873             dz01             = _mm_sub_pd(iz0,jz1);
874             dx02             = _mm_sub_pd(ix0,jx2);
875             dy02             = _mm_sub_pd(iy0,jy2);
876             dz02             = _mm_sub_pd(iz0,jz2);
877             dx10             = _mm_sub_pd(ix1,jx0);
878             dy10             = _mm_sub_pd(iy1,jy0);
879             dz10             = _mm_sub_pd(iz1,jz0);
880             dx11             = _mm_sub_pd(ix1,jx1);
881             dy11             = _mm_sub_pd(iy1,jy1);
882             dz11             = _mm_sub_pd(iz1,jz1);
883             dx12             = _mm_sub_pd(ix1,jx2);
884             dy12             = _mm_sub_pd(iy1,jy2);
885             dz12             = _mm_sub_pd(iz1,jz2);
886             dx20             = _mm_sub_pd(ix2,jx0);
887             dy20             = _mm_sub_pd(iy2,jy0);
888             dz20             = _mm_sub_pd(iz2,jz0);
889             dx21             = _mm_sub_pd(ix2,jx1);
890             dy21             = _mm_sub_pd(iy2,jy1);
891             dz21             = _mm_sub_pd(iz2,jz1);
892             dx22             = _mm_sub_pd(ix2,jx2);
893             dy22             = _mm_sub_pd(iy2,jy2);
894             dz22             = _mm_sub_pd(iz2,jz2);
895
896             /* Calculate squared distance and things based on it */
897             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
898             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
899             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
900             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
901             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
902             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
903             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
904             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
905             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
906
907             rinv00           = gmx_mm_invsqrt_pd(rsq00);
908             rinv01           = gmx_mm_invsqrt_pd(rsq01);
909             rinv02           = gmx_mm_invsqrt_pd(rsq02);
910             rinv10           = gmx_mm_invsqrt_pd(rsq10);
911             rinv11           = gmx_mm_invsqrt_pd(rsq11);
912             rinv12           = gmx_mm_invsqrt_pd(rsq12);
913             rinv20           = gmx_mm_invsqrt_pd(rsq20);
914             rinv21           = gmx_mm_invsqrt_pd(rsq21);
915             rinv22           = gmx_mm_invsqrt_pd(rsq22);
916
917             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
918             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
919             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
920             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
921             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
922             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
923             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
924             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
925             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
926
927             fjx0             = _mm_setzero_pd();
928             fjy0             = _mm_setzero_pd();
929             fjz0             = _mm_setzero_pd();
930             fjx1             = _mm_setzero_pd();
931             fjy1             = _mm_setzero_pd();
932             fjz1             = _mm_setzero_pd();
933             fjx2             = _mm_setzero_pd();
934             fjy2             = _mm_setzero_pd();
935             fjz2             = _mm_setzero_pd();
936
937             /**************************
938              * CALCULATE INTERACTIONS *
939              **************************/
940
941             if (gmx_mm_any_lt(rsq00,rcutoff2))
942             {
943
944             r00              = _mm_mul_pd(rsq00,rinv00);
945
946             /* EWALD ELECTROSTATICS */
947
948             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
949             ewrt             = _mm_mul_pd(r00,ewtabscale);
950             ewitab           = _mm_cvttpd_epi32(ewrt);
951 #ifdef __XOP__
952             eweps            = _mm_frcz_pd(ewrt);
953 #else
954             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
955 #endif
956             twoeweps         = _mm_add_pd(eweps,eweps);
957             ewitab           = _mm_slli_epi32(ewitab,2);
958             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
959             ewtabD           = _mm_setzero_pd();
960             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
961             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
962             ewtabFn          = _mm_setzero_pd();
963             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
964             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
965             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
966             velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
967             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
968
969             d                = _mm_sub_pd(r00,rswitch);
970             d                = _mm_max_pd(d,_mm_setzero_pd());
971             d2               = _mm_mul_pd(d,d);
972             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
973
974             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
975
976             /* Evaluate switch function */
977             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
978             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
979             velec            = _mm_mul_pd(velec,sw);
980             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
981
982             /* Update potential sum for this i atom from the interaction with this j atom. */
983             velec            = _mm_and_pd(velec,cutoff_mask);
984             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
985             velecsum         = _mm_add_pd(velecsum,velec);
986
987             fscal            = felec;
988
989             fscal            = _mm_and_pd(fscal,cutoff_mask);
990
991             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
992
993             /* Update vectorial force */
994             fix0             = _mm_macc_pd(dx00,fscal,fix0);
995             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
996             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
997             
998             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
999             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1000             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1001
1002             }
1003
1004             /**************************
1005              * CALCULATE INTERACTIONS *
1006              **************************/
1007
1008             if (gmx_mm_any_lt(rsq01,rcutoff2))
1009             {
1010
1011             r01              = _mm_mul_pd(rsq01,rinv01);
1012
1013             /* EWALD ELECTROSTATICS */
1014
1015             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1016             ewrt             = _mm_mul_pd(r01,ewtabscale);
1017             ewitab           = _mm_cvttpd_epi32(ewrt);
1018 #ifdef __XOP__
1019             eweps            = _mm_frcz_pd(ewrt);
1020 #else
1021             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1022 #endif
1023             twoeweps         = _mm_add_pd(eweps,eweps);
1024             ewitab           = _mm_slli_epi32(ewitab,2);
1025             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1026             ewtabD           = _mm_setzero_pd();
1027             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1028             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1029             ewtabFn          = _mm_setzero_pd();
1030             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1031             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1032             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1033             velec            = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1034             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1035
1036             d                = _mm_sub_pd(r01,rswitch);
1037             d                = _mm_max_pd(d,_mm_setzero_pd());
1038             d2               = _mm_mul_pd(d,d);
1039             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1040
1041             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1042
1043             /* Evaluate switch function */
1044             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1045             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1046             velec            = _mm_mul_pd(velec,sw);
1047             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
1048
1049             /* Update potential sum for this i atom from the interaction with this j atom. */
1050             velec            = _mm_and_pd(velec,cutoff_mask);
1051             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1052             velecsum         = _mm_add_pd(velecsum,velec);
1053
1054             fscal            = felec;
1055
1056             fscal            = _mm_and_pd(fscal,cutoff_mask);
1057
1058             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1059
1060             /* Update vectorial force */
1061             fix0             = _mm_macc_pd(dx01,fscal,fix0);
1062             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
1063             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
1064             
1065             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
1066             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
1067             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
1068
1069             }
1070
1071             /**************************
1072              * CALCULATE INTERACTIONS *
1073              **************************/
1074
1075             if (gmx_mm_any_lt(rsq02,rcutoff2))
1076             {
1077
1078             r02              = _mm_mul_pd(rsq02,rinv02);
1079
1080             /* EWALD ELECTROSTATICS */
1081
1082             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1083             ewrt             = _mm_mul_pd(r02,ewtabscale);
1084             ewitab           = _mm_cvttpd_epi32(ewrt);
1085 #ifdef __XOP__
1086             eweps            = _mm_frcz_pd(ewrt);
1087 #else
1088             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1089 #endif
1090             twoeweps         = _mm_add_pd(eweps,eweps);
1091             ewitab           = _mm_slli_epi32(ewitab,2);
1092             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1093             ewtabD           = _mm_setzero_pd();
1094             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1095             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1096             ewtabFn          = _mm_setzero_pd();
1097             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1098             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1099             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1100             velec            = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
1101             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1102
1103             d                = _mm_sub_pd(r02,rswitch);
1104             d                = _mm_max_pd(d,_mm_setzero_pd());
1105             d2               = _mm_mul_pd(d,d);
1106             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1107
1108             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1109
1110             /* Evaluate switch function */
1111             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1112             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
1113             velec            = _mm_mul_pd(velec,sw);
1114             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
1115
1116             /* Update potential sum for this i atom from the interaction with this j atom. */
1117             velec            = _mm_and_pd(velec,cutoff_mask);
1118             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1119             velecsum         = _mm_add_pd(velecsum,velec);
1120
1121             fscal            = felec;
1122
1123             fscal            = _mm_and_pd(fscal,cutoff_mask);
1124
1125             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1126
1127             /* Update vectorial force */
1128             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1129             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1130             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1131             
1132             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1133             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1134             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1135
1136             }
1137
1138             /**************************
1139              * CALCULATE INTERACTIONS *
1140              **************************/
1141
1142             if (gmx_mm_any_lt(rsq10,rcutoff2))
1143             {
1144
1145             r10              = _mm_mul_pd(rsq10,rinv10);
1146
1147             /* EWALD ELECTROSTATICS */
1148
1149             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1150             ewrt             = _mm_mul_pd(r10,ewtabscale);
1151             ewitab           = _mm_cvttpd_epi32(ewrt);
1152 #ifdef __XOP__
1153             eweps            = _mm_frcz_pd(ewrt);
1154 #else
1155             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1156 #endif
1157             twoeweps         = _mm_add_pd(eweps,eweps);
1158             ewitab           = _mm_slli_epi32(ewitab,2);
1159             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1160             ewtabD           = _mm_setzero_pd();
1161             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1162             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1163             ewtabFn          = _mm_setzero_pd();
1164             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1165             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1166             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1167             velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1168             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1169
1170             d                = _mm_sub_pd(r10,rswitch);
1171             d                = _mm_max_pd(d,_mm_setzero_pd());
1172             d2               = _mm_mul_pd(d,d);
1173             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1174
1175             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1176
1177             /* Evaluate switch function */
1178             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1179             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1180             velec            = _mm_mul_pd(velec,sw);
1181             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
1182
1183             /* Update potential sum for this i atom from the interaction with this j atom. */
1184             velec            = _mm_and_pd(velec,cutoff_mask);
1185             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1186             velecsum         = _mm_add_pd(velecsum,velec);
1187
1188             fscal            = felec;
1189
1190             fscal            = _mm_and_pd(fscal,cutoff_mask);
1191
1192             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1193
1194             /* Update vectorial force */
1195             fix1             = _mm_macc_pd(dx10,fscal,fix1);
1196             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
1197             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
1198             
1199             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
1200             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
1201             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
1202
1203             }
1204
1205             /**************************
1206              * CALCULATE INTERACTIONS *
1207              **************************/
1208
1209             if (gmx_mm_any_lt(rsq11,rcutoff2))
1210             {
1211
1212             r11              = _mm_mul_pd(rsq11,rinv11);
1213
1214             /* EWALD ELECTROSTATICS */
1215
1216             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1217             ewrt             = _mm_mul_pd(r11,ewtabscale);
1218             ewitab           = _mm_cvttpd_epi32(ewrt);
1219 #ifdef __XOP__
1220             eweps            = _mm_frcz_pd(ewrt);
1221 #else
1222             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1223 #endif
1224             twoeweps         = _mm_add_pd(eweps,eweps);
1225             ewitab           = _mm_slli_epi32(ewitab,2);
1226             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1227             ewtabD           = _mm_setzero_pd();
1228             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1229             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1230             ewtabFn          = _mm_setzero_pd();
1231             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1232             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1233             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1234             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1235             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1236
1237             d                = _mm_sub_pd(r11,rswitch);
1238             d                = _mm_max_pd(d,_mm_setzero_pd());
1239             d2               = _mm_mul_pd(d,d);
1240             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1241
1242             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1243
1244             /* Evaluate switch function */
1245             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1246             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1247             velec            = _mm_mul_pd(velec,sw);
1248             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1249
1250             /* Update potential sum for this i atom from the interaction with this j atom. */
1251             velec            = _mm_and_pd(velec,cutoff_mask);
1252             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1253             velecsum         = _mm_add_pd(velecsum,velec);
1254
1255             fscal            = felec;
1256
1257             fscal            = _mm_and_pd(fscal,cutoff_mask);
1258
1259             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1260
1261             /* Update vectorial force */
1262             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1263             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1264             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1265             
1266             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1267             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1268             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1269
1270             }
1271
1272             /**************************
1273              * CALCULATE INTERACTIONS *
1274              **************************/
1275
1276             if (gmx_mm_any_lt(rsq12,rcutoff2))
1277             {
1278
1279             r12              = _mm_mul_pd(rsq12,rinv12);
1280
1281             /* EWALD ELECTROSTATICS */
1282
1283             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1284             ewrt             = _mm_mul_pd(r12,ewtabscale);
1285             ewitab           = _mm_cvttpd_epi32(ewrt);
1286 #ifdef __XOP__
1287             eweps            = _mm_frcz_pd(ewrt);
1288 #else
1289             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1290 #endif
1291             twoeweps         = _mm_add_pd(eweps,eweps);
1292             ewitab           = _mm_slli_epi32(ewitab,2);
1293             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1294             ewtabD           = _mm_setzero_pd();
1295             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1296             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1297             ewtabFn          = _mm_setzero_pd();
1298             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1299             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1300             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1301             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1302             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1303
1304             d                = _mm_sub_pd(r12,rswitch);
1305             d                = _mm_max_pd(d,_mm_setzero_pd());
1306             d2               = _mm_mul_pd(d,d);
1307             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1308
1309             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1310
1311             /* Evaluate switch function */
1312             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1313             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1314             velec            = _mm_mul_pd(velec,sw);
1315             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1316
1317             /* Update potential sum for this i atom from the interaction with this j atom. */
1318             velec            = _mm_and_pd(velec,cutoff_mask);
1319             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1320             velecsum         = _mm_add_pd(velecsum,velec);
1321
1322             fscal            = felec;
1323
1324             fscal            = _mm_and_pd(fscal,cutoff_mask);
1325
1326             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1327
1328             /* Update vectorial force */
1329             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1330             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1331             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1332             
1333             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1334             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1335             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1336
1337             }
1338
1339             /**************************
1340              * CALCULATE INTERACTIONS *
1341              **************************/
1342
1343             if (gmx_mm_any_lt(rsq20,rcutoff2))
1344             {
1345
1346             r20              = _mm_mul_pd(rsq20,rinv20);
1347
1348             /* EWALD ELECTROSTATICS */
1349
1350             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1351             ewrt             = _mm_mul_pd(r20,ewtabscale);
1352             ewitab           = _mm_cvttpd_epi32(ewrt);
1353 #ifdef __XOP__
1354             eweps            = _mm_frcz_pd(ewrt);
1355 #else
1356             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1357 #endif
1358             twoeweps         = _mm_add_pd(eweps,eweps);
1359             ewitab           = _mm_slli_epi32(ewitab,2);
1360             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1361             ewtabD           = _mm_setzero_pd();
1362             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1363             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1364             ewtabFn          = _mm_setzero_pd();
1365             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1366             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1367             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1368             velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1369             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1370
1371             d                = _mm_sub_pd(r20,rswitch);
1372             d                = _mm_max_pd(d,_mm_setzero_pd());
1373             d2               = _mm_mul_pd(d,d);
1374             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1375
1376             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1377
1378             /* Evaluate switch function */
1379             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1380             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1381             velec            = _mm_mul_pd(velec,sw);
1382             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
1383
1384             /* Update potential sum for this i atom from the interaction with this j atom. */
1385             velec            = _mm_and_pd(velec,cutoff_mask);
1386             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1387             velecsum         = _mm_add_pd(velecsum,velec);
1388
1389             fscal            = felec;
1390
1391             fscal            = _mm_and_pd(fscal,cutoff_mask);
1392
1393             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1394
1395             /* Update vectorial force */
1396             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1397             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1398             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1399             
1400             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1401             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1402             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1403
1404             }
1405
1406             /**************************
1407              * CALCULATE INTERACTIONS *
1408              **************************/
1409
1410             if (gmx_mm_any_lt(rsq21,rcutoff2))
1411             {
1412
1413             r21              = _mm_mul_pd(rsq21,rinv21);
1414
1415             /* EWALD ELECTROSTATICS */
1416
1417             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1418             ewrt             = _mm_mul_pd(r21,ewtabscale);
1419             ewitab           = _mm_cvttpd_epi32(ewrt);
1420 #ifdef __XOP__
1421             eweps            = _mm_frcz_pd(ewrt);
1422 #else
1423             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1424 #endif
1425             twoeweps         = _mm_add_pd(eweps,eweps);
1426             ewitab           = _mm_slli_epi32(ewitab,2);
1427             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1428             ewtabD           = _mm_setzero_pd();
1429             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1430             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1431             ewtabFn          = _mm_setzero_pd();
1432             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1433             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1434             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1435             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1436             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1437
1438             d                = _mm_sub_pd(r21,rswitch);
1439             d                = _mm_max_pd(d,_mm_setzero_pd());
1440             d2               = _mm_mul_pd(d,d);
1441             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1442
1443             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1444
1445             /* Evaluate switch function */
1446             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1447             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1448             velec            = _mm_mul_pd(velec,sw);
1449             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1450
1451             /* Update potential sum for this i atom from the interaction with this j atom. */
1452             velec            = _mm_and_pd(velec,cutoff_mask);
1453             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1454             velecsum         = _mm_add_pd(velecsum,velec);
1455
1456             fscal            = felec;
1457
1458             fscal            = _mm_and_pd(fscal,cutoff_mask);
1459
1460             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1461
1462             /* Update vectorial force */
1463             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1464             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1465             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1466             
1467             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1468             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1469             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1470
1471             }
1472
1473             /**************************
1474              * CALCULATE INTERACTIONS *
1475              **************************/
1476
1477             if (gmx_mm_any_lt(rsq22,rcutoff2))
1478             {
1479
1480             r22              = _mm_mul_pd(rsq22,rinv22);
1481
1482             /* EWALD ELECTROSTATICS */
1483
1484             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1485             ewrt             = _mm_mul_pd(r22,ewtabscale);
1486             ewitab           = _mm_cvttpd_epi32(ewrt);
1487 #ifdef __XOP__
1488             eweps            = _mm_frcz_pd(ewrt);
1489 #else
1490             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1491 #endif
1492             twoeweps         = _mm_add_pd(eweps,eweps);
1493             ewitab           = _mm_slli_epi32(ewitab,2);
1494             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1495             ewtabD           = _mm_setzero_pd();
1496             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1497             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1498             ewtabFn          = _mm_setzero_pd();
1499             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1500             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1501             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1502             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1503             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1504
1505             d                = _mm_sub_pd(r22,rswitch);
1506             d                = _mm_max_pd(d,_mm_setzero_pd());
1507             d2               = _mm_mul_pd(d,d);
1508             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1509
1510             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1511
1512             /* Evaluate switch function */
1513             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1514             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1515             velec            = _mm_mul_pd(velec,sw);
1516             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1517
1518             /* Update potential sum for this i atom from the interaction with this j atom. */
1519             velec            = _mm_and_pd(velec,cutoff_mask);
1520             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1521             velecsum         = _mm_add_pd(velecsum,velec);
1522
1523             fscal            = felec;
1524
1525             fscal            = _mm_and_pd(fscal,cutoff_mask);
1526
1527             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1528
1529             /* Update vectorial force */
1530             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1531             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1532             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1533             
1534             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1535             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1536             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1537
1538             }
1539
1540             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1541
1542             /* Inner loop uses 612 flops */
1543         }
1544
1545         /* End of innermost loop */
1546
1547         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1548                                               f+i_coord_offset,fshift+i_shift_offset);
1549
1550         ggid                        = gid[iidx];
1551         /* Update potential energies */
1552         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1553
1554         /* Increment number of inner iterations */
1555         inneriter                  += j_index_end - j_index_start;
1556
1557         /* Outer loop uses 19 flops */
1558     }
1559
1560     /* Increment number of outer iterations */
1561     outeriter        += nri;
1562
1563     /* Update outer/inner flops */
1564
1565     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*612);
1566 }
1567 /*
1568  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_128_fma_double
1569  * Electrostatics interaction: Ewald
1570  * VdW interaction:            None
1571  * Geometry:                   Water3-Water3
1572  * Calculate force/pot:        Force
1573  */
1574 void
1575 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_128_fma_double
1576                     (t_nblist * gmx_restrict                nlist,
1577                      rvec * gmx_restrict                    xx,
1578                      rvec * gmx_restrict                    ff,
1579                      t_forcerec * gmx_restrict              fr,
1580                      t_mdatoms * gmx_restrict               mdatoms,
1581                      nb_kernel_data_t * gmx_restrict        kernel_data,
1582                      t_nrnb * gmx_restrict                  nrnb)
1583 {
1584     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1585      * just 0 for non-waters.
1586      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1587      * jnr indices corresponding to data put in the four positions in the SIMD register.
1588      */
1589     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1590     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1591     int              jnrA,jnrB;
1592     int              j_coord_offsetA,j_coord_offsetB;
1593     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1594     real             rcutoff_scalar;
1595     real             *shiftvec,*fshift,*x,*f;
1596     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1597     int              vdwioffset0;
1598     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1599     int              vdwioffset1;
1600     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1601     int              vdwioffset2;
1602     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1603     int              vdwjidx0A,vdwjidx0B;
1604     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1605     int              vdwjidx1A,vdwjidx1B;
1606     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1607     int              vdwjidx2A,vdwjidx2B;
1608     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1609     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1610     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1611     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1612     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1613     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1614     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1615     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1616     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1617     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1618     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1619     real             *charge;
1620     __m128i          ewitab;
1621     __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1622     real             *ewtab;
1623     __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1624     real             rswitch_scalar,d_scalar;
1625     __m128d          dummy_mask,cutoff_mask;
1626     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1627     __m128d          one     = _mm_set1_pd(1.0);
1628     __m128d          two     = _mm_set1_pd(2.0);
1629     x                = xx[0];
1630     f                = ff[0];
1631
1632     nri              = nlist->nri;
1633     iinr             = nlist->iinr;
1634     jindex           = nlist->jindex;
1635     jjnr             = nlist->jjnr;
1636     shiftidx         = nlist->shift;
1637     gid              = nlist->gid;
1638     shiftvec         = fr->shift_vec[0];
1639     fshift           = fr->fshift[0];
1640     facel            = _mm_set1_pd(fr->epsfac);
1641     charge           = mdatoms->chargeA;
1642
1643     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
1644     ewtab            = fr->ic->tabq_coul_FDV0;
1645     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
1646     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1647
1648     /* Setup water-specific parameters */
1649     inr              = nlist->iinr[0];
1650     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1651     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1652     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1653
1654     jq0              = _mm_set1_pd(charge[inr+0]);
1655     jq1              = _mm_set1_pd(charge[inr+1]);
1656     jq2              = _mm_set1_pd(charge[inr+2]);
1657     qq00             = _mm_mul_pd(iq0,jq0);
1658     qq01             = _mm_mul_pd(iq0,jq1);
1659     qq02             = _mm_mul_pd(iq0,jq2);
1660     qq10             = _mm_mul_pd(iq1,jq0);
1661     qq11             = _mm_mul_pd(iq1,jq1);
1662     qq12             = _mm_mul_pd(iq1,jq2);
1663     qq20             = _mm_mul_pd(iq2,jq0);
1664     qq21             = _mm_mul_pd(iq2,jq1);
1665     qq22             = _mm_mul_pd(iq2,jq2);
1666
1667     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1668     rcutoff_scalar   = fr->rcoulomb;
1669     rcutoff          = _mm_set1_pd(rcutoff_scalar);
1670     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
1671
1672     rswitch_scalar   = fr->rcoulomb_switch;
1673     rswitch          = _mm_set1_pd(rswitch_scalar);
1674     /* Setup switch parameters */
1675     d_scalar         = rcutoff_scalar-rswitch_scalar;
1676     d                = _mm_set1_pd(d_scalar);
1677     swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1678     swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1679     swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1680     swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1681     swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1682     swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1683
1684     /* Avoid stupid compiler warnings */
1685     jnrA = jnrB = 0;
1686     j_coord_offsetA = 0;
1687     j_coord_offsetB = 0;
1688
1689     outeriter        = 0;
1690     inneriter        = 0;
1691
1692     /* Start outer loop over neighborlists */
1693     for(iidx=0; iidx<nri; iidx++)
1694     {
1695         /* Load shift vector for this list */
1696         i_shift_offset   = DIM*shiftidx[iidx];
1697
1698         /* Load limits for loop over neighbors */
1699         j_index_start    = jindex[iidx];
1700         j_index_end      = jindex[iidx+1];
1701
1702         /* Get outer coordinate index */
1703         inr              = iinr[iidx];
1704         i_coord_offset   = DIM*inr;
1705
1706         /* Load i particle coords and add shift vector */
1707         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1708                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1709
1710         fix0             = _mm_setzero_pd();
1711         fiy0             = _mm_setzero_pd();
1712         fiz0             = _mm_setzero_pd();
1713         fix1             = _mm_setzero_pd();
1714         fiy1             = _mm_setzero_pd();
1715         fiz1             = _mm_setzero_pd();
1716         fix2             = _mm_setzero_pd();
1717         fiy2             = _mm_setzero_pd();
1718         fiz2             = _mm_setzero_pd();
1719
1720         /* Start inner kernel loop */
1721         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1722         {
1723
1724             /* Get j neighbor index, and coordinate index */
1725             jnrA             = jjnr[jidx];
1726             jnrB             = jjnr[jidx+1];
1727             j_coord_offsetA  = DIM*jnrA;
1728             j_coord_offsetB  = DIM*jnrB;
1729
1730             /* load j atom coordinates */
1731             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1732                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1733
1734             /* Calculate displacement vector */
1735             dx00             = _mm_sub_pd(ix0,jx0);
1736             dy00             = _mm_sub_pd(iy0,jy0);
1737             dz00             = _mm_sub_pd(iz0,jz0);
1738             dx01             = _mm_sub_pd(ix0,jx1);
1739             dy01             = _mm_sub_pd(iy0,jy1);
1740             dz01             = _mm_sub_pd(iz0,jz1);
1741             dx02             = _mm_sub_pd(ix0,jx2);
1742             dy02             = _mm_sub_pd(iy0,jy2);
1743             dz02             = _mm_sub_pd(iz0,jz2);
1744             dx10             = _mm_sub_pd(ix1,jx0);
1745             dy10             = _mm_sub_pd(iy1,jy0);
1746             dz10             = _mm_sub_pd(iz1,jz0);
1747             dx11             = _mm_sub_pd(ix1,jx1);
1748             dy11             = _mm_sub_pd(iy1,jy1);
1749             dz11             = _mm_sub_pd(iz1,jz1);
1750             dx12             = _mm_sub_pd(ix1,jx2);
1751             dy12             = _mm_sub_pd(iy1,jy2);
1752             dz12             = _mm_sub_pd(iz1,jz2);
1753             dx20             = _mm_sub_pd(ix2,jx0);
1754             dy20             = _mm_sub_pd(iy2,jy0);
1755             dz20             = _mm_sub_pd(iz2,jz0);
1756             dx21             = _mm_sub_pd(ix2,jx1);
1757             dy21             = _mm_sub_pd(iy2,jy1);
1758             dz21             = _mm_sub_pd(iz2,jz1);
1759             dx22             = _mm_sub_pd(ix2,jx2);
1760             dy22             = _mm_sub_pd(iy2,jy2);
1761             dz22             = _mm_sub_pd(iz2,jz2);
1762
1763             /* Calculate squared distance and things based on it */
1764             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1765             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1766             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1767             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1768             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1769             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1770             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1771             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1772             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1773
1774             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1775             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1776             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1777             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1778             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1779             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1780             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1781             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1782             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1783
1784             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1785             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1786             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1787             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1788             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1789             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1790             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1791             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1792             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1793
1794             fjx0             = _mm_setzero_pd();
1795             fjy0             = _mm_setzero_pd();
1796             fjz0             = _mm_setzero_pd();
1797             fjx1             = _mm_setzero_pd();
1798             fjy1             = _mm_setzero_pd();
1799             fjz1             = _mm_setzero_pd();
1800             fjx2             = _mm_setzero_pd();
1801             fjy2             = _mm_setzero_pd();
1802             fjz2             = _mm_setzero_pd();
1803
1804             /**************************
1805              * CALCULATE INTERACTIONS *
1806              **************************/
1807
1808             if (gmx_mm_any_lt(rsq00,rcutoff2))
1809             {
1810
1811             r00              = _mm_mul_pd(rsq00,rinv00);
1812
1813             /* EWALD ELECTROSTATICS */
1814
1815             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1816             ewrt             = _mm_mul_pd(r00,ewtabscale);
1817             ewitab           = _mm_cvttpd_epi32(ewrt);
1818 #ifdef __XOP__
1819             eweps            = _mm_frcz_pd(ewrt);
1820 #else
1821             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1822 #endif
1823             twoeweps         = _mm_add_pd(eweps,eweps);
1824             ewitab           = _mm_slli_epi32(ewitab,2);
1825             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1826             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1827             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1828             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1829             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
1830             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1831             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1832             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1833             velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
1834             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1835
1836             d                = _mm_sub_pd(r00,rswitch);
1837             d                = _mm_max_pd(d,_mm_setzero_pd());
1838             d2               = _mm_mul_pd(d,d);
1839             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1840
1841             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1842
1843             /* Evaluate switch function */
1844             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1845             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
1846             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
1847
1848             fscal            = felec;
1849
1850             fscal            = _mm_and_pd(fscal,cutoff_mask);
1851
1852             /* Update vectorial force */
1853             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1854             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1855             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1856             
1857             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1858             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1859             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1860
1861             }
1862
1863             /**************************
1864              * CALCULATE INTERACTIONS *
1865              **************************/
1866
1867             if (gmx_mm_any_lt(rsq01,rcutoff2))
1868             {
1869
1870             r01              = _mm_mul_pd(rsq01,rinv01);
1871
1872             /* EWALD ELECTROSTATICS */
1873
1874             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1875             ewrt             = _mm_mul_pd(r01,ewtabscale);
1876             ewitab           = _mm_cvttpd_epi32(ewrt);
1877 #ifdef __XOP__
1878             eweps            = _mm_frcz_pd(ewrt);
1879 #else
1880             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1881 #endif
1882             twoeweps         = _mm_add_pd(eweps,eweps);
1883             ewitab           = _mm_slli_epi32(ewitab,2);
1884             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1885             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1886             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1887             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1888             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
1889             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1890             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1891             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1892             velec            = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1893             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1894
1895             d                = _mm_sub_pd(r01,rswitch);
1896             d                = _mm_max_pd(d,_mm_setzero_pd());
1897             d2               = _mm_mul_pd(d,d);
1898             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1899
1900             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1901
1902             /* Evaluate switch function */
1903             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1904             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1905             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
1906
1907             fscal            = felec;
1908
1909             fscal            = _mm_and_pd(fscal,cutoff_mask);
1910
1911             /* Update vectorial force */
1912             fix0             = _mm_macc_pd(dx01,fscal,fix0);
1913             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
1914             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
1915             
1916             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
1917             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
1918             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
1919
1920             }
1921
1922             /**************************
1923              * CALCULATE INTERACTIONS *
1924              **************************/
1925
1926             if (gmx_mm_any_lt(rsq02,rcutoff2))
1927             {
1928
1929             r02              = _mm_mul_pd(rsq02,rinv02);
1930
1931             /* EWALD ELECTROSTATICS */
1932
1933             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1934             ewrt             = _mm_mul_pd(r02,ewtabscale);
1935             ewitab           = _mm_cvttpd_epi32(ewrt);
1936 #ifdef __XOP__
1937             eweps            = _mm_frcz_pd(ewrt);
1938 #else
1939             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1940 #endif
1941             twoeweps         = _mm_add_pd(eweps,eweps);
1942             ewitab           = _mm_slli_epi32(ewitab,2);
1943             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1944             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1945             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1946             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1947             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
1948             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1949             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1950             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1951             velec            = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
1952             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1953
1954             d                = _mm_sub_pd(r02,rswitch);
1955             d                = _mm_max_pd(d,_mm_setzero_pd());
1956             d2               = _mm_mul_pd(d,d);
1957             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1958
1959             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1960
1961             /* Evaluate switch function */
1962             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1963             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
1964             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
1965
1966             fscal            = felec;
1967
1968             fscal            = _mm_and_pd(fscal,cutoff_mask);
1969
1970             /* Update vectorial force */
1971             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1972             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1973             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1974             
1975             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1976             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1977             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1978
1979             }
1980
1981             /**************************
1982              * CALCULATE INTERACTIONS *
1983              **************************/
1984
1985             if (gmx_mm_any_lt(rsq10,rcutoff2))
1986             {
1987
1988             r10              = _mm_mul_pd(rsq10,rinv10);
1989
1990             /* EWALD ELECTROSTATICS */
1991
1992             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1993             ewrt             = _mm_mul_pd(r10,ewtabscale);
1994             ewitab           = _mm_cvttpd_epi32(ewrt);
1995 #ifdef __XOP__
1996             eweps            = _mm_frcz_pd(ewrt);
1997 #else
1998             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1999 #endif
2000             twoeweps         = _mm_add_pd(eweps,eweps);
2001             ewitab           = _mm_slli_epi32(ewitab,2);
2002             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2003             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2004             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2005             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2006             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2007             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2008             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2009             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2010             velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2011             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2012
2013             d                = _mm_sub_pd(r10,rswitch);
2014             d                = _mm_max_pd(d,_mm_setzero_pd());
2015             d2               = _mm_mul_pd(d,d);
2016             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2017
2018             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2019
2020             /* Evaluate switch function */
2021             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2022             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2023             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
2024
2025             fscal            = felec;
2026
2027             fscal            = _mm_and_pd(fscal,cutoff_mask);
2028
2029             /* Update vectorial force */
2030             fix1             = _mm_macc_pd(dx10,fscal,fix1);
2031             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
2032             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
2033             
2034             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
2035             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
2036             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
2037
2038             }
2039
2040             /**************************
2041              * CALCULATE INTERACTIONS *
2042              **************************/
2043
2044             if (gmx_mm_any_lt(rsq11,rcutoff2))
2045             {
2046
2047             r11              = _mm_mul_pd(rsq11,rinv11);
2048
2049             /* EWALD ELECTROSTATICS */
2050
2051             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2052             ewrt             = _mm_mul_pd(r11,ewtabscale);
2053             ewitab           = _mm_cvttpd_epi32(ewrt);
2054 #ifdef __XOP__
2055             eweps            = _mm_frcz_pd(ewrt);
2056 #else
2057             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2058 #endif
2059             twoeweps         = _mm_add_pd(eweps,eweps);
2060             ewitab           = _mm_slli_epi32(ewitab,2);
2061             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2062             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2063             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2064             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2065             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2066             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2067             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2068             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2069             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2070             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2071
2072             d                = _mm_sub_pd(r11,rswitch);
2073             d                = _mm_max_pd(d,_mm_setzero_pd());
2074             d2               = _mm_mul_pd(d,d);
2075             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2076
2077             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2078
2079             /* Evaluate switch function */
2080             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2081             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2082             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2083
2084             fscal            = felec;
2085
2086             fscal            = _mm_and_pd(fscal,cutoff_mask);
2087
2088             /* Update vectorial force */
2089             fix1             = _mm_macc_pd(dx11,fscal,fix1);
2090             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
2091             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
2092             
2093             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
2094             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
2095             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
2096
2097             }
2098
2099             /**************************
2100              * CALCULATE INTERACTIONS *
2101              **************************/
2102
2103             if (gmx_mm_any_lt(rsq12,rcutoff2))
2104             {
2105
2106             r12              = _mm_mul_pd(rsq12,rinv12);
2107
2108             /* EWALD ELECTROSTATICS */
2109
2110             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2111             ewrt             = _mm_mul_pd(r12,ewtabscale);
2112             ewitab           = _mm_cvttpd_epi32(ewrt);
2113 #ifdef __XOP__
2114             eweps            = _mm_frcz_pd(ewrt);
2115 #else
2116             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2117 #endif
2118             twoeweps         = _mm_add_pd(eweps,eweps);
2119             ewitab           = _mm_slli_epi32(ewitab,2);
2120             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2121             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2122             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2123             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2124             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2125             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2126             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2127             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2128             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2129             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2130
2131             d                = _mm_sub_pd(r12,rswitch);
2132             d                = _mm_max_pd(d,_mm_setzero_pd());
2133             d2               = _mm_mul_pd(d,d);
2134             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2135
2136             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2137
2138             /* Evaluate switch function */
2139             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2140             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2141             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2142
2143             fscal            = felec;
2144
2145             fscal            = _mm_and_pd(fscal,cutoff_mask);
2146
2147             /* Update vectorial force */
2148             fix1             = _mm_macc_pd(dx12,fscal,fix1);
2149             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
2150             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
2151             
2152             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
2153             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
2154             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
2155
2156             }
2157
2158             /**************************
2159              * CALCULATE INTERACTIONS *
2160              **************************/
2161
2162             if (gmx_mm_any_lt(rsq20,rcutoff2))
2163             {
2164
2165             r20              = _mm_mul_pd(rsq20,rinv20);
2166
2167             /* EWALD ELECTROSTATICS */
2168
2169             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2170             ewrt             = _mm_mul_pd(r20,ewtabscale);
2171             ewitab           = _mm_cvttpd_epi32(ewrt);
2172 #ifdef __XOP__
2173             eweps            = _mm_frcz_pd(ewrt);
2174 #else
2175             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2176 #endif
2177             twoeweps         = _mm_add_pd(eweps,eweps);
2178             ewitab           = _mm_slli_epi32(ewitab,2);
2179             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2180             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2181             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2182             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2183             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2184             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2185             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2186             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2187             velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2188             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2189
2190             d                = _mm_sub_pd(r20,rswitch);
2191             d                = _mm_max_pd(d,_mm_setzero_pd());
2192             d2               = _mm_mul_pd(d,d);
2193             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2194
2195             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2196
2197             /* Evaluate switch function */
2198             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2199             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2200             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
2201
2202             fscal            = felec;
2203
2204             fscal            = _mm_and_pd(fscal,cutoff_mask);
2205
2206             /* Update vectorial force */
2207             fix2             = _mm_macc_pd(dx20,fscal,fix2);
2208             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
2209             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
2210             
2211             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
2212             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
2213             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
2214
2215             }
2216
2217             /**************************
2218              * CALCULATE INTERACTIONS *
2219              **************************/
2220
2221             if (gmx_mm_any_lt(rsq21,rcutoff2))
2222             {
2223
2224             r21              = _mm_mul_pd(rsq21,rinv21);
2225
2226             /* EWALD ELECTROSTATICS */
2227
2228             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2229             ewrt             = _mm_mul_pd(r21,ewtabscale);
2230             ewitab           = _mm_cvttpd_epi32(ewrt);
2231 #ifdef __XOP__
2232             eweps            = _mm_frcz_pd(ewrt);
2233 #else
2234             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2235 #endif
2236             twoeweps         = _mm_add_pd(eweps,eweps);
2237             ewitab           = _mm_slli_epi32(ewitab,2);
2238             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2239             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2240             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2241             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2242             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2243             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2244             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2245             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2246             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2247             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2248
2249             d                = _mm_sub_pd(r21,rswitch);
2250             d                = _mm_max_pd(d,_mm_setzero_pd());
2251             d2               = _mm_mul_pd(d,d);
2252             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2253
2254             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2255
2256             /* Evaluate switch function */
2257             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2258             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2259             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2260
2261             fscal            = felec;
2262
2263             fscal            = _mm_and_pd(fscal,cutoff_mask);
2264
2265             /* Update vectorial force */
2266             fix2             = _mm_macc_pd(dx21,fscal,fix2);
2267             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
2268             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
2269             
2270             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
2271             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
2272             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
2273
2274             }
2275
2276             /**************************
2277              * CALCULATE INTERACTIONS *
2278              **************************/
2279
2280             if (gmx_mm_any_lt(rsq22,rcutoff2))
2281             {
2282
2283             r22              = _mm_mul_pd(rsq22,rinv22);
2284
2285             /* EWALD ELECTROSTATICS */
2286
2287             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2288             ewrt             = _mm_mul_pd(r22,ewtabscale);
2289             ewitab           = _mm_cvttpd_epi32(ewrt);
2290 #ifdef __XOP__
2291             eweps            = _mm_frcz_pd(ewrt);
2292 #else
2293             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2294 #endif
2295             twoeweps         = _mm_add_pd(eweps,eweps);
2296             ewitab           = _mm_slli_epi32(ewitab,2);
2297             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2298             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2299             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2300             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2301             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2302             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2303             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2304             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2305             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2306             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2307
2308             d                = _mm_sub_pd(r22,rswitch);
2309             d                = _mm_max_pd(d,_mm_setzero_pd());
2310             d2               = _mm_mul_pd(d,d);
2311             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2312
2313             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2314
2315             /* Evaluate switch function */
2316             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2317             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2318             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2319
2320             fscal            = felec;
2321
2322             fscal            = _mm_and_pd(fscal,cutoff_mask);
2323
2324             /* Update vectorial force */
2325             fix2             = _mm_macc_pd(dx22,fscal,fix2);
2326             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
2327             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
2328             
2329             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
2330             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
2331             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
2332
2333             }
2334
2335             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2336
2337             /* Inner loop uses 585 flops */
2338         }
2339
2340         if(jidx<j_index_end)
2341         {
2342
2343             jnrA             = jjnr[jidx];
2344             j_coord_offsetA  = DIM*jnrA;
2345
2346             /* load j atom coordinates */
2347             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2348                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2349
2350             /* Calculate displacement vector */
2351             dx00             = _mm_sub_pd(ix0,jx0);
2352             dy00             = _mm_sub_pd(iy0,jy0);
2353             dz00             = _mm_sub_pd(iz0,jz0);
2354             dx01             = _mm_sub_pd(ix0,jx1);
2355             dy01             = _mm_sub_pd(iy0,jy1);
2356             dz01             = _mm_sub_pd(iz0,jz1);
2357             dx02             = _mm_sub_pd(ix0,jx2);
2358             dy02             = _mm_sub_pd(iy0,jy2);
2359             dz02             = _mm_sub_pd(iz0,jz2);
2360             dx10             = _mm_sub_pd(ix1,jx0);
2361             dy10             = _mm_sub_pd(iy1,jy0);
2362             dz10             = _mm_sub_pd(iz1,jz0);
2363             dx11             = _mm_sub_pd(ix1,jx1);
2364             dy11             = _mm_sub_pd(iy1,jy1);
2365             dz11             = _mm_sub_pd(iz1,jz1);
2366             dx12             = _mm_sub_pd(ix1,jx2);
2367             dy12             = _mm_sub_pd(iy1,jy2);
2368             dz12             = _mm_sub_pd(iz1,jz2);
2369             dx20             = _mm_sub_pd(ix2,jx0);
2370             dy20             = _mm_sub_pd(iy2,jy0);
2371             dz20             = _mm_sub_pd(iz2,jz0);
2372             dx21             = _mm_sub_pd(ix2,jx1);
2373             dy21             = _mm_sub_pd(iy2,jy1);
2374             dz21             = _mm_sub_pd(iz2,jz1);
2375             dx22             = _mm_sub_pd(ix2,jx2);
2376             dy22             = _mm_sub_pd(iy2,jy2);
2377             dz22             = _mm_sub_pd(iz2,jz2);
2378
2379             /* Calculate squared distance and things based on it */
2380             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2381             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
2382             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
2383             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
2384             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2385             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2386             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
2387             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2388             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2389
2390             rinv00           = gmx_mm_invsqrt_pd(rsq00);
2391             rinv01           = gmx_mm_invsqrt_pd(rsq01);
2392             rinv02           = gmx_mm_invsqrt_pd(rsq02);
2393             rinv10           = gmx_mm_invsqrt_pd(rsq10);
2394             rinv11           = gmx_mm_invsqrt_pd(rsq11);
2395             rinv12           = gmx_mm_invsqrt_pd(rsq12);
2396             rinv20           = gmx_mm_invsqrt_pd(rsq20);
2397             rinv21           = gmx_mm_invsqrt_pd(rsq21);
2398             rinv22           = gmx_mm_invsqrt_pd(rsq22);
2399
2400             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
2401             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
2402             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
2403             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
2404             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
2405             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
2406             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
2407             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
2408             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
2409
2410             fjx0             = _mm_setzero_pd();
2411             fjy0             = _mm_setzero_pd();
2412             fjz0             = _mm_setzero_pd();
2413             fjx1             = _mm_setzero_pd();
2414             fjy1             = _mm_setzero_pd();
2415             fjz1             = _mm_setzero_pd();
2416             fjx2             = _mm_setzero_pd();
2417             fjy2             = _mm_setzero_pd();
2418             fjz2             = _mm_setzero_pd();
2419
2420             /**************************
2421              * CALCULATE INTERACTIONS *
2422              **************************/
2423
2424             if (gmx_mm_any_lt(rsq00,rcutoff2))
2425             {
2426
2427             r00              = _mm_mul_pd(rsq00,rinv00);
2428
2429             /* EWALD ELECTROSTATICS */
2430
2431             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2432             ewrt             = _mm_mul_pd(r00,ewtabscale);
2433             ewitab           = _mm_cvttpd_epi32(ewrt);
2434 #ifdef __XOP__
2435             eweps            = _mm_frcz_pd(ewrt);
2436 #else
2437             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2438 #endif
2439             twoeweps         = _mm_add_pd(eweps,eweps);
2440             ewitab           = _mm_slli_epi32(ewitab,2);
2441             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2442             ewtabD           = _mm_setzero_pd();
2443             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2444             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2445             ewtabFn          = _mm_setzero_pd();
2446             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2447             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2448             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2449             velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
2450             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
2451
2452             d                = _mm_sub_pd(r00,rswitch);
2453             d                = _mm_max_pd(d,_mm_setzero_pd());
2454             d2               = _mm_mul_pd(d,d);
2455             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2456
2457             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2458
2459             /* Evaluate switch function */
2460             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2461             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
2462             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
2463
2464             fscal            = felec;
2465
2466             fscal            = _mm_and_pd(fscal,cutoff_mask);
2467
2468             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2469
2470             /* Update vectorial force */
2471             fix0             = _mm_macc_pd(dx00,fscal,fix0);
2472             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
2473             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
2474             
2475             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
2476             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
2477             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
2478
2479             }
2480
2481             /**************************
2482              * CALCULATE INTERACTIONS *
2483              **************************/
2484
2485             if (gmx_mm_any_lt(rsq01,rcutoff2))
2486             {
2487
2488             r01              = _mm_mul_pd(rsq01,rinv01);
2489
2490             /* EWALD ELECTROSTATICS */
2491
2492             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2493             ewrt             = _mm_mul_pd(r01,ewtabscale);
2494             ewitab           = _mm_cvttpd_epi32(ewrt);
2495 #ifdef __XOP__
2496             eweps            = _mm_frcz_pd(ewrt);
2497 #else
2498             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2499 #endif
2500             twoeweps         = _mm_add_pd(eweps,eweps);
2501             ewitab           = _mm_slli_epi32(ewitab,2);
2502             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2503             ewtabD           = _mm_setzero_pd();
2504             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2505             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2506             ewtabFn          = _mm_setzero_pd();
2507             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2508             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2509             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2510             velec            = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
2511             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2512
2513             d                = _mm_sub_pd(r01,rswitch);
2514             d                = _mm_max_pd(d,_mm_setzero_pd());
2515             d2               = _mm_mul_pd(d,d);
2516             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2517
2518             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2519
2520             /* Evaluate switch function */
2521             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2522             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
2523             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
2524
2525             fscal            = felec;
2526
2527             fscal            = _mm_and_pd(fscal,cutoff_mask);
2528
2529             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2530
2531             /* Update vectorial force */
2532             fix0             = _mm_macc_pd(dx01,fscal,fix0);
2533             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
2534             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
2535             
2536             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
2537             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
2538             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
2539
2540             }
2541
2542             /**************************
2543              * CALCULATE INTERACTIONS *
2544              **************************/
2545
2546             if (gmx_mm_any_lt(rsq02,rcutoff2))
2547             {
2548
2549             r02              = _mm_mul_pd(rsq02,rinv02);
2550
2551             /* EWALD ELECTROSTATICS */
2552
2553             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2554             ewrt             = _mm_mul_pd(r02,ewtabscale);
2555             ewitab           = _mm_cvttpd_epi32(ewrt);
2556 #ifdef __XOP__
2557             eweps            = _mm_frcz_pd(ewrt);
2558 #else
2559             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2560 #endif
2561             twoeweps         = _mm_add_pd(eweps,eweps);
2562             ewitab           = _mm_slli_epi32(ewitab,2);
2563             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2564             ewtabD           = _mm_setzero_pd();
2565             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2566             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2567             ewtabFn          = _mm_setzero_pd();
2568             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2569             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2570             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2571             velec            = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
2572             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2573
2574             d                = _mm_sub_pd(r02,rswitch);
2575             d                = _mm_max_pd(d,_mm_setzero_pd());
2576             d2               = _mm_mul_pd(d,d);
2577             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2578
2579             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2580
2581             /* Evaluate switch function */
2582             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2583             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
2584             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
2585
2586             fscal            = felec;
2587
2588             fscal            = _mm_and_pd(fscal,cutoff_mask);
2589
2590             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2591
2592             /* Update vectorial force */
2593             fix0             = _mm_macc_pd(dx02,fscal,fix0);
2594             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
2595             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
2596             
2597             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
2598             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
2599             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
2600
2601             }
2602
2603             /**************************
2604              * CALCULATE INTERACTIONS *
2605              **************************/
2606
2607             if (gmx_mm_any_lt(rsq10,rcutoff2))
2608             {
2609
2610             r10              = _mm_mul_pd(rsq10,rinv10);
2611
2612             /* EWALD ELECTROSTATICS */
2613
2614             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2615             ewrt             = _mm_mul_pd(r10,ewtabscale);
2616             ewitab           = _mm_cvttpd_epi32(ewrt);
2617 #ifdef __XOP__
2618             eweps            = _mm_frcz_pd(ewrt);
2619 #else
2620             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2621 #endif
2622             twoeweps         = _mm_add_pd(eweps,eweps);
2623             ewitab           = _mm_slli_epi32(ewitab,2);
2624             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2625             ewtabD           = _mm_setzero_pd();
2626             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2627             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2628             ewtabFn          = _mm_setzero_pd();
2629             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2630             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2631             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2632             velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2633             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2634
2635             d                = _mm_sub_pd(r10,rswitch);
2636             d                = _mm_max_pd(d,_mm_setzero_pd());
2637             d2               = _mm_mul_pd(d,d);
2638             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2639
2640             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2641
2642             /* Evaluate switch function */
2643             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2644             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2645             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
2646
2647             fscal            = felec;
2648
2649             fscal            = _mm_and_pd(fscal,cutoff_mask);
2650
2651             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2652
2653             /* Update vectorial force */
2654             fix1             = _mm_macc_pd(dx10,fscal,fix1);
2655             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
2656             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
2657             
2658             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
2659             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
2660             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
2661
2662             }
2663
2664             /**************************
2665              * CALCULATE INTERACTIONS *
2666              **************************/
2667
2668             if (gmx_mm_any_lt(rsq11,rcutoff2))
2669             {
2670
2671             r11              = _mm_mul_pd(rsq11,rinv11);
2672
2673             /* EWALD ELECTROSTATICS */
2674
2675             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2676             ewrt             = _mm_mul_pd(r11,ewtabscale);
2677             ewitab           = _mm_cvttpd_epi32(ewrt);
2678 #ifdef __XOP__
2679             eweps            = _mm_frcz_pd(ewrt);
2680 #else
2681             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2682 #endif
2683             twoeweps         = _mm_add_pd(eweps,eweps);
2684             ewitab           = _mm_slli_epi32(ewitab,2);
2685             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2686             ewtabD           = _mm_setzero_pd();
2687             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2688             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2689             ewtabFn          = _mm_setzero_pd();
2690             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2691             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2692             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2693             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2694             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2695
2696             d                = _mm_sub_pd(r11,rswitch);
2697             d                = _mm_max_pd(d,_mm_setzero_pd());
2698             d2               = _mm_mul_pd(d,d);
2699             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2700
2701             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2702
2703             /* Evaluate switch function */
2704             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2705             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2706             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2707
2708             fscal            = felec;
2709
2710             fscal            = _mm_and_pd(fscal,cutoff_mask);
2711
2712             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2713
2714             /* Update vectorial force */
2715             fix1             = _mm_macc_pd(dx11,fscal,fix1);
2716             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
2717             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
2718             
2719             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
2720             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
2721             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
2722
2723             }
2724
2725             /**************************
2726              * CALCULATE INTERACTIONS *
2727              **************************/
2728
2729             if (gmx_mm_any_lt(rsq12,rcutoff2))
2730             {
2731
2732             r12              = _mm_mul_pd(rsq12,rinv12);
2733
2734             /* EWALD ELECTROSTATICS */
2735
2736             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2737             ewrt             = _mm_mul_pd(r12,ewtabscale);
2738             ewitab           = _mm_cvttpd_epi32(ewrt);
2739 #ifdef __XOP__
2740             eweps            = _mm_frcz_pd(ewrt);
2741 #else
2742             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2743 #endif
2744             twoeweps         = _mm_add_pd(eweps,eweps);
2745             ewitab           = _mm_slli_epi32(ewitab,2);
2746             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2747             ewtabD           = _mm_setzero_pd();
2748             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2749             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2750             ewtabFn          = _mm_setzero_pd();
2751             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2752             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2753             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2754             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2755             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2756
2757             d                = _mm_sub_pd(r12,rswitch);
2758             d                = _mm_max_pd(d,_mm_setzero_pd());
2759             d2               = _mm_mul_pd(d,d);
2760             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2761
2762             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2763
2764             /* Evaluate switch function */
2765             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2766             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2767             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2768
2769             fscal            = felec;
2770
2771             fscal            = _mm_and_pd(fscal,cutoff_mask);
2772
2773             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2774
2775             /* Update vectorial force */
2776             fix1             = _mm_macc_pd(dx12,fscal,fix1);
2777             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
2778             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
2779             
2780             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
2781             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
2782             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
2783
2784             }
2785
2786             /**************************
2787              * CALCULATE INTERACTIONS *
2788              **************************/
2789
2790             if (gmx_mm_any_lt(rsq20,rcutoff2))
2791             {
2792
2793             r20              = _mm_mul_pd(rsq20,rinv20);
2794
2795             /* EWALD ELECTROSTATICS */
2796
2797             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2798             ewrt             = _mm_mul_pd(r20,ewtabscale);
2799             ewitab           = _mm_cvttpd_epi32(ewrt);
2800 #ifdef __XOP__
2801             eweps            = _mm_frcz_pd(ewrt);
2802 #else
2803             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2804 #endif
2805             twoeweps         = _mm_add_pd(eweps,eweps);
2806             ewitab           = _mm_slli_epi32(ewitab,2);
2807             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2808             ewtabD           = _mm_setzero_pd();
2809             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2810             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2811             ewtabFn          = _mm_setzero_pd();
2812             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2813             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2814             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2815             velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2816             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2817
2818             d                = _mm_sub_pd(r20,rswitch);
2819             d                = _mm_max_pd(d,_mm_setzero_pd());
2820             d2               = _mm_mul_pd(d,d);
2821             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2822
2823             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2824
2825             /* Evaluate switch function */
2826             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2827             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2828             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
2829
2830             fscal            = felec;
2831
2832             fscal            = _mm_and_pd(fscal,cutoff_mask);
2833
2834             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2835
2836             /* Update vectorial force */
2837             fix2             = _mm_macc_pd(dx20,fscal,fix2);
2838             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
2839             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
2840             
2841             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
2842             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
2843             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
2844
2845             }
2846
2847             /**************************
2848              * CALCULATE INTERACTIONS *
2849              **************************/
2850
2851             if (gmx_mm_any_lt(rsq21,rcutoff2))
2852             {
2853
2854             r21              = _mm_mul_pd(rsq21,rinv21);
2855
2856             /* EWALD ELECTROSTATICS */
2857
2858             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2859             ewrt             = _mm_mul_pd(r21,ewtabscale);
2860             ewitab           = _mm_cvttpd_epi32(ewrt);
2861 #ifdef __XOP__
2862             eweps            = _mm_frcz_pd(ewrt);
2863 #else
2864             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2865 #endif
2866             twoeweps         = _mm_add_pd(eweps,eweps);
2867             ewitab           = _mm_slli_epi32(ewitab,2);
2868             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2869             ewtabD           = _mm_setzero_pd();
2870             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2871             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2872             ewtabFn          = _mm_setzero_pd();
2873             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2874             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2875             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2876             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2877             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2878
2879             d                = _mm_sub_pd(r21,rswitch);
2880             d                = _mm_max_pd(d,_mm_setzero_pd());
2881             d2               = _mm_mul_pd(d,d);
2882             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2883
2884             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2885
2886             /* Evaluate switch function */
2887             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2888             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2889             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2890
2891             fscal            = felec;
2892
2893             fscal            = _mm_and_pd(fscal,cutoff_mask);
2894
2895             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2896
2897             /* Update vectorial force */
2898             fix2             = _mm_macc_pd(dx21,fscal,fix2);
2899             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
2900             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
2901             
2902             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
2903             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
2904             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
2905
2906             }
2907
2908             /**************************
2909              * CALCULATE INTERACTIONS *
2910              **************************/
2911
2912             if (gmx_mm_any_lt(rsq22,rcutoff2))
2913             {
2914
2915             r22              = _mm_mul_pd(rsq22,rinv22);
2916
2917             /* EWALD ELECTROSTATICS */
2918
2919             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2920             ewrt             = _mm_mul_pd(r22,ewtabscale);
2921             ewitab           = _mm_cvttpd_epi32(ewrt);
2922 #ifdef __XOP__
2923             eweps            = _mm_frcz_pd(ewrt);
2924 #else
2925             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2926 #endif
2927             twoeweps         = _mm_add_pd(eweps,eweps);
2928             ewitab           = _mm_slli_epi32(ewitab,2);
2929             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2930             ewtabD           = _mm_setzero_pd();
2931             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2932             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2933             ewtabFn          = _mm_setzero_pd();
2934             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2935             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2936             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2937             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2938             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2939
2940             d                = _mm_sub_pd(r22,rswitch);
2941             d                = _mm_max_pd(d,_mm_setzero_pd());
2942             d2               = _mm_mul_pd(d,d);
2943             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2944
2945             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2946
2947             /* Evaluate switch function */
2948             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2949             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2950             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2951
2952             fscal            = felec;
2953
2954             fscal            = _mm_and_pd(fscal,cutoff_mask);
2955
2956             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2957
2958             /* Update vectorial force */
2959             fix2             = _mm_macc_pd(dx22,fscal,fix2);
2960             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
2961             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
2962             
2963             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
2964             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
2965             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
2966
2967             }
2968
2969             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2970
2971             /* Inner loop uses 585 flops */
2972         }
2973
2974         /* End of innermost loop */
2975
2976         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2977                                               f+i_coord_offset,fshift+i_shift_offset);
2978
2979         /* Increment number of inner iterations */
2980         inneriter                  += j_index_end - j_index_start;
2981
2982         /* Outer loop uses 18 flops */
2983     }
2984
2985     /* Increment number of outer iterations */
2986     outeriter        += nri;
2987
2988     /* Update outer/inner flops */
2989
2990     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*585);
2991 }