Merge release-5-0 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_avx_128_fma_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
50 #include "kernelutil_x86_avx_128_fma_double.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_128_fma_double
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            LJEwald
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_128_fma_double
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB;
77     int              j_coord_offsetA,j_coord_offsetB;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwioffset3;
89     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90     int              vdwjidx0A,vdwjidx0B;
91     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92     int              vdwjidx1A,vdwjidx1B;
93     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B;
95     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     int              vdwjidx3A,vdwjidx3B;
97     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
109     real             *charge;
110     int              nvdwtype;
111     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112     int              *vdwtype;
113     real             *vdwparam;
114     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
115     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
116     __m128d           c6grid_00;
117     __m128d           c6grid_11;
118     __m128d           c6grid_12;
119     __m128d           c6grid_13;
120     __m128d           c6grid_21;
121     __m128d           c6grid_22;
122     __m128d           c6grid_23;
123     __m128d           c6grid_31;
124     __m128d           c6grid_32;
125     __m128d           c6grid_33;
126     real             *vdwgridparam;
127     __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
128     __m128d           one_half  = _mm_set1_pd(0.5);
129     __m128d           minus_one = _mm_set1_pd(-1.0);
130     __m128i          ewitab;
131     __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
132     real             *ewtab;
133     __m128d          dummy_mask,cutoff_mask;
134     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
135     __m128d          one     = _mm_set1_pd(1.0);
136     __m128d          two     = _mm_set1_pd(2.0);
137     x                = xx[0];
138     f                = ff[0];
139
140     nri              = nlist->nri;
141     iinr             = nlist->iinr;
142     jindex           = nlist->jindex;
143     jjnr             = nlist->jjnr;
144     shiftidx         = nlist->shift;
145     gid              = nlist->gid;
146     shiftvec         = fr->shift_vec[0];
147     fshift           = fr->fshift[0];
148     facel            = _mm_set1_pd(fr->epsfac);
149     charge           = mdatoms->chargeA;
150     nvdwtype         = fr->ntype;
151     vdwparam         = fr->nbfp;
152     vdwtype          = mdatoms->typeA;
153     vdwgridparam     = fr->ljpme_c6grid;
154     sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
155     ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
156     ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
157
158     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
159     ewtab            = fr->ic->tabq_coul_FDV0;
160     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
161     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
162
163     /* Setup water-specific parameters */
164     inr              = nlist->iinr[0];
165     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
166     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
167     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
168     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
169
170     jq1              = _mm_set1_pd(charge[inr+1]);
171     jq2              = _mm_set1_pd(charge[inr+2]);
172     jq3              = _mm_set1_pd(charge[inr+3]);
173     vdwjidx0A        = 2*vdwtype[inr+0];
174     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
175     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
176     c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
177     qq11             = _mm_mul_pd(iq1,jq1);
178     qq12             = _mm_mul_pd(iq1,jq2);
179     qq13             = _mm_mul_pd(iq1,jq3);
180     qq21             = _mm_mul_pd(iq2,jq1);
181     qq22             = _mm_mul_pd(iq2,jq2);
182     qq23             = _mm_mul_pd(iq2,jq3);
183     qq31             = _mm_mul_pd(iq3,jq1);
184     qq32             = _mm_mul_pd(iq3,jq2);
185     qq33             = _mm_mul_pd(iq3,jq3);
186
187     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
188     rcutoff_scalar   = fr->rcoulomb;
189     rcutoff          = _mm_set1_pd(rcutoff_scalar);
190     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
191
192     sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
193     rvdw             = _mm_set1_pd(fr->rvdw);
194
195     /* Avoid stupid compiler warnings */
196     jnrA = jnrB = 0;
197     j_coord_offsetA = 0;
198     j_coord_offsetB = 0;
199
200     outeriter        = 0;
201     inneriter        = 0;
202
203     /* Start outer loop over neighborlists */
204     for(iidx=0; iidx<nri; iidx++)
205     {
206         /* Load shift vector for this list */
207         i_shift_offset   = DIM*shiftidx[iidx];
208
209         /* Load limits for loop over neighbors */
210         j_index_start    = jindex[iidx];
211         j_index_end      = jindex[iidx+1];
212
213         /* Get outer coordinate index */
214         inr              = iinr[iidx];
215         i_coord_offset   = DIM*inr;
216
217         /* Load i particle coords and add shift vector */
218         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
219                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
220
221         fix0             = _mm_setzero_pd();
222         fiy0             = _mm_setzero_pd();
223         fiz0             = _mm_setzero_pd();
224         fix1             = _mm_setzero_pd();
225         fiy1             = _mm_setzero_pd();
226         fiz1             = _mm_setzero_pd();
227         fix2             = _mm_setzero_pd();
228         fiy2             = _mm_setzero_pd();
229         fiz2             = _mm_setzero_pd();
230         fix3             = _mm_setzero_pd();
231         fiy3             = _mm_setzero_pd();
232         fiz3             = _mm_setzero_pd();
233
234         /* Reset potential sums */
235         velecsum         = _mm_setzero_pd();
236         vvdwsum          = _mm_setzero_pd();
237
238         /* Start inner kernel loop */
239         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
240         {
241
242             /* Get j neighbor index, and coordinate index */
243             jnrA             = jjnr[jidx];
244             jnrB             = jjnr[jidx+1];
245             j_coord_offsetA  = DIM*jnrA;
246             j_coord_offsetB  = DIM*jnrB;
247
248             /* load j atom coordinates */
249             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
250                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
251                                               &jy2,&jz2,&jx3,&jy3,&jz3);
252
253             /* Calculate displacement vector */
254             dx00             = _mm_sub_pd(ix0,jx0);
255             dy00             = _mm_sub_pd(iy0,jy0);
256             dz00             = _mm_sub_pd(iz0,jz0);
257             dx11             = _mm_sub_pd(ix1,jx1);
258             dy11             = _mm_sub_pd(iy1,jy1);
259             dz11             = _mm_sub_pd(iz1,jz1);
260             dx12             = _mm_sub_pd(ix1,jx2);
261             dy12             = _mm_sub_pd(iy1,jy2);
262             dz12             = _mm_sub_pd(iz1,jz2);
263             dx13             = _mm_sub_pd(ix1,jx3);
264             dy13             = _mm_sub_pd(iy1,jy3);
265             dz13             = _mm_sub_pd(iz1,jz3);
266             dx21             = _mm_sub_pd(ix2,jx1);
267             dy21             = _mm_sub_pd(iy2,jy1);
268             dz21             = _mm_sub_pd(iz2,jz1);
269             dx22             = _mm_sub_pd(ix2,jx2);
270             dy22             = _mm_sub_pd(iy2,jy2);
271             dz22             = _mm_sub_pd(iz2,jz2);
272             dx23             = _mm_sub_pd(ix2,jx3);
273             dy23             = _mm_sub_pd(iy2,jy3);
274             dz23             = _mm_sub_pd(iz2,jz3);
275             dx31             = _mm_sub_pd(ix3,jx1);
276             dy31             = _mm_sub_pd(iy3,jy1);
277             dz31             = _mm_sub_pd(iz3,jz1);
278             dx32             = _mm_sub_pd(ix3,jx2);
279             dy32             = _mm_sub_pd(iy3,jy2);
280             dz32             = _mm_sub_pd(iz3,jz2);
281             dx33             = _mm_sub_pd(ix3,jx3);
282             dy33             = _mm_sub_pd(iy3,jy3);
283             dz33             = _mm_sub_pd(iz3,jz3);
284
285             /* Calculate squared distance and things based on it */
286             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
287             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
288             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
289             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
290             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
291             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
292             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
293             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
294             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
295             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
296
297             rinv00           = gmx_mm_invsqrt_pd(rsq00);
298             rinv11           = gmx_mm_invsqrt_pd(rsq11);
299             rinv12           = gmx_mm_invsqrt_pd(rsq12);
300             rinv13           = gmx_mm_invsqrt_pd(rsq13);
301             rinv21           = gmx_mm_invsqrt_pd(rsq21);
302             rinv22           = gmx_mm_invsqrt_pd(rsq22);
303             rinv23           = gmx_mm_invsqrt_pd(rsq23);
304             rinv31           = gmx_mm_invsqrt_pd(rsq31);
305             rinv32           = gmx_mm_invsqrt_pd(rsq32);
306             rinv33           = gmx_mm_invsqrt_pd(rsq33);
307
308             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
309             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
310             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
311             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
312             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
313             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
314             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
315             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
316             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
317             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
318
319             fjx0             = _mm_setzero_pd();
320             fjy0             = _mm_setzero_pd();
321             fjz0             = _mm_setzero_pd();
322             fjx1             = _mm_setzero_pd();
323             fjy1             = _mm_setzero_pd();
324             fjz1             = _mm_setzero_pd();
325             fjx2             = _mm_setzero_pd();
326             fjy2             = _mm_setzero_pd();
327             fjz2             = _mm_setzero_pd();
328             fjx3             = _mm_setzero_pd();
329             fjy3             = _mm_setzero_pd();
330             fjz3             = _mm_setzero_pd();
331
332             /**************************
333              * CALCULATE INTERACTIONS *
334              **************************/
335
336             if (gmx_mm_any_lt(rsq00,rcutoff2))
337             {
338
339             r00              = _mm_mul_pd(rsq00,rinv00);
340
341             /* Analytical LJ-PME */
342             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
343             ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
344             ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
345             exponent         = gmx_simd_exp_d(ewcljrsq);
346             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
347             poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
348             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
349             vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
350             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
351             vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
352                                _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
353             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
354             fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
355
356             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
357
358             /* Update potential sum for this i atom from the interaction with this j atom. */
359             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
360             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
361
362             fscal            = fvdw;
363
364             fscal            = _mm_and_pd(fscal,cutoff_mask);
365
366             /* Update vectorial force */
367             fix0             = _mm_macc_pd(dx00,fscal,fix0);
368             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
369             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
370             
371             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
372             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
373             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
374
375             }
376
377             /**************************
378              * CALCULATE INTERACTIONS *
379              **************************/
380
381             if (gmx_mm_any_lt(rsq11,rcutoff2))
382             {
383
384             r11              = _mm_mul_pd(rsq11,rinv11);
385
386             /* EWALD ELECTROSTATICS */
387
388             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
389             ewrt             = _mm_mul_pd(r11,ewtabscale);
390             ewitab           = _mm_cvttpd_epi32(ewrt);
391 #ifdef __XOP__
392             eweps            = _mm_frcz_pd(ewrt);
393 #else
394             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
395 #endif
396             twoeweps         = _mm_add_pd(eweps,eweps);
397             ewitab           = _mm_slli_epi32(ewitab,2);
398             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
399             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
400             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
401             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
402             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
403             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
404             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
405             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
406             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
407             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
408
409             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
410
411             /* Update potential sum for this i atom from the interaction with this j atom. */
412             velec            = _mm_and_pd(velec,cutoff_mask);
413             velecsum         = _mm_add_pd(velecsum,velec);
414
415             fscal            = felec;
416
417             fscal            = _mm_and_pd(fscal,cutoff_mask);
418
419             /* Update vectorial force */
420             fix1             = _mm_macc_pd(dx11,fscal,fix1);
421             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
422             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
423             
424             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
425             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
426             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
427
428             }
429
430             /**************************
431              * CALCULATE INTERACTIONS *
432              **************************/
433
434             if (gmx_mm_any_lt(rsq12,rcutoff2))
435             {
436
437             r12              = _mm_mul_pd(rsq12,rinv12);
438
439             /* EWALD ELECTROSTATICS */
440
441             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
442             ewrt             = _mm_mul_pd(r12,ewtabscale);
443             ewitab           = _mm_cvttpd_epi32(ewrt);
444 #ifdef __XOP__
445             eweps            = _mm_frcz_pd(ewrt);
446 #else
447             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
448 #endif
449             twoeweps         = _mm_add_pd(eweps,eweps);
450             ewitab           = _mm_slli_epi32(ewitab,2);
451             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
452             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
453             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
454             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
455             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
456             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
457             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
458             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
459             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
460             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
461
462             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
463
464             /* Update potential sum for this i atom from the interaction with this j atom. */
465             velec            = _mm_and_pd(velec,cutoff_mask);
466             velecsum         = _mm_add_pd(velecsum,velec);
467
468             fscal            = felec;
469
470             fscal            = _mm_and_pd(fscal,cutoff_mask);
471
472             /* Update vectorial force */
473             fix1             = _mm_macc_pd(dx12,fscal,fix1);
474             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
475             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
476             
477             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
478             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
479             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
480
481             }
482
483             /**************************
484              * CALCULATE INTERACTIONS *
485              **************************/
486
487             if (gmx_mm_any_lt(rsq13,rcutoff2))
488             {
489
490             r13              = _mm_mul_pd(rsq13,rinv13);
491
492             /* EWALD ELECTROSTATICS */
493
494             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
495             ewrt             = _mm_mul_pd(r13,ewtabscale);
496             ewitab           = _mm_cvttpd_epi32(ewrt);
497 #ifdef __XOP__
498             eweps            = _mm_frcz_pd(ewrt);
499 #else
500             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
501 #endif
502             twoeweps         = _mm_add_pd(eweps,eweps);
503             ewitab           = _mm_slli_epi32(ewitab,2);
504             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
505             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
506             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
507             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
508             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
509             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
510             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
511             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
512             velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
513             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
514
515             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
516
517             /* Update potential sum for this i atom from the interaction with this j atom. */
518             velec            = _mm_and_pd(velec,cutoff_mask);
519             velecsum         = _mm_add_pd(velecsum,velec);
520
521             fscal            = felec;
522
523             fscal            = _mm_and_pd(fscal,cutoff_mask);
524
525             /* Update vectorial force */
526             fix1             = _mm_macc_pd(dx13,fscal,fix1);
527             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
528             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
529             
530             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
531             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
532             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
533
534             }
535
536             /**************************
537              * CALCULATE INTERACTIONS *
538              **************************/
539
540             if (gmx_mm_any_lt(rsq21,rcutoff2))
541             {
542
543             r21              = _mm_mul_pd(rsq21,rinv21);
544
545             /* EWALD ELECTROSTATICS */
546
547             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
548             ewrt             = _mm_mul_pd(r21,ewtabscale);
549             ewitab           = _mm_cvttpd_epi32(ewrt);
550 #ifdef __XOP__
551             eweps            = _mm_frcz_pd(ewrt);
552 #else
553             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
554 #endif
555             twoeweps         = _mm_add_pd(eweps,eweps);
556             ewitab           = _mm_slli_epi32(ewitab,2);
557             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
558             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
559             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
560             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
561             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
562             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
563             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
564             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
565             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
566             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
567
568             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
569
570             /* Update potential sum for this i atom from the interaction with this j atom. */
571             velec            = _mm_and_pd(velec,cutoff_mask);
572             velecsum         = _mm_add_pd(velecsum,velec);
573
574             fscal            = felec;
575
576             fscal            = _mm_and_pd(fscal,cutoff_mask);
577
578             /* Update vectorial force */
579             fix2             = _mm_macc_pd(dx21,fscal,fix2);
580             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
581             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
582             
583             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
584             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
585             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
586
587             }
588
589             /**************************
590              * CALCULATE INTERACTIONS *
591              **************************/
592
593             if (gmx_mm_any_lt(rsq22,rcutoff2))
594             {
595
596             r22              = _mm_mul_pd(rsq22,rinv22);
597
598             /* EWALD ELECTROSTATICS */
599
600             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
601             ewrt             = _mm_mul_pd(r22,ewtabscale);
602             ewitab           = _mm_cvttpd_epi32(ewrt);
603 #ifdef __XOP__
604             eweps            = _mm_frcz_pd(ewrt);
605 #else
606             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
607 #endif
608             twoeweps         = _mm_add_pd(eweps,eweps);
609             ewitab           = _mm_slli_epi32(ewitab,2);
610             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
611             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
612             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
613             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
614             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
615             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
616             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
617             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
618             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
619             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
620
621             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
622
623             /* Update potential sum for this i atom from the interaction with this j atom. */
624             velec            = _mm_and_pd(velec,cutoff_mask);
625             velecsum         = _mm_add_pd(velecsum,velec);
626
627             fscal            = felec;
628
629             fscal            = _mm_and_pd(fscal,cutoff_mask);
630
631             /* Update vectorial force */
632             fix2             = _mm_macc_pd(dx22,fscal,fix2);
633             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
634             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
635             
636             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
637             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
638             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
639
640             }
641
642             /**************************
643              * CALCULATE INTERACTIONS *
644              **************************/
645
646             if (gmx_mm_any_lt(rsq23,rcutoff2))
647             {
648
649             r23              = _mm_mul_pd(rsq23,rinv23);
650
651             /* EWALD ELECTROSTATICS */
652
653             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
654             ewrt             = _mm_mul_pd(r23,ewtabscale);
655             ewitab           = _mm_cvttpd_epi32(ewrt);
656 #ifdef __XOP__
657             eweps            = _mm_frcz_pd(ewrt);
658 #else
659             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
660 #endif
661             twoeweps         = _mm_add_pd(eweps,eweps);
662             ewitab           = _mm_slli_epi32(ewitab,2);
663             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
664             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
665             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
666             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
667             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
668             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
669             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
670             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
671             velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
672             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
673
674             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
675
676             /* Update potential sum for this i atom from the interaction with this j atom. */
677             velec            = _mm_and_pd(velec,cutoff_mask);
678             velecsum         = _mm_add_pd(velecsum,velec);
679
680             fscal            = felec;
681
682             fscal            = _mm_and_pd(fscal,cutoff_mask);
683
684             /* Update vectorial force */
685             fix2             = _mm_macc_pd(dx23,fscal,fix2);
686             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
687             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
688             
689             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
690             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
691             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
692
693             }
694
695             /**************************
696              * CALCULATE INTERACTIONS *
697              **************************/
698
699             if (gmx_mm_any_lt(rsq31,rcutoff2))
700             {
701
702             r31              = _mm_mul_pd(rsq31,rinv31);
703
704             /* EWALD ELECTROSTATICS */
705
706             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
707             ewrt             = _mm_mul_pd(r31,ewtabscale);
708             ewitab           = _mm_cvttpd_epi32(ewrt);
709 #ifdef __XOP__
710             eweps            = _mm_frcz_pd(ewrt);
711 #else
712             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
713 #endif
714             twoeweps         = _mm_add_pd(eweps,eweps);
715             ewitab           = _mm_slli_epi32(ewitab,2);
716             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
717             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
718             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
719             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
720             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
721             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
722             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
723             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
724             velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
725             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
726
727             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
728
729             /* Update potential sum for this i atom from the interaction with this j atom. */
730             velec            = _mm_and_pd(velec,cutoff_mask);
731             velecsum         = _mm_add_pd(velecsum,velec);
732
733             fscal            = felec;
734
735             fscal            = _mm_and_pd(fscal,cutoff_mask);
736
737             /* Update vectorial force */
738             fix3             = _mm_macc_pd(dx31,fscal,fix3);
739             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
740             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
741             
742             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
743             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
744             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
745
746             }
747
748             /**************************
749              * CALCULATE INTERACTIONS *
750              **************************/
751
752             if (gmx_mm_any_lt(rsq32,rcutoff2))
753             {
754
755             r32              = _mm_mul_pd(rsq32,rinv32);
756
757             /* EWALD ELECTROSTATICS */
758
759             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
760             ewrt             = _mm_mul_pd(r32,ewtabscale);
761             ewitab           = _mm_cvttpd_epi32(ewrt);
762 #ifdef __XOP__
763             eweps            = _mm_frcz_pd(ewrt);
764 #else
765             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
766 #endif
767             twoeweps         = _mm_add_pd(eweps,eweps);
768             ewitab           = _mm_slli_epi32(ewitab,2);
769             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
770             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
771             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
772             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
773             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
774             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
775             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
776             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
777             velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
778             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
779
780             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
781
782             /* Update potential sum for this i atom from the interaction with this j atom. */
783             velec            = _mm_and_pd(velec,cutoff_mask);
784             velecsum         = _mm_add_pd(velecsum,velec);
785
786             fscal            = felec;
787
788             fscal            = _mm_and_pd(fscal,cutoff_mask);
789
790             /* Update vectorial force */
791             fix3             = _mm_macc_pd(dx32,fscal,fix3);
792             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
793             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
794             
795             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
796             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
797             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
798
799             }
800
801             /**************************
802              * CALCULATE INTERACTIONS *
803              **************************/
804
805             if (gmx_mm_any_lt(rsq33,rcutoff2))
806             {
807
808             r33              = _mm_mul_pd(rsq33,rinv33);
809
810             /* EWALD ELECTROSTATICS */
811
812             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
813             ewrt             = _mm_mul_pd(r33,ewtabscale);
814             ewitab           = _mm_cvttpd_epi32(ewrt);
815 #ifdef __XOP__
816             eweps            = _mm_frcz_pd(ewrt);
817 #else
818             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
819 #endif
820             twoeweps         = _mm_add_pd(eweps,eweps);
821             ewitab           = _mm_slli_epi32(ewitab,2);
822             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
823             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
824             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
825             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
826             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
827             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
828             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
829             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
830             velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
831             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
832
833             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
834
835             /* Update potential sum for this i atom from the interaction with this j atom. */
836             velec            = _mm_and_pd(velec,cutoff_mask);
837             velecsum         = _mm_add_pd(velecsum,velec);
838
839             fscal            = felec;
840
841             fscal            = _mm_and_pd(fscal,cutoff_mask);
842
843             /* Update vectorial force */
844             fix3             = _mm_macc_pd(dx33,fscal,fix3);
845             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
846             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
847             
848             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
849             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
850             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
851
852             }
853
854             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
855
856             /* Inner loop uses 502 flops */
857         }
858
859         if(jidx<j_index_end)
860         {
861
862             jnrA             = jjnr[jidx];
863             j_coord_offsetA  = DIM*jnrA;
864
865             /* load j atom coordinates */
866             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
867                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
868                                               &jy2,&jz2,&jx3,&jy3,&jz3);
869
870             /* Calculate displacement vector */
871             dx00             = _mm_sub_pd(ix0,jx0);
872             dy00             = _mm_sub_pd(iy0,jy0);
873             dz00             = _mm_sub_pd(iz0,jz0);
874             dx11             = _mm_sub_pd(ix1,jx1);
875             dy11             = _mm_sub_pd(iy1,jy1);
876             dz11             = _mm_sub_pd(iz1,jz1);
877             dx12             = _mm_sub_pd(ix1,jx2);
878             dy12             = _mm_sub_pd(iy1,jy2);
879             dz12             = _mm_sub_pd(iz1,jz2);
880             dx13             = _mm_sub_pd(ix1,jx3);
881             dy13             = _mm_sub_pd(iy1,jy3);
882             dz13             = _mm_sub_pd(iz1,jz3);
883             dx21             = _mm_sub_pd(ix2,jx1);
884             dy21             = _mm_sub_pd(iy2,jy1);
885             dz21             = _mm_sub_pd(iz2,jz1);
886             dx22             = _mm_sub_pd(ix2,jx2);
887             dy22             = _mm_sub_pd(iy2,jy2);
888             dz22             = _mm_sub_pd(iz2,jz2);
889             dx23             = _mm_sub_pd(ix2,jx3);
890             dy23             = _mm_sub_pd(iy2,jy3);
891             dz23             = _mm_sub_pd(iz2,jz3);
892             dx31             = _mm_sub_pd(ix3,jx1);
893             dy31             = _mm_sub_pd(iy3,jy1);
894             dz31             = _mm_sub_pd(iz3,jz1);
895             dx32             = _mm_sub_pd(ix3,jx2);
896             dy32             = _mm_sub_pd(iy3,jy2);
897             dz32             = _mm_sub_pd(iz3,jz2);
898             dx33             = _mm_sub_pd(ix3,jx3);
899             dy33             = _mm_sub_pd(iy3,jy3);
900             dz33             = _mm_sub_pd(iz3,jz3);
901
902             /* Calculate squared distance and things based on it */
903             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
904             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
905             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
906             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
907             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
908             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
909             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
910             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
911             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
912             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
913
914             rinv00           = gmx_mm_invsqrt_pd(rsq00);
915             rinv11           = gmx_mm_invsqrt_pd(rsq11);
916             rinv12           = gmx_mm_invsqrt_pd(rsq12);
917             rinv13           = gmx_mm_invsqrt_pd(rsq13);
918             rinv21           = gmx_mm_invsqrt_pd(rsq21);
919             rinv22           = gmx_mm_invsqrt_pd(rsq22);
920             rinv23           = gmx_mm_invsqrt_pd(rsq23);
921             rinv31           = gmx_mm_invsqrt_pd(rsq31);
922             rinv32           = gmx_mm_invsqrt_pd(rsq32);
923             rinv33           = gmx_mm_invsqrt_pd(rsq33);
924
925             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
926             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
927             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
928             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
929             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
930             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
931             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
932             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
933             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
934             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
935
936             fjx0             = _mm_setzero_pd();
937             fjy0             = _mm_setzero_pd();
938             fjz0             = _mm_setzero_pd();
939             fjx1             = _mm_setzero_pd();
940             fjy1             = _mm_setzero_pd();
941             fjz1             = _mm_setzero_pd();
942             fjx2             = _mm_setzero_pd();
943             fjy2             = _mm_setzero_pd();
944             fjz2             = _mm_setzero_pd();
945             fjx3             = _mm_setzero_pd();
946             fjy3             = _mm_setzero_pd();
947             fjz3             = _mm_setzero_pd();
948
949             /**************************
950              * CALCULATE INTERACTIONS *
951              **************************/
952
953             if (gmx_mm_any_lt(rsq00,rcutoff2))
954             {
955
956             r00              = _mm_mul_pd(rsq00,rinv00);
957
958             /* Analytical LJ-PME */
959             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
960             ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
961             ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
962             exponent         = gmx_simd_exp_d(ewcljrsq);
963             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
964             poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
965             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
966             vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
967             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
968             vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
969                                _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
970             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
971             fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
972
973             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
974
975             /* Update potential sum for this i atom from the interaction with this j atom. */
976             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
977             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
978             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
979
980             fscal            = fvdw;
981
982             fscal            = _mm_and_pd(fscal,cutoff_mask);
983
984             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
985
986             /* Update vectorial force */
987             fix0             = _mm_macc_pd(dx00,fscal,fix0);
988             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
989             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
990             
991             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
992             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
993             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
994
995             }
996
997             /**************************
998              * CALCULATE INTERACTIONS *
999              **************************/
1000
1001             if (gmx_mm_any_lt(rsq11,rcutoff2))
1002             {
1003
1004             r11              = _mm_mul_pd(rsq11,rinv11);
1005
1006             /* EWALD ELECTROSTATICS */
1007
1008             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1009             ewrt             = _mm_mul_pd(r11,ewtabscale);
1010             ewitab           = _mm_cvttpd_epi32(ewrt);
1011 #ifdef __XOP__
1012             eweps            = _mm_frcz_pd(ewrt);
1013 #else
1014             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1015 #endif
1016             twoeweps         = _mm_add_pd(eweps,eweps);
1017             ewitab           = _mm_slli_epi32(ewitab,2);
1018             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1019             ewtabD           = _mm_setzero_pd();
1020             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1021             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1022             ewtabFn          = _mm_setzero_pd();
1023             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1024             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1025             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1026             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
1027             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1028
1029             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1030
1031             /* Update potential sum for this i atom from the interaction with this j atom. */
1032             velec            = _mm_and_pd(velec,cutoff_mask);
1033             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1034             velecsum         = _mm_add_pd(velecsum,velec);
1035
1036             fscal            = felec;
1037
1038             fscal            = _mm_and_pd(fscal,cutoff_mask);
1039
1040             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1041
1042             /* Update vectorial force */
1043             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1044             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1045             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1046             
1047             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1048             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1049             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1050
1051             }
1052
1053             /**************************
1054              * CALCULATE INTERACTIONS *
1055              **************************/
1056
1057             if (gmx_mm_any_lt(rsq12,rcutoff2))
1058             {
1059
1060             r12              = _mm_mul_pd(rsq12,rinv12);
1061
1062             /* EWALD ELECTROSTATICS */
1063
1064             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1065             ewrt             = _mm_mul_pd(r12,ewtabscale);
1066             ewitab           = _mm_cvttpd_epi32(ewrt);
1067 #ifdef __XOP__
1068             eweps            = _mm_frcz_pd(ewrt);
1069 #else
1070             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1071 #endif
1072             twoeweps         = _mm_add_pd(eweps,eweps);
1073             ewitab           = _mm_slli_epi32(ewitab,2);
1074             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1075             ewtabD           = _mm_setzero_pd();
1076             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1077             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1078             ewtabFn          = _mm_setzero_pd();
1079             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1080             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1081             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1082             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
1083             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1084
1085             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1086
1087             /* Update potential sum for this i atom from the interaction with this j atom. */
1088             velec            = _mm_and_pd(velec,cutoff_mask);
1089             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1090             velecsum         = _mm_add_pd(velecsum,velec);
1091
1092             fscal            = felec;
1093
1094             fscal            = _mm_and_pd(fscal,cutoff_mask);
1095
1096             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1097
1098             /* Update vectorial force */
1099             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1100             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1101             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1102             
1103             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1104             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1105             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1106
1107             }
1108
1109             /**************************
1110              * CALCULATE INTERACTIONS *
1111              **************************/
1112
1113             if (gmx_mm_any_lt(rsq13,rcutoff2))
1114             {
1115
1116             r13              = _mm_mul_pd(rsq13,rinv13);
1117
1118             /* EWALD ELECTROSTATICS */
1119
1120             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1121             ewrt             = _mm_mul_pd(r13,ewtabscale);
1122             ewitab           = _mm_cvttpd_epi32(ewrt);
1123 #ifdef __XOP__
1124             eweps            = _mm_frcz_pd(ewrt);
1125 #else
1126             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1127 #endif
1128             twoeweps         = _mm_add_pd(eweps,eweps);
1129             ewitab           = _mm_slli_epi32(ewitab,2);
1130             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1131             ewtabD           = _mm_setzero_pd();
1132             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1133             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1134             ewtabFn          = _mm_setzero_pd();
1135             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1136             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1137             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1138             velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
1139             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1140
1141             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
1142
1143             /* Update potential sum for this i atom from the interaction with this j atom. */
1144             velec            = _mm_and_pd(velec,cutoff_mask);
1145             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1146             velecsum         = _mm_add_pd(velecsum,velec);
1147
1148             fscal            = felec;
1149
1150             fscal            = _mm_and_pd(fscal,cutoff_mask);
1151
1152             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1153
1154             /* Update vectorial force */
1155             fix1             = _mm_macc_pd(dx13,fscal,fix1);
1156             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
1157             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
1158             
1159             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
1160             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
1161             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
1162
1163             }
1164
1165             /**************************
1166              * CALCULATE INTERACTIONS *
1167              **************************/
1168
1169             if (gmx_mm_any_lt(rsq21,rcutoff2))
1170             {
1171
1172             r21              = _mm_mul_pd(rsq21,rinv21);
1173
1174             /* EWALD ELECTROSTATICS */
1175
1176             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1177             ewrt             = _mm_mul_pd(r21,ewtabscale);
1178             ewitab           = _mm_cvttpd_epi32(ewrt);
1179 #ifdef __XOP__
1180             eweps            = _mm_frcz_pd(ewrt);
1181 #else
1182             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1183 #endif
1184             twoeweps         = _mm_add_pd(eweps,eweps);
1185             ewitab           = _mm_slli_epi32(ewitab,2);
1186             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1187             ewtabD           = _mm_setzero_pd();
1188             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1189             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1190             ewtabFn          = _mm_setzero_pd();
1191             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1192             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1193             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1194             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
1195             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1196
1197             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1198
1199             /* Update potential sum for this i atom from the interaction with this j atom. */
1200             velec            = _mm_and_pd(velec,cutoff_mask);
1201             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1202             velecsum         = _mm_add_pd(velecsum,velec);
1203
1204             fscal            = felec;
1205
1206             fscal            = _mm_and_pd(fscal,cutoff_mask);
1207
1208             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1209
1210             /* Update vectorial force */
1211             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1212             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1213             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1214             
1215             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1216             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1217             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1218
1219             }
1220
1221             /**************************
1222              * CALCULATE INTERACTIONS *
1223              **************************/
1224
1225             if (gmx_mm_any_lt(rsq22,rcutoff2))
1226             {
1227
1228             r22              = _mm_mul_pd(rsq22,rinv22);
1229
1230             /* EWALD ELECTROSTATICS */
1231
1232             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1233             ewrt             = _mm_mul_pd(r22,ewtabscale);
1234             ewitab           = _mm_cvttpd_epi32(ewrt);
1235 #ifdef __XOP__
1236             eweps            = _mm_frcz_pd(ewrt);
1237 #else
1238             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1239 #endif
1240             twoeweps         = _mm_add_pd(eweps,eweps);
1241             ewitab           = _mm_slli_epi32(ewitab,2);
1242             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1243             ewtabD           = _mm_setzero_pd();
1244             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1245             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1246             ewtabFn          = _mm_setzero_pd();
1247             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1248             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1249             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1250             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
1251             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1252
1253             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1254
1255             /* Update potential sum for this i atom from the interaction with this j atom. */
1256             velec            = _mm_and_pd(velec,cutoff_mask);
1257             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1258             velecsum         = _mm_add_pd(velecsum,velec);
1259
1260             fscal            = felec;
1261
1262             fscal            = _mm_and_pd(fscal,cutoff_mask);
1263
1264             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1265
1266             /* Update vectorial force */
1267             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1268             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1269             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1270             
1271             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1272             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1273             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1274
1275             }
1276
1277             /**************************
1278              * CALCULATE INTERACTIONS *
1279              **************************/
1280
1281             if (gmx_mm_any_lt(rsq23,rcutoff2))
1282             {
1283
1284             r23              = _mm_mul_pd(rsq23,rinv23);
1285
1286             /* EWALD ELECTROSTATICS */
1287
1288             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1289             ewrt             = _mm_mul_pd(r23,ewtabscale);
1290             ewitab           = _mm_cvttpd_epi32(ewrt);
1291 #ifdef __XOP__
1292             eweps            = _mm_frcz_pd(ewrt);
1293 #else
1294             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1295 #endif
1296             twoeweps         = _mm_add_pd(eweps,eweps);
1297             ewitab           = _mm_slli_epi32(ewitab,2);
1298             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1299             ewtabD           = _mm_setzero_pd();
1300             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1301             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1302             ewtabFn          = _mm_setzero_pd();
1303             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1304             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1305             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1306             velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
1307             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1308
1309             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
1310
1311             /* Update potential sum for this i atom from the interaction with this j atom. */
1312             velec            = _mm_and_pd(velec,cutoff_mask);
1313             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1314             velecsum         = _mm_add_pd(velecsum,velec);
1315
1316             fscal            = felec;
1317
1318             fscal            = _mm_and_pd(fscal,cutoff_mask);
1319
1320             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1321
1322             /* Update vectorial force */
1323             fix2             = _mm_macc_pd(dx23,fscal,fix2);
1324             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
1325             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
1326             
1327             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
1328             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
1329             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
1330
1331             }
1332
1333             /**************************
1334              * CALCULATE INTERACTIONS *
1335              **************************/
1336
1337             if (gmx_mm_any_lt(rsq31,rcutoff2))
1338             {
1339
1340             r31              = _mm_mul_pd(rsq31,rinv31);
1341
1342             /* EWALD ELECTROSTATICS */
1343
1344             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1345             ewrt             = _mm_mul_pd(r31,ewtabscale);
1346             ewitab           = _mm_cvttpd_epi32(ewrt);
1347 #ifdef __XOP__
1348             eweps            = _mm_frcz_pd(ewrt);
1349 #else
1350             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1351 #endif
1352             twoeweps         = _mm_add_pd(eweps,eweps);
1353             ewitab           = _mm_slli_epi32(ewitab,2);
1354             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1355             ewtabD           = _mm_setzero_pd();
1356             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1357             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1358             ewtabFn          = _mm_setzero_pd();
1359             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1360             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1361             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1362             velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
1363             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1364
1365             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
1366
1367             /* Update potential sum for this i atom from the interaction with this j atom. */
1368             velec            = _mm_and_pd(velec,cutoff_mask);
1369             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1370             velecsum         = _mm_add_pd(velecsum,velec);
1371
1372             fscal            = felec;
1373
1374             fscal            = _mm_and_pd(fscal,cutoff_mask);
1375
1376             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1377
1378             /* Update vectorial force */
1379             fix3             = _mm_macc_pd(dx31,fscal,fix3);
1380             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
1381             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
1382             
1383             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
1384             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
1385             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
1386
1387             }
1388
1389             /**************************
1390              * CALCULATE INTERACTIONS *
1391              **************************/
1392
1393             if (gmx_mm_any_lt(rsq32,rcutoff2))
1394             {
1395
1396             r32              = _mm_mul_pd(rsq32,rinv32);
1397
1398             /* EWALD ELECTROSTATICS */
1399
1400             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1401             ewrt             = _mm_mul_pd(r32,ewtabscale);
1402             ewitab           = _mm_cvttpd_epi32(ewrt);
1403 #ifdef __XOP__
1404             eweps            = _mm_frcz_pd(ewrt);
1405 #else
1406             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1407 #endif
1408             twoeweps         = _mm_add_pd(eweps,eweps);
1409             ewitab           = _mm_slli_epi32(ewitab,2);
1410             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1411             ewtabD           = _mm_setzero_pd();
1412             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1413             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1414             ewtabFn          = _mm_setzero_pd();
1415             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1416             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1417             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1418             velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
1419             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1420
1421             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
1422
1423             /* Update potential sum for this i atom from the interaction with this j atom. */
1424             velec            = _mm_and_pd(velec,cutoff_mask);
1425             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1426             velecsum         = _mm_add_pd(velecsum,velec);
1427
1428             fscal            = felec;
1429
1430             fscal            = _mm_and_pd(fscal,cutoff_mask);
1431
1432             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1433
1434             /* Update vectorial force */
1435             fix3             = _mm_macc_pd(dx32,fscal,fix3);
1436             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
1437             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
1438             
1439             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
1440             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
1441             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
1442
1443             }
1444
1445             /**************************
1446              * CALCULATE INTERACTIONS *
1447              **************************/
1448
1449             if (gmx_mm_any_lt(rsq33,rcutoff2))
1450             {
1451
1452             r33              = _mm_mul_pd(rsq33,rinv33);
1453
1454             /* EWALD ELECTROSTATICS */
1455
1456             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1457             ewrt             = _mm_mul_pd(r33,ewtabscale);
1458             ewitab           = _mm_cvttpd_epi32(ewrt);
1459 #ifdef __XOP__
1460             eweps            = _mm_frcz_pd(ewrt);
1461 #else
1462             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1463 #endif
1464             twoeweps         = _mm_add_pd(eweps,eweps);
1465             ewitab           = _mm_slli_epi32(ewitab,2);
1466             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1467             ewtabD           = _mm_setzero_pd();
1468             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1469             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1470             ewtabFn          = _mm_setzero_pd();
1471             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1472             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1473             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1474             velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
1475             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1476
1477             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
1478
1479             /* Update potential sum for this i atom from the interaction with this j atom. */
1480             velec            = _mm_and_pd(velec,cutoff_mask);
1481             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1482             velecsum         = _mm_add_pd(velecsum,velec);
1483
1484             fscal            = felec;
1485
1486             fscal            = _mm_and_pd(fscal,cutoff_mask);
1487
1488             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1489
1490             /* Update vectorial force */
1491             fix3             = _mm_macc_pd(dx33,fscal,fix3);
1492             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
1493             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
1494             
1495             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
1496             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
1497             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
1498
1499             }
1500
1501             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1502
1503             /* Inner loop uses 502 flops */
1504         }
1505
1506         /* End of innermost loop */
1507
1508         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1509                                               f+i_coord_offset,fshift+i_shift_offset);
1510
1511         ggid                        = gid[iidx];
1512         /* Update potential energies */
1513         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1514         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1515
1516         /* Increment number of inner iterations */
1517         inneriter                  += j_index_end - j_index_start;
1518
1519         /* Outer loop uses 26 flops */
1520     }
1521
1522     /* Increment number of outer iterations */
1523     outeriter        += nri;
1524
1525     /* Update outer/inner flops */
1526
1527     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*502);
1528 }
1529 /*
1530  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_128_fma_double
1531  * Electrostatics interaction: Ewald
1532  * VdW interaction:            LJEwald
1533  * Geometry:                   Water4-Water4
1534  * Calculate force/pot:        Force
1535  */
1536 void
1537 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_128_fma_double
1538                     (t_nblist                    * gmx_restrict       nlist,
1539                      rvec                        * gmx_restrict          xx,
1540                      rvec                        * gmx_restrict          ff,
1541                      t_forcerec                  * gmx_restrict          fr,
1542                      t_mdatoms                   * gmx_restrict     mdatoms,
1543                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1544                      t_nrnb                      * gmx_restrict        nrnb)
1545 {
1546     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1547      * just 0 for non-waters.
1548      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1549      * jnr indices corresponding to data put in the four positions in the SIMD register.
1550      */
1551     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1552     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1553     int              jnrA,jnrB;
1554     int              j_coord_offsetA,j_coord_offsetB;
1555     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1556     real             rcutoff_scalar;
1557     real             *shiftvec,*fshift,*x,*f;
1558     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1559     int              vdwioffset0;
1560     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1561     int              vdwioffset1;
1562     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1563     int              vdwioffset2;
1564     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1565     int              vdwioffset3;
1566     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1567     int              vdwjidx0A,vdwjidx0B;
1568     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1569     int              vdwjidx1A,vdwjidx1B;
1570     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1571     int              vdwjidx2A,vdwjidx2B;
1572     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1573     int              vdwjidx3A,vdwjidx3B;
1574     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1575     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1576     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1577     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1578     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1579     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1580     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1581     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1582     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1583     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1584     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1585     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1586     real             *charge;
1587     int              nvdwtype;
1588     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1589     int              *vdwtype;
1590     real             *vdwparam;
1591     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1592     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1593     __m128d           c6grid_00;
1594     __m128d           c6grid_11;
1595     __m128d           c6grid_12;
1596     __m128d           c6grid_13;
1597     __m128d           c6grid_21;
1598     __m128d           c6grid_22;
1599     __m128d           c6grid_23;
1600     __m128d           c6grid_31;
1601     __m128d           c6grid_32;
1602     __m128d           c6grid_33;
1603     real             *vdwgridparam;
1604     __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1605     __m128d           one_half  = _mm_set1_pd(0.5);
1606     __m128d           minus_one = _mm_set1_pd(-1.0);
1607     __m128i          ewitab;
1608     __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1609     real             *ewtab;
1610     __m128d          dummy_mask,cutoff_mask;
1611     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1612     __m128d          one     = _mm_set1_pd(1.0);
1613     __m128d          two     = _mm_set1_pd(2.0);
1614     x                = xx[0];
1615     f                = ff[0];
1616
1617     nri              = nlist->nri;
1618     iinr             = nlist->iinr;
1619     jindex           = nlist->jindex;
1620     jjnr             = nlist->jjnr;
1621     shiftidx         = nlist->shift;
1622     gid              = nlist->gid;
1623     shiftvec         = fr->shift_vec[0];
1624     fshift           = fr->fshift[0];
1625     facel            = _mm_set1_pd(fr->epsfac);
1626     charge           = mdatoms->chargeA;
1627     nvdwtype         = fr->ntype;
1628     vdwparam         = fr->nbfp;
1629     vdwtype          = mdatoms->typeA;
1630     vdwgridparam     = fr->ljpme_c6grid;
1631     sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
1632     ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
1633     ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
1634
1635     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
1636     ewtab            = fr->ic->tabq_coul_F;
1637     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
1638     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1639
1640     /* Setup water-specific parameters */
1641     inr              = nlist->iinr[0];
1642     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1643     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1644     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1645     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1646
1647     jq1              = _mm_set1_pd(charge[inr+1]);
1648     jq2              = _mm_set1_pd(charge[inr+2]);
1649     jq3              = _mm_set1_pd(charge[inr+3]);
1650     vdwjidx0A        = 2*vdwtype[inr+0];
1651     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1652     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1653     c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
1654     qq11             = _mm_mul_pd(iq1,jq1);
1655     qq12             = _mm_mul_pd(iq1,jq2);
1656     qq13             = _mm_mul_pd(iq1,jq3);
1657     qq21             = _mm_mul_pd(iq2,jq1);
1658     qq22             = _mm_mul_pd(iq2,jq2);
1659     qq23             = _mm_mul_pd(iq2,jq3);
1660     qq31             = _mm_mul_pd(iq3,jq1);
1661     qq32             = _mm_mul_pd(iq3,jq2);
1662     qq33             = _mm_mul_pd(iq3,jq3);
1663
1664     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1665     rcutoff_scalar   = fr->rcoulomb;
1666     rcutoff          = _mm_set1_pd(rcutoff_scalar);
1667     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
1668
1669     sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
1670     rvdw             = _mm_set1_pd(fr->rvdw);
1671
1672     /* Avoid stupid compiler warnings */
1673     jnrA = jnrB = 0;
1674     j_coord_offsetA = 0;
1675     j_coord_offsetB = 0;
1676
1677     outeriter        = 0;
1678     inneriter        = 0;
1679
1680     /* Start outer loop over neighborlists */
1681     for(iidx=0; iidx<nri; iidx++)
1682     {
1683         /* Load shift vector for this list */
1684         i_shift_offset   = DIM*shiftidx[iidx];
1685
1686         /* Load limits for loop over neighbors */
1687         j_index_start    = jindex[iidx];
1688         j_index_end      = jindex[iidx+1];
1689
1690         /* Get outer coordinate index */
1691         inr              = iinr[iidx];
1692         i_coord_offset   = DIM*inr;
1693
1694         /* Load i particle coords and add shift vector */
1695         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1696                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1697
1698         fix0             = _mm_setzero_pd();
1699         fiy0             = _mm_setzero_pd();
1700         fiz0             = _mm_setzero_pd();
1701         fix1             = _mm_setzero_pd();
1702         fiy1             = _mm_setzero_pd();
1703         fiz1             = _mm_setzero_pd();
1704         fix2             = _mm_setzero_pd();
1705         fiy2             = _mm_setzero_pd();
1706         fiz2             = _mm_setzero_pd();
1707         fix3             = _mm_setzero_pd();
1708         fiy3             = _mm_setzero_pd();
1709         fiz3             = _mm_setzero_pd();
1710
1711         /* Start inner kernel loop */
1712         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1713         {
1714
1715             /* Get j neighbor index, and coordinate index */
1716             jnrA             = jjnr[jidx];
1717             jnrB             = jjnr[jidx+1];
1718             j_coord_offsetA  = DIM*jnrA;
1719             j_coord_offsetB  = DIM*jnrB;
1720
1721             /* load j atom coordinates */
1722             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1723                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1724                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1725
1726             /* Calculate displacement vector */
1727             dx00             = _mm_sub_pd(ix0,jx0);
1728             dy00             = _mm_sub_pd(iy0,jy0);
1729             dz00             = _mm_sub_pd(iz0,jz0);
1730             dx11             = _mm_sub_pd(ix1,jx1);
1731             dy11             = _mm_sub_pd(iy1,jy1);
1732             dz11             = _mm_sub_pd(iz1,jz1);
1733             dx12             = _mm_sub_pd(ix1,jx2);
1734             dy12             = _mm_sub_pd(iy1,jy2);
1735             dz12             = _mm_sub_pd(iz1,jz2);
1736             dx13             = _mm_sub_pd(ix1,jx3);
1737             dy13             = _mm_sub_pd(iy1,jy3);
1738             dz13             = _mm_sub_pd(iz1,jz3);
1739             dx21             = _mm_sub_pd(ix2,jx1);
1740             dy21             = _mm_sub_pd(iy2,jy1);
1741             dz21             = _mm_sub_pd(iz2,jz1);
1742             dx22             = _mm_sub_pd(ix2,jx2);
1743             dy22             = _mm_sub_pd(iy2,jy2);
1744             dz22             = _mm_sub_pd(iz2,jz2);
1745             dx23             = _mm_sub_pd(ix2,jx3);
1746             dy23             = _mm_sub_pd(iy2,jy3);
1747             dz23             = _mm_sub_pd(iz2,jz3);
1748             dx31             = _mm_sub_pd(ix3,jx1);
1749             dy31             = _mm_sub_pd(iy3,jy1);
1750             dz31             = _mm_sub_pd(iz3,jz1);
1751             dx32             = _mm_sub_pd(ix3,jx2);
1752             dy32             = _mm_sub_pd(iy3,jy2);
1753             dz32             = _mm_sub_pd(iz3,jz2);
1754             dx33             = _mm_sub_pd(ix3,jx3);
1755             dy33             = _mm_sub_pd(iy3,jy3);
1756             dz33             = _mm_sub_pd(iz3,jz3);
1757
1758             /* Calculate squared distance and things based on it */
1759             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1760             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1761             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1762             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1763             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1764             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1765             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1766             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1767             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1768             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1769
1770             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1771             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1772             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1773             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1774             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1775             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1776             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1777             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1778             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1779             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1780
1781             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1782             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1783             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1784             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1785             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1786             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1787             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1788             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1789             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1790             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1791
1792             fjx0             = _mm_setzero_pd();
1793             fjy0             = _mm_setzero_pd();
1794             fjz0             = _mm_setzero_pd();
1795             fjx1             = _mm_setzero_pd();
1796             fjy1             = _mm_setzero_pd();
1797             fjz1             = _mm_setzero_pd();
1798             fjx2             = _mm_setzero_pd();
1799             fjy2             = _mm_setzero_pd();
1800             fjz2             = _mm_setzero_pd();
1801             fjx3             = _mm_setzero_pd();
1802             fjy3             = _mm_setzero_pd();
1803             fjz3             = _mm_setzero_pd();
1804
1805             /**************************
1806              * CALCULATE INTERACTIONS *
1807              **************************/
1808
1809             if (gmx_mm_any_lt(rsq00,rcutoff2))
1810             {
1811
1812             r00              = _mm_mul_pd(rsq00,rinv00);
1813
1814             /* Analytical LJ-PME */
1815             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1816             ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
1817             ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
1818             exponent         = gmx_simd_exp_d(ewcljrsq);
1819             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1820             poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
1821             /* f6A = 6 * C6grid * (1 - poly) */
1822             f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
1823             /* f6B = C6grid * exponent * beta^6 */
1824             f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
1825             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1826             fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1827
1828             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
1829
1830             fscal            = fvdw;
1831
1832             fscal            = _mm_and_pd(fscal,cutoff_mask);
1833
1834             /* Update vectorial force */
1835             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1836             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1837             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1838             
1839             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1840             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1841             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1842
1843             }
1844
1845             /**************************
1846              * CALCULATE INTERACTIONS *
1847              **************************/
1848
1849             if (gmx_mm_any_lt(rsq11,rcutoff2))
1850             {
1851
1852             r11              = _mm_mul_pd(rsq11,rinv11);
1853
1854             /* EWALD ELECTROSTATICS */
1855
1856             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1857             ewrt             = _mm_mul_pd(r11,ewtabscale);
1858             ewitab           = _mm_cvttpd_epi32(ewrt);
1859 #ifdef __XOP__
1860             eweps            = _mm_frcz_pd(ewrt);
1861 #else
1862             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1863 #endif
1864             twoeweps         = _mm_add_pd(eweps,eweps);
1865             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1866                                          &ewtabF,&ewtabFn);
1867             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1868             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1869
1870             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1871
1872             fscal            = felec;
1873
1874             fscal            = _mm_and_pd(fscal,cutoff_mask);
1875
1876             /* Update vectorial force */
1877             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1878             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1879             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1880             
1881             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1882             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1883             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1884
1885             }
1886
1887             /**************************
1888              * CALCULATE INTERACTIONS *
1889              **************************/
1890
1891             if (gmx_mm_any_lt(rsq12,rcutoff2))
1892             {
1893
1894             r12              = _mm_mul_pd(rsq12,rinv12);
1895
1896             /* EWALD ELECTROSTATICS */
1897
1898             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1899             ewrt             = _mm_mul_pd(r12,ewtabscale);
1900             ewitab           = _mm_cvttpd_epi32(ewrt);
1901 #ifdef __XOP__
1902             eweps            = _mm_frcz_pd(ewrt);
1903 #else
1904             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1905 #endif
1906             twoeweps         = _mm_add_pd(eweps,eweps);
1907             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1908                                          &ewtabF,&ewtabFn);
1909             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1910             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1911
1912             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1913
1914             fscal            = felec;
1915
1916             fscal            = _mm_and_pd(fscal,cutoff_mask);
1917
1918             /* Update vectorial force */
1919             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1920             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1921             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1922             
1923             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1924             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1925             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1926
1927             }
1928
1929             /**************************
1930              * CALCULATE INTERACTIONS *
1931              **************************/
1932
1933             if (gmx_mm_any_lt(rsq13,rcutoff2))
1934             {
1935
1936             r13              = _mm_mul_pd(rsq13,rinv13);
1937
1938             /* EWALD ELECTROSTATICS */
1939
1940             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1941             ewrt             = _mm_mul_pd(r13,ewtabscale);
1942             ewitab           = _mm_cvttpd_epi32(ewrt);
1943 #ifdef __XOP__
1944             eweps            = _mm_frcz_pd(ewrt);
1945 #else
1946             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1947 #endif
1948             twoeweps         = _mm_add_pd(eweps,eweps);
1949             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1950                                          &ewtabF,&ewtabFn);
1951             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1952             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1953
1954             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
1955
1956             fscal            = felec;
1957
1958             fscal            = _mm_and_pd(fscal,cutoff_mask);
1959
1960             /* Update vectorial force */
1961             fix1             = _mm_macc_pd(dx13,fscal,fix1);
1962             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
1963             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
1964             
1965             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
1966             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
1967             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
1968
1969             }
1970
1971             /**************************
1972              * CALCULATE INTERACTIONS *
1973              **************************/
1974
1975             if (gmx_mm_any_lt(rsq21,rcutoff2))
1976             {
1977
1978             r21              = _mm_mul_pd(rsq21,rinv21);
1979
1980             /* EWALD ELECTROSTATICS */
1981
1982             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1983             ewrt             = _mm_mul_pd(r21,ewtabscale);
1984             ewitab           = _mm_cvttpd_epi32(ewrt);
1985 #ifdef __XOP__
1986             eweps            = _mm_frcz_pd(ewrt);
1987 #else
1988             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1989 #endif
1990             twoeweps         = _mm_add_pd(eweps,eweps);
1991             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1992                                          &ewtabF,&ewtabFn);
1993             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1994             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1995
1996             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1997
1998             fscal            = felec;
1999
2000             fscal            = _mm_and_pd(fscal,cutoff_mask);
2001
2002             /* Update vectorial force */
2003             fix2             = _mm_macc_pd(dx21,fscal,fix2);
2004             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
2005             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
2006             
2007             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
2008             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
2009             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
2010
2011             }
2012
2013             /**************************
2014              * CALCULATE INTERACTIONS *
2015              **************************/
2016
2017             if (gmx_mm_any_lt(rsq22,rcutoff2))
2018             {
2019
2020             r22              = _mm_mul_pd(rsq22,rinv22);
2021
2022             /* EWALD ELECTROSTATICS */
2023
2024             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2025             ewrt             = _mm_mul_pd(r22,ewtabscale);
2026             ewitab           = _mm_cvttpd_epi32(ewrt);
2027 #ifdef __XOP__
2028             eweps            = _mm_frcz_pd(ewrt);
2029 #else
2030             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2031 #endif
2032             twoeweps         = _mm_add_pd(eweps,eweps);
2033             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
2034                                          &ewtabF,&ewtabFn);
2035             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2036             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2037
2038             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2039
2040             fscal            = felec;
2041
2042             fscal            = _mm_and_pd(fscal,cutoff_mask);
2043
2044             /* Update vectorial force */
2045             fix2             = _mm_macc_pd(dx22,fscal,fix2);
2046             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
2047             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
2048             
2049             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
2050             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
2051             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
2052
2053             }
2054
2055             /**************************
2056              * CALCULATE INTERACTIONS *
2057              **************************/
2058
2059             if (gmx_mm_any_lt(rsq23,rcutoff2))
2060             {
2061
2062             r23              = _mm_mul_pd(rsq23,rinv23);
2063
2064             /* EWALD ELECTROSTATICS */
2065
2066             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2067             ewrt             = _mm_mul_pd(r23,ewtabscale);
2068             ewitab           = _mm_cvttpd_epi32(ewrt);
2069 #ifdef __XOP__
2070             eweps            = _mm_frcz_pd(ewrt);
2071 #else
2072             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2073 #endif
2074             twoeweps         = _mm_add_pd(eweps,eweps);
2075             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
2076                                          &ewtabF,&ewtabFn);
2077             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2078             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2079
2080             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
2081
2082             fscal            = felec;
2083
2084             fscal            = _mm_and_pd(fscal,cutoff_mask);
2085
2086             /* Update vectorial force */
2087             fix2             = _mm_macc_pd(dx23,fscal,fix2);
2088             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
2089             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
2090             
2091             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
2092             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
2093             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
2094
2095             }
2096
2097             /**************************
2098              * CALCULATE INTERACTIONS *
2099              **************************/
2100
2101             if (gmx_mm_any_lt(rsq31,rcutoff2))
2102             {
2103
2104             r31              = _mm_mul_pd(rsq31,rinv31);
2105
2106             /* EWALD ELECTROSTATICS */
2107
2108             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2109             ewrt             = _mm_mul_pd(r31,ewtabscale);
2110             ewitab           = _mm_cvttpd_epi32(ewrt);
2111 #ifdef __XOP__
2112             eweps            = _mm_frcz_pd(ewrt);
2113 #else
2114             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2115 #endif
2116             twoeweps         = _mm_add_pd(eweps,eweps);
2117             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
2118                                          &ewtabF,&ewtabFn);
2119             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2120             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2121
2122             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
2123
2124             fscal            = felec;
2125
2126             fscal            = _mm_and_pd(fscal,cutoff_mask);
2127
2128             /* Update vectorial force */
2129             fix3             = _mm_macc_pd(dx31,fscal,fix3);
2130             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
2131             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
2132             
2133             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
2134             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
2135             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
2136
2137             }
2138
2139             /**************************
2140              * CALCULATE INTERACTIONS *
2141              **************************/
2142
2143             if (gmx_mm_any_lt(rsq32,rcutoff2))
2144             {
2145
2146             r32              = _mm_mul_pd(rsq32,rinv32);
2147
2148             /* EWALD ELECTROSTATICS */
2149
2150             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2151             ewrt             = _mm_mul_pd(r32,ewtabscale);
2152             ewitab           = _mm_cvttpd_epi32(ewrt);
2153 #ifdef __XOP__
2154             eweps            = _mm_frcz_pd(ewrt);
2155 #else
2156             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2157 #endif
2158             twoeweps         = _mm_add_pd(eweps,eweps);
2159             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
2160                                          &ewtabF,&ewtabFn);
2161             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2162             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2163
2164             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
2165
2166             fscal            = felec;
2167
2168             fscal            = _mm_and_pd(fscal,cutoff_mask);
2169
2170             /* Update vectorial force */
2171             fix3             = _mm_macc_pd(dx32,fscal,fix3);
2172             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
2173             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
2174             
2175             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
2176             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
2177             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
2178
2179             }
2180
2181             /**************************
2182              * CALCULATE INTERACTIONS *
2183              **************************/
2184
2185             if (gmx_mm_any_lt(rsq33,rcutoff2))
2186             {
2187
2188             r33              = _mm_mul_pd(rsq33,rinv33);
2189
2190             /* EWALD ELECTROSTATICS */
2191
2192             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2193             ewrt             = _mm_mul_pd(r33,ewtabscale);
2194             ewitab           = _mm_cvttpd_epi32(ewrt);
2195 #ifdef __XOP__
2196             eweps            = _mm_frcz_pd(ewrt);
2197 #else
2198             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2199 #endif
2200             twoeweps         = _mm_add_pd(eweps,eweps);
2201             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
2202                                          &ewtabF,&ewtabFn);
2203             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2204             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2205
2206             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
2207
2208             fscal            = felec;
2209
2210             fscal            = _mm_and_pd(fscal,cutoff_mask);
2211
2212             /* Update vectorial force */
2213             fix3             = _mm_macc_pd(dx33,fscal,fix3);
2214             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
2215             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
2216             
2217             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
2218             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
2219             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
2220
2221             }
2222
2223             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2224
2225             /* Inner loop uses 431 flops */
2226         }
2227
2228         if(jidx<j_index_end)
2229         {
2230
2231             jnrA             = jjnr[jidx];
2232             j_coord_offsetA  = DIM*jnrA;
2233
2234             /* load j atom coordinates */
2235             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2236                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2237                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2238
2239             /* Calculate displacement vector */
2240             dx00             = _mm_sub_pd(ix0,jx0);
2241             dy00             = _mm_sub_pd(iy0,jy0);
2242             dz00             = _mm_sub_pd(iz0,jz0);
2243             dx11             = _mm_sub_pd(ix1,jx1);
2244             dy11             = _mm_sub_pd(iy1,jy1);
2245             dz11             = _mm_sub_pd(iz1,jz1);
2246             dx12             = _mm_sub_pd(ix1,jx2);
2247             dy12             = _mm_sub_pd(iy1,jy2);
2248             dz12             = _mm_sub_pd(iz1,jz2);
2249             dx13             = _mm_sub_pd(ix1,jx3);
2250             dy13             = _mm_sub_pd(iy1,jy3);
2251             dz13             = _mm_sub_pd(iz1,jz3);
2252             dx21             = _mm_sub_pd(ix2,jx1);
2253             dy21             = _mm_sub_pd(iy2,jy1);
2254             dz21             = _mm_sub_pd(iz2,jz1);
2255             dx22             = _mm_sub_pd(ix2,jx2);
2256             dy22             = _mm_sub_pd(iy2,jy2);
2257             dz22             = _mm_sub_pd(iz2,jz2);
2258             dx23             = _mm_sub_pd(ix2,jx3);
2259             dy23             = _mm_sub_pd(iy2,jy3);
2260             dz23             = _mm_sub_pd(iz2,jz3);
2261             dx31             = _mm_sub_pd(ix3,jx1);
2262             dy31             = _mm_sub_pd(iy3,jy1);
2263             dz31             = _mm_sub_pd(iz3,jz1);
2264             dx32             = _mm_sub_pd(ix3,jx2);
2265             dy32             = _mm_sub_pd(iy3,jy2);
2266             dz32             = _mm_sub_pd(iz3,jz2);
2267             dx33             = _mm_sub_pd(ix3,jx3);
2268             dy33             = _mm_sub_pd(iy3,jy3);
2269             dz33             = _mm_sub_pd(iz3,jz3);
2270
2271             /* Calculate squared distance and things based on it */
2272             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2273             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2274             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2275             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2276             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2277             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2278             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2279             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2280             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2281             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2282
2283             rinv00           = gmx_mm_invsqrt_pd(rsq00);
2284             rinv11           = gmx_mm_invsqrt_pd(rsq11);
2285             rinv12           = gmx_mm_invsqrt_pd(rsq12);
2286             rinv13           = gmx_mm_invsqrt_pd(rsq13);
2287             rinv21           = gmx_mm_invsqrt_pd(rsq21);
2288             rinv22           = gmx_mm_invsqrt_pd(rsq22);
2289             rinv23           = gmx_mm_invsqrt_pd(rsq23);
2290             rinv31           = gmx_mm_invsqrt_pd(rsq31);
2291             rinv32           = gmx_mm_invsqrt_pd(rsq32);
2292             rinv33           = gmx_mm_invsqrt_pd(rsq33);
2293
2294             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
2295             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
2296             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
2297             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
2298             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
2299             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
2300             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
2301             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
2302             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
2303             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
2304
2305             fjx0             = _mm_setzero_pd();
2306             fjy0             = _mm_setzero_pd();
2307             fjz0             = _mm_setzero_pd();
2308             fjx1             = _mm_setzero_pd();
2309             fjy1             = _mm_setzero_pd();
2310             fjz1             = _mm_setzero_pd();
2311             fjx2             = _mm_setzero_pd();
2312             fjy2             = _mm_setzero_pd();
2313             fjz2             = _mm_setzero_pd();
2314             fjx3             = _mm_setzero_pd();
2315             fjy3             = _mm_setzero_pd();
2316             fjz3             = _mm_setzero_pd();
2317
2318             /**************************
2319              * CALCULATE INTERACTIONS *
2320              **************************/
2321
2322             if (gmx_mm_any_lt(rsq00,rcutoff2))
2323             {
2324
2325             r00              = _mm_mul_pd(rsq00,rinv00);
2326
2327             /* Analytical LJ-PME */
2328             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2329             ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
2330             ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
2331             exponent         = gmx_simd_exp_d(ewcljrsq);
2332             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2333             poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
2334             /* f6A = 6 * C6grid * (1 - poly) */
2335             f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
2336             /* f6B = C6grid * exponent * beta^6 */
2337             f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
2338             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2339             fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
2340
2341             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
2342
2343             fscal            = fvdw;
2344
2345             fscal            = _mm_and_pd(fscal,cutoff_mask);
2346
2347             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2348
2349             /* Update vectorial force */
2350             fix0             = _mm_macc_pd(dx00,fscal,fix0);
2351             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
2352             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
2353             
2354             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
2355             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
2356             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
2357
2358             }
2359
2360             /**************************
2361              * CALCULATE INTERACTIONS *
2362              **************************/
2363
2364             if (gmx_mm_any_lt(rsq11,rcutoff2))
2365             {
2366
2367             r11              = _mm_mul_pd(rsq11,rinv11);
2368
2369             /* EWALD ELECTROSTATICS */
2370
2371             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2372             ewrt             = _mm_mul_pd(r11,ewtabscale);
2373             ewitab           = _mm_cvttpd_epi32(ewrt);
2374 #ifdef __XOP__
2375             eweps            = _mm_frcz_pd(ewrt);
2376 #else
2377             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2378 #endif
2379             twoeweps         = _mm_add_pd(eweps,eweps);
2380             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2381             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2382             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2383
2384             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2385
2386             fscal            = felec;
2387
2388             fscal            = _mm_and_pd(fscal,cutoff_mask);
2389
2390             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2391
2392             /* Update vectorial force */
2393             fix1             = _mm_macc_pd(dx11,fscal,fix1);
2394             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
2395             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
2396             
2397             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
2398             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
2399             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
2400
2401             }
2402
2403             /**************************
2404              * CALCULATE INTERACTIONS *
2405              **************************/
2406
2407             if (gmx_mm_any_lt(rsq12,rcutoff2))
2408             {
2409
2410             r12              = _mm_mul_pd(rsq12,rinv12);
2411
2412             /* EWALD ELECTROSTATICS */
2413
2414             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2415             ewrt             = _mm_mul_pd(r12,ewtabscale);
2416             ewitab           = _mm_cvttpd_epi32(ewrt);
2417 #ifdef __XOP__
2418             eweps            = _mm_frcz_pd(ewrt);
2419 #else
2420             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2421 #endif
2422             twoeweps         = _mm_add_pd(eweps,eweps);
2423             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2424             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2425             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2426
2427             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2428
2429             fscal            = felec;
2430
2431             fscal            = _mm_and_pd(fscal,cutoff_mask);
2432
2433             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2434
2435             /* Update vectorial force */
2436             fix1             = _mm_macc_pd(dx12,fscal,fix1);
2437             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
2438             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
2439             
2440             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
2441             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
2442             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
2443
2444             }
2445
2446             /**************************
2447              * CALCULATE INTERACTIONS *
2448              **************************/
2449
2450             if (gmx_mm_any_lt(rsq13,rcutoff2))
2451             {
2452
2453             r13              = _mm_mul_pd(rsq13,rinv13);
2454
2455             /* EWALD ELECTROSTATICS */
2456
2457             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2458             ewrt             = _mm_mul_pd(r13,ewtabscale);
2459             ewitab           = _mm_cvttpd_epi32(ewrt);
2460 #ifdef __XOP__
2461             eweps            = _mm_frcz_pd(ewrt);
2462 #else
2463             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2464 #endif
2465             twoeweps         = _mm_add_pd(eweps,eweps);
2466             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2467             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2468             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2469
2470             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
2471
2472             fscal            = felec;
2473
2474             fscal            = _mm_and_pd(fscal,cutoff_mask);
2475
2476             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2477
2478             /* Update vectorial force */
2479             fix1             = _mm_macc_pd(dx13,fscal,fix1);
2480             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
2481             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
2482             
2483             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
2484             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
2485             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
2486
2487             }
2488
2489             /**************************
2490              * CALCULATE INTERACTIONS *
2491              **************************/
2492
2493             if (gmx_mm_any_lt(rsq21,rcutoff2))
2494             {
2495
2496             r21              = _mm_mul_pd(rsq21,rinv21);
2497
2498             /* EWALD ELECTROSTATICS */
2499
2500             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2501             ewrt             = _mm_mul_pd(r21,ewtabscale);
2502             ewitab           = _mm_cvttpd_epi32(ewrt);
2503 #ifdef __XOP__
2504             eweps            = _mm_frcz_pd(ewrt);
2505 #else
2506             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2507 #endif
2508             twoeweps         = _mm_add_pd(eweps,eweps);
2509             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2510             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2511             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2512
2513             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2514
2515             fscal            = felec;
2516
2517             fscal            = _mm_and_pd(fscal,cutoff_mask);
2518
2519             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2520
2521             /* Update vectorial force */
2522             fix2             = _mm_macc_pd(dx21,fscal,fix2);
2523             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
2524             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
2525             
2526             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
2527             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
2528             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
2529
2530             }
2531
2532             /**************************
2533              * CALCULATE INTERACTIONS *
2534              **************************/
2535
2536             if (gmx_mm_any_lt(rsq22,rcutoff2))
2537             {
2538
2539             r22              = _mm_mul_pd(rsq22,rinv22);
2540
2541             /* EWALD ELECTROSTATICS */
2542
2543             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2544             ewrt             = _mm_mul_pd(r22,ewtabscale);
2545             ewitab           = _mm_cvttpd_epi32(ewrt);
2546 #ifdef __XOP__
2547             eweps            = _mm_frcz_pd(ewrt);
2548 #else
2549             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2550 #endif
2551             twoeweps         = _mm_add_pd(eweps,eweps);
2552             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2553             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2554             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2555
2556             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2557
2558             fscal            = felec;
2559
2560             fscal            = _mm_and_pd(fscal,cutoff_mask);
2561
2562             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2563
2564             /* Update vectorial force */
2565             fix2             = _mm_macc_pd(dx22,fscal,fix2);
2566             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
2567             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
2568             
2569             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
2570             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
2571             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
2572
2573             }
2574
2575             /**************************
2576              * CALCULATE INTERACTIONS *
2577              **************************/
2578
2579             if (gmx_mm_any_lt(rsq23,rcutoff2))
2580             {
2581
2582             r23              = _mm_mul_pd(rsq23,rinv23);
2583
2584             /* EWALD ELECTROSTATICS */
2585
2586             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2587             ewrt             = _mm_mul_pd(r23,ewtabscale);
2588             ewitab           = _mm_cvttpd_epi32(ewrt);
2589 #ifdef __XOP__
2590             eweps            = _mm_frcz_pd(ewrt);
2591 #else
2592             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2593 #endif
2594             twoeweps         = _mm_add_pd(eweps,eweps);
2595             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2596             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2597             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2598
2599             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
2600
2601             fscal            = felec;
2602
2603             fscal            = _mm_and_pd(fscal,cutoff_mask);
2604
2605             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2606
2607             /* Update vectorial force */
2608             fix2             = _mm_macc_pd(dx23,fscal,fix2);
2609             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
2610             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
2611             
2612             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
2613             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
2614             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
2615
2616             }
2617
2618             /**************************
2619              * CALCULATE INTERACTIONS *
2620              **************************/
2621
2622             if (gmx_mm_any_lt(rsq31,rcutoff2))
2623             {
2624
2625             r31              = _mm_mul_pd(rsq31,rinv31);
2626
2627             /* EWALD ELECTROSTATICS */
2628
2629             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2630             ewrt             = _mm_mul_pd(r31,ewtabscale);
2631             ewitab           = _mm_cvttpd_epi32(ewrt);
2632 #ifdef __XOP__
2633             eweps            = _mm_frcz_pd(ewrt);
2634 #else
2635             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2636 #endif
2637             twoeweps         = _mm_add_pd(eweps,eweps);
2638             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2639             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2640             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2641
2642             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
2643
2644             fscal            = felec;
2645
2646             fscal            = _mm_and_pd(fscal,cutoff_mask);
2647
2648             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2649
2650             /* Update vectorial force */
2651             fix3             = _mm_macc_pd(dx31,fscal,fix3);
2652             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
2653             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
2654             
2655             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
2656             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
2657             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
2658
2659             }
2660
2661             /**************************
2662              * CALCULATE INTERACTIONS *
2663              **************************/
2664
2665             if (gmx_mm_any_lt(rsq32,rcutoff2))
2666             {
2667
2668             r32              = _mm_mul_pd(rsq32,rinv32);
2669
2670             /* EWALD ELECTROSTATICS */
2671
2672             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2673             ewrt             = _mm_mul_pd(r32,ewtabscale);
2674             ewitab           = _mm_cvttpd_epi32(ewrt);
2675 #ifdef __XOP__
2676             eweps            = _mm_frcz_pd(ewrt);
2677 #else
2678             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2679 #endif
2680             twoeweps         = _mm_add_pd(eweps,eweps);
2681             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2682             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2683             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2684
2685             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
2686
2687             fscal            = felec;
2688
2689             fscal            = _mm_and_pd(fscal,cutoff_mask);
2690
2691             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2692
2693             /* Update vectorial force */
2694             fix3             = _mm_macc_pd(dx32,fscal,fix3);
2695             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
2696             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
2697             
2698             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
2699             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
2700             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
2701
2702             }
2703
2704             /**************************
2705              * CALCULATE INTERACTIONS *
2706              **************************/
2707
2708             if (gmx_mm_any_lt(rsq33,rcutoff2))
2709             {
2710
2711             r33              = _mm_mul_pd(rsq33,rinv33);
2712
2713             /* EWALD ELECTROSTATICS */
2714
2715             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2716             ewrt             = _mm_mul_pd(r33,ewtabscale);
2717             ewitab           = _mm_cvttpd_epi32(ewrt);
2718 #ifdef __XOP__
2719             eweps            = _mm_frcz_pd(ewrt);
2720 #else
2721             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2722 #endif
2723             twoeweps         = _mm_add_pd(eweps,eweps);
2724             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2725             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2726             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2727
2728             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
2729
2730             fscal            = felec;
2731
2732             fscal            = _mm_and_pd(fscal,cutoff_mask);
2733
2734             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2735
2736             /* Update vectorial force */
2737             fix3             = _mm_macc_pd(dx33,fscal,fix3);
2738             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
2739             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
2740             
2741             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
2742             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
2743             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
2744
2745             }
2746
2747             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2748
2749             /* Inner loop uses 431 flops */
2750         }
2751
2752         /* End of innermost loop */
2753
2754         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2755                                               f+i_coord_offset,fshift+i_shift_offset);
2756
2757         /* Increment number of inner iterations */
2758         inneriter                  += j_index_end - j_index_start;
2759
2760         /* Outer loop uses 24 flops */
2761     }
2762
2763     /* Increment number of outer iterations */
2764     outeriter        += nri;
2765
2766     /* Update outer/inner flops */
2767
2768     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*431);
2769 }