Merge release-5-0 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_avx_128_fma_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
50 #include "kernelutil_x86_avx_128_fma_double.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_128_fma_double
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_128_fma_double
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB;
77     int              j_coord_offsetA,j_coord_offsetB;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwioffset3;
89     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90     int              vdwjidx0A,vdwjidx0B;
91     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92     int              vdwjidx1A,vdwjidx1B;
93     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B;
95     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     int              vdwjidx3A,vdwjidx3B;
97     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
109     real             *charge;
110     int              nvdwtype;
111     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112     int              *vdwtype;
113     real             *vdwparam;
114     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
115     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
116     __m128i          ewitab;
117     __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
118     real             *ewtab;
119     __m128d          dummy_mask,cutoff_mask;
120     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
121     __m128d          one     = _mm_set1_pd(1.0);
122     __m128d          two     = _mm_set1_pd(2.0);
123     x                = xx[0];
124     f                = ff[0];
125
126     nri              = nlist->nri;
127     iinr             = nlist->iinr;
128     jindex           = nlist->jindex;
129     jjnr             = nlist->jjnr;
130     shiftidx         = nlist->shift;
131     gid              = nlist->gid;
132     shiftvec         = fr->shift_vec[0];
133     fshift           = fr->fshift[0];
134     facel            = _mm_set1_pd(fr->epsfac);
135     charge           = mdatoms->chargeA;
136     nvdwtype         = fr->ntype;
137     vdwparam         = fr->nbfp;
138     vdwtype          = mdatoms->typeA;
139
140     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
141     ewtab            = fr->ic->tabq_coul_FDV0;
142     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
143     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
144
145     /* Setup water-specific parameters */
146     inr              = nlist->iinr[0];
147     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
148     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
149     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
150     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
151
152     jq1              = _mm_set1_pd(charge[inr+1]);
153     jq2              = _mm_set1_pd(charge[inr+2]);
154     jq3              = _mm_set1_pd(charge[inr+3]);
155     vdwjidx0A        = 2*vdwtype[inr+0];
156     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
157     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
158     qq11             = _mm_mul_pd(iq1,jq1);
159     qq12             = _mm_mul_pd(iq1,jq2);
160     qq13             = _mm_mul_pd(iq1,jq3);
161     qq21             = _mm_mul_pd(iq2,jq1);
162     qq22             = _mm_mul_pd(iq2,jq2);
163     qq23             = _mm_mul_pd(iq2,jq3);
164     qq31             = _mm_mul_pd(iq3,jq1);
165     qq32             = _mm_mul_pd(iq3,jq2);
166     qq33             = _mm_mul_pd(iq3,jq3);
167
168     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
169     rcutoff_scalar   = fr->rcoulomb;
170     rcutoff          = _mm_set1_pd(rcutoff_scalar);
171     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
172
173     sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
174     rvdw             = _mm_set1_pd(fr->rvdw);
175
176     /* Avoid stupid compiler warnings */
177     jnrA = jnrB = 0;
178     j_coord_offsetA = 0;
179     j_coord_offsetB = 0;
180
181     outeriter        = 0;
182     inneriter        = 0;
183
184     /* Start outer loop over neighborlists */
185     for(iidx=0; iidx<nri; iidx++)
186     {
187         /* Load shift vector for this list */
188         i_shift_offset   = DIM*shiftidx[iidx];
189
190         /* Load limits for loop over neighbors */
191         j_index_start    = jindex[iidx];
192         j_index_end      = jindex[iidx+1];
193
194         /* Get outer coordinate index */
195         inr              = iinr[iidx];
196         i_coord_offset   = DIM*inr;
197
198         /* Load i particle coords and add shift vector */
199         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
200                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
201
202         fix0             = _mm_setzero_pd();
203         fiy0             = _mm_setzero_pd();
204         fiz0             = _mm_setzero_pd();
205         fix1             = _mm_setzero_pd();
206         fiy1             = _mm_setzero_pd();
207         fiz1             = _mm_setzero_pd();
208         fix2             = _mm_setzero_pd();
209         fiy2             = _mm_setzero_pd();
210         fiz2             = _mm_setzero_pd();
211         fix3             = _mm_setzero_pd();
212         fiy3             = _mm_setzero_pd();
213         fiz3             = _mm_setzero_pd();
214
215         /* Reset potential sums */
216         velecsum         = _mm_setzero_pd();
217         vvdwsum          = _mm_setzero_pd();
218
219         /* Start inner kernel loop */
220         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
221         {
222
223             /* Get j neighbor index, and coordinate index */
224             jnrA             = jjnr[jidx];
225             jnrB             = jjnr[jidx+1];
226             j_coord_offsetA  = DIM*jnrA;
227             j_coord_offsetB  = DIM*jnrB;
228
229             /* load j atom coordinates */
230             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
231                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
232                                               &jy2,&jz2,&jx3,&jy3,&jz3);
233
234             /* Calculate displacement vector */
235             dx00             = _mm_sub_pd(ix0,jx0);
236             dy00             = _mm_sub_pd(iy0,jy0);
237             dz00             = _mm_sub_pd(iz0,jz0);
238             dx11             = _mm_sub_pd(ix1,jx1);
239             dy11             = _mm_sub_pd(iy1,jy1);
240             dz11             = _mm_sub_pd(iz1,jz1);
241             dx12             = _mm_sub_pd(ix1,jx2);
242             dy12             = _mm_sub_pd(iy1,jy2);
243             dz12             = _mm_sub_pd(iz1,jz2);
244             dx13             = _mm_sub_pd(ix1,jx3);
245             dy13             = _mm_sub_pd(iy1,jy3);
246             dz13             = _mm_sub_pd(iz1,jz3);
247             dx21             = _mm_sub_pd(ix2,jx1);
248             dy21             = _mm_sub_pd(iy2,jy1);
249             dz21             = _mm_sub_pd(iz2,jz1);
250             dx22             = _mm_sub_pd(ix2,jx2);
251             dy22             = _mm_sub_pd(iy2,jy2);
252             dz22             = _mm_sub_pd(iz2,jz2);
253             dx23             = _mm_sub_pd(ix2,jx3);
254             dy23             = _mm_sub_pd(iy2,jy3);
255             dz23             = _mm_sub_pd(iz2,jz3);
256             dx31             = _mm_sub_pd(ix3,jx1);
257             dy31             = _mm_sub_pd(iy3,jy1);
258             dz31             = _mm_sub_pd(iz3,jz1);
259             dx32             = _mm_sub_pd(ix3,jx2);
260             dy32             = _mm_sub_pd(iy3,jy2);
261             dz32             = _mm_sub_pd(iz3,jz2);
262             dx33             = _mm_sub_pd(ix3,jx3);
263             dy33             = _mm_sub_pd(iy3,jy3);
264             dz33             = _mm_sub_pd(iz3,jz3);
265
266             /* Calculate squared distance and things based on it */
267             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
268             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
269             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
270             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
271             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
272             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
273             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
274             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
275             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
276             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
277
278             rinv11           = gmx_mm_invsqrt_pd(rsq11);
279             rinv12           = gmx_mm_invsqrt_pd(rsq12);
280             rinv13           = gmx_mm_invsqrt_pd(rsq13);
281             rinv21           = gmx_mm_invsqrt_pd(rsq21);
282             rinv22           = gmx_mm_invsqrt_pd(rsq22);
283             rinv23           = gmx_mm_invsqrt_pd(rsq23);
284             rinv31           = gmx_mm_invsqrt_pd(rsq31);
285             rinv32           = gmx_mm_invsqrt_pd(rsq32);
286             rinv33           = gmx_mm_invsqrt_pd(rsq33);
287
288             rinvsq00         = gmx_mm_inv_pd(rsq00);
289             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
290             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
291             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
292             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
293             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
294             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
295             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
296             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
297             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
298
299             fjx0             = _mm_setzero_pd();
300             fjy0             = _mm_setzero_pd();
301             fjz0             = _mm_setzero_pd();
302             fjx1             = _mm_setzero_pd();
303             fjy1             = _mm_setzero_pd();
304             fjz1             = _mm_setzero_pd();
305             fjx2             = _mm_setzero_pd();
306             fjy2             = _mm_setzero_pd();
307             fjz2             = _mm_setzero_pd();
308             fjx3             = _mm_setzero_pd();
309             fjy3             = _mm_setzero_pd();
310             fjz3             = _mm_setzero_pd();
311
312             /**************************
313              * CALCULATE INTERACTIONS *
314              **************************/
315
316             if (gmx_mm_any_lt(rsq00,rcutoff2))
317             {
318
319             /* LENNARD-JONES DISPERSION/REPULSION */
320
321             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
322             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
323             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
324             vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
325                                            _mm_mul_pd(_mm_nmacc_pd( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
326             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
327
328             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
329
330             /* Update potential sum for this i atom from the interaction with this j atom. */
331             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
332             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
333
334             fscal            = fvdw;
335
336             fscal            = _mm_and_pd(fscal,cutoff_mask);
337
338             /* Update vectorial force */
339             fix0             = _mm_macc_pd(dx00,fscal,fix0);
340             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
341             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
342             
343             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
344             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
345             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
346
347             }
348
349             /**************************
350              * CALCULATE INTERACTIONS *
351              **************************/
352
353             if (gmx_mm_any_lt(rsq11,rcutoff2))
354             {
355
356             r11              = _mm_mul_pd(rsq11,rinv11);
357
358             /* EWALD ELECTROSTATICS */
359
360             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
361             ewrt             = _mm_mul_pd(r11,ewtabscale);
362             ewitab           = _mm_cvttpd_epi32(ewrt);
363 #ifdef __XOP__
364             eweps            = _mm_frcz_pd(ewrt);
365 #else
366             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
367 #endif
368             twoeweps         = _mm_add_pd(eweps,eweps);
369             ewitab           = _mm_slli_epi32(ewitab,2);
370             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
371             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
372             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
373             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
374             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
375             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
376             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
377             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
378             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
379             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
380
381             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
382
383             /* Update potential sum for this i atom from the interaction with this j atom. */
384             velec            = _mm_and_pd(velec,cutoff_mask);
385             velecsum         = _mm_add_pd(velecsum,velec);
386
387             fscal            = felec;
388
389             fscal            = _mm_and_pd(fscal,cutoff_mask);
390
391             /* Update vectorial force */
392             fix1             = _mm_macc_pd(dx11,fscal,fix1);
393             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
394             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
395             
396             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
397             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
398             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
399
400             }
401
402             /**************************
403              * CALCULATE INTERACTIONS *
404              **************************/
405
406             if (gmx_mm_any_lt(rsq12,rcutoff2))
407             {
408
409             r12              = _mm_mul_pd(rsq12,rinv12);
410
411             /* EWALD ELECTROSTATICS */
412
413             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
414             ewrt             = _mm_mul_pd(r12,ewtabscale);
415             ewitab           = _mm_cvttpd_epi32(ewrt);
416 #ifdef __XOP__
417             eweps            = _mm_frcz_pd(ewrt);
418 #else
419             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
420 #endif
421             twoeweps         = _mm_add_pd(eweps,eweps);
422             ewitab           = _mm_slli_epi32(ewitab,2);
423             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
424             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
425             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
426             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
427             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
428             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
429             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
430             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
431             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
432             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
433
434             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
435
436             /* Update potential sum for this i atom from the interaction with this j atom. */
437             velec            = _mm_and_pd(velec,cutoff_mask);
438             velecsum         = _mm_add_pd(velecsum,velec);
439
440             fscal            = felec;
441
442             fscal            = _mm_and_pd(fscal,cutoff_mask);
443
444             /* Update vectorial force */
445             fix1             = _mm_macc_pd(dx12,fscal,fix1);
446             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
447             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
448             
449             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
450             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
451             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
452
453             }
454
455             /**************************
456              * CALCULATE INTERACTIONS *
457              **************************/
458
459             if (gmx_mm_any_lt(rsq13,rcutoff2))
460             {
461
462             r13              = _mm_mul_pd(rsq13,rinv13);
463
464             /* EWALD ELECTROSTATICS */
465
466             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
467             ewrt             = _mm_mul_pd(r13,ewtabscale);
468             ewitab           = _mm_cvttpd_epi32(ewrt);
469 #ifdef __XOP__
470             eweps            = _mm_frcz_pd(ewrt);
471 #else
472             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
473 #endif
474             twoeweps         = _mm_add_pd(eweps,eweps);
475             ewitab           = _mm_slli_epi32(ewitab,2);
476             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
477             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
478             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
479             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
480             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
481             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
482             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
483             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
484             velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
485             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
486
487             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
488
489             /* Update potential sum for this i atom from the interaction with this j atom. */
490             velec            = _mm_and_pd(velec,cutoff_mask);
491             velecsum         = _mm_add_pd(velecsum,velec);
492
493             fscal            = felec;
494
495             fscal            = _mm_and_pd(fscal,cutoff_mask);
496
497             /* Update vectorial force */
498             fix1             = _mm_macc_pd(dx13,fscal,fix1);
499             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
500             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
501             
502             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
503             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
504             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
505
506             }
507
508             /**************************
509              * CALCULATE INTERACTIONS *
510              **************************/
511
512             if (gmx_mm_any_lt(rsq21,rcutoff2))
513             {
514
515             r21              = _mm_mul_pd(rsq21,rinv21);
516
517             /* EWALD ELECTROSTATICS */
518
519             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
520             ewrt             = _mm_mul_pd(r21,ewtabscale);
521             ewitab           = _mm_cvttpd_epi32(ewrt);
522 #ifdef __XOP__
523             eweps            = _mm_frcz_pd(ewrt);
524 #else
525             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
526 #endif
527             twoeweps         = _mm_add_pd(eweps,eweps);
528             ewitab           = _mm_slli_epi32(ewitab,2);
529             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
530             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
531             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
532             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
533             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
534             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
535             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
536             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
537             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
538             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
539
540             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
541
542             /* Update potential sum for this i atom from the interaction with this j atom. */
543             velec            = _mm_and_pd(velec,cutoff_mask);
544             velecsum         = _mm_add_pd(velecsum,velec);
545
546             fscal            = felec;
547
548             fscal            = _mm_and_pd(fscal,cutoff_mask);
549
550             /* Update vectorial force */
551             fix2             = _mm_macc_pd(dx21,fscal,fix2);
552             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
553             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
554             
555             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
556             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
557             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
558
559             }
560
561             /**************************
562              * CALCULATE INTERACTIONS *
563              **************************/
564
565             if (gmx_mm_any_lt(rsq22,rcutoff2))
566             {
567
568             r22              = _mm_mul_pd(rsq22,rinv22);
569
570             /* EWALD ELECTROSTATICS */
571
572             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
573             ewrt             = _mm_mul_pd(r22,ewtabscale);
574             ewitab           = _mm_cvttpd_epi32(ewrt);
575 #ifdef __XOP__
576             eweps            = _mm_frcz_pd(ewrt);
577 #else
578             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
579 #endif
580             twoeweps         = _mm_add_pd(eweps,eweps);
581             ewitab           = _mm_slli_epi32(ewitab,2);
582             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
583             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
584             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
585             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
586             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
587             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
588             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
589             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
590             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
591             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
592
593             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
594
595             /* Update potential sum for this i atom from the interaction with this j atom. */
596             velec            = _mm_and_pd(velec,cutoff_mask);
597             velecsum         = _mm_add_pd(velecsum,velec);
598
599             fscal            = felec;
600
601             fscal            = _mm_and_pd(fscal,cutoff_mask);
602
603             /* Update vectorial force */
604             fix2             = _mm_macc_pd(dx22,fscal,fix2);
605             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
606             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
607             
608             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
609             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
610             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
611
612             }
613
614             /**************************
615              * CALCULATE INTERACTIONS *
616              **************************/
617
618             if (gmx_mm_any_lt(rsq23,rcutoff2))
619             {
620
621             r23              = _mm_mul_pd(rsq23,rinv23);
622
623             /* EWALD ELECTROSTATICS */
624
625             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
626             ewrt             = _mm_mul_pd(r23,ewtabscale);
627             ewitab           = _mm_cvttpd_epi32(ewrt);
628 #ifdef __XOP__
629             eweps            = _mm_frcz_pd(ewrt);
630 #else
631             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
632 #endif
633             twoeweps         = _mm_add_pd(eweps,eweps);
634             ewitab           = _mm_slli_epi32(ewitab,2);
635             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
636             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
637             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
638             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
639             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
640             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
641             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
642             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
643             velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
644             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
645
646             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
647
648             /* Update potential sum for this i atom from the interaction with this j atom. */
649             velec            = _mm_and_pd(velec,cutoff_mask);
650             velecsum         = _mm_add_pd(velecsum,velec);
651
652             fscal            = felec;
653
654             fscal            = _mm_and_pd(fscal,cutoff_mask);
655
656             /* Update vectorial force */
657             fix2             = _mm_macc_pd(dx23,fscal,fix2);
658             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
659             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
660             
661             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
662             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
663             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
664
665             }
666
667             /**************************
668              * CALCULATE INTERACTIONS *
669              **************************/
670
671             if (gmx_mm_any_lt(rsq31,rcutoff2))
672             {
673
674             r31              = _mm_mul_pd(rsq31,rinv31);
675
676             /* EWALD ELECTROSTATICS */
677
678             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
679             ewrt             = _mm_mul_pd(r31,ewtabscale);
680             ewitab           = _mm_cvttpd_epi32(ewrt);
681 #ifdef __XOP__
682             eweps            = _mm_frcz_pd(ewrt);
683 #else
684             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
685 #endif
686             twoeweps         = _mm_add_pd(eweps,eweps);
687             ewitab           = _mm_slli_epi32(ewitab,2);
688             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
689             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
690             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
691             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
692             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
693             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
694             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
695             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
696             velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
697             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
698
699             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
700
701             /* Update potential sum for this i atom from the interaction with this j atom. */
702             velec            = _mm_and_pd(velec,cutoff_mask);
703             velecsum         = _mm_add_pd(velecsum,velec);
704
705             fscal            = felec;
706
707             fscal            = _mm_and_pd(fscal,cutoff_mask);
708
709             /* Update vectorial force */
710             fix3             = _mm_macc_pd(dx31,fscal,fix3);
711             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
712             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
713             
714             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
715             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
716             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
717
718             }
719
720             /**************************
721              * CALCULATE INTERACTIONS *
722              **************************/
723
724             if (gmx_mm_any_lt(rsq32,rcutoff2))
725             {
726
727             r32              = _mm_mul_pd(rsq32,rinv32);
728
729             /* EWALD ELECTROSTATICS */
730
731             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
732             ewrt             = _mm_mul_pd(r32,ewtabscale);
733             ewitab           = _mm_cvttpd_epi32(ewrt);
734 #ifdef __XOP__
735             eweps            = _mm_frcz_pd(ewrt);
736 #else
737             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
738 #endif
739             twoeweps         = _mm_add_pd(eweps,eweps);
740             ewitab           = _mm_slli_epi32(ewitab,2);
741             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
742             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
743             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
744             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
745             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
746             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
747             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
748             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
749             velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
750             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
751
752             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
753
754             /* Update potential sum for this i atom from the interaction with this j atom. */
755             velec            = _mm_and_pd(velec,cutoff_mask);
756             velecsum         = _mm_add_pd(velecsum,velec);
757
758             fscal            = felec;
759
760             fscal            = _mm_and_pd(fscal,cutoff_mask);
761
762             /* Update vectorial force */
763             fix3             = _mm_macc_pd(dx32,fscal,fix3);
764             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
765             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
766             
767             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
768             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
769             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
770
771             }
772
773             /**************************
774              * CALCULATE INTERACTIONS *
775              **************************/
776
777             if (gmx_mm_any_lt(rsq33,rcutoff2))
778             {
779
780             r33              = _mm_mul_pd(rsq33,rinv33);
781
782             /* EWALD ELECTROSTATICS */
783
784             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
785             ewrt             = _mm_mul_pd(r33,ewtabscale);
786             ewitab           = _mm_cvttpd_epi32(ewrt);
787 #ifdef __XOP__
788             eweps            = _mm_frcz_pd(ewrt);
789 #else
790             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
791 #endif
792             twoeweps         = _mm_add_pd(eweps,eweps);
793             ewitab           = _mm_slli_epi32(ewitab,2);
794             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
795             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
796             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
797             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
798             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
799             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
800             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
801             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
802             velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
803             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
804
805             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
806
807             /* Update potential sum for this i atom from the interaction with this j atom. */
808             velec            = _mm_and_pd(velec,cutoff_mask);
809             velecsum         = _mm_add_pd(velecsum,velec);
810
811             fscal            = felec;
812
813             fscal            = _mm_and_pd(fscal,cutoff_mask);
814
815             /* Update vectorial force */
816             fix3             = _mm_macc_pd(dx33,fscal,fix3);
817             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
818             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
819             
820             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
821             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
822             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
823
824             }
825
826             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
827
828             /* Inner loop uses 488 flops */
829         }
830
831         if(jidx<j_index_end)
832         {
833
834             jnrA             = jjnr[jidx];
835             j_coord_offsetA  = DIM*jnrA;
836
837             /* load j atom coordinates */
838             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
839                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
840                                               &jy2,&jz2,&jx3,&jy3,&jz3);
841
842             /* Calculate displacement vector */
843             dx00             = _mm_sub_pd(ix0,jx0);
844             dy00             = _mm_sub_pd(iy0,jy0);
845             dz00             = _mm_sub_pd(iz0,jz0);
846             dx11             = _mm_sub_pd(ix1,jx1);
847             dy11             = _mm_sub_pd(iy1,jy1);
848             dz11             = _mm_sub_pd(iz1,jz1);
849             dx12             = _mm_sub_pd(ix1,jx2);
850             dy12             = _mm_sub_pd(iy1,jy2);
851             dz12             = _mm_sub_pd(iz1,jz2);
852             dx13             = _mm_sub_pd(ix1,jx3);
853             dy13             = _mm_sub_pd(iy1,jy3);
854             dz13             = _mm_sub_pd(iz1,jz3);
855             dx21             = _mm_sub_pd(ix2,jx1);
856             dy21             = _mm_sub_pd(iy2,jy1);
857             dz21             = _mm_sub_pd(iz2,jz1);
858             dx22             = _mm_sub_pd(ix2,jx2);
859             dy22             = _mm_sub_pd(iy2,jy2);
860             dz22             = _mm_sub_pd(iz2,jz2);
861             dx23             = _mm_sub_pd(ix2,jx3);
862             dy23             = _mm_sub_pd(iy2,jy3);
863             dz23             = _mm_sub_pd(iz2,jz3);
864             dx31             = _mm_sub_pd(ix3,jx1);
865             dy31             = _mm_sub_pd(iy3,jy1);
866             dz31             = _mm_sub_pd(iz3,jz1);
867             dx32             = _mm_sub_pd(ix3,jx2);
868             dy32             = _mm_sub_pd(iy3,jy2);
869             dz32             = _mm_sub_pd(iz3,jz2);
870             dx33             = _mm_sub_pd(ix3,jx3);
871             dy33             = _mm_sub_pd(iy3,jy3);
872             dz33             = _mm_sub_pd(iz3,jz3);
873
874             /* Calculate squared distance and things based on it */
875             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
876             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
877             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
878             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
879             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
880             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
881             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
882             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
883             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
884             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
885
886             rinv11           = gmx_mm_invsqrt_pd(rsq11);
887             rinv12           = gmx_mm_invsqrt_pd(rsq12);
888             rinv13           = gmx_mm_invsqrt_pd(rsq13);
889             rinv21           = gmx_mm_invsqrt_pd(rsq21);
890             rinv22           = gmx_mm_invsqrt_pd(rsq22);
891             rinv23           = gmx_mm_invsqrt_pd(rsq23);
892             rinv31           = gmx_mm_invsqrt_pd(rsq31);
893             rinv32           = gmx_mm_invsqrt_pd(rsq32);
894             rinv33           = gmx_mm_invsqrt_pd(rsq33);
895
896             rinvsq00         = gmx_mm_inv_pd(rsq00);
897             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
898             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
899             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
900             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
901             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
902             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
903             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
904             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
905             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
906
907             fjx0             = _mm_setzero_pd();
908             fjy0             = _mm_setzero_pd();
909             fjz0             = _mm_setzero_pd();
910             fjx1             = _mm_setzero_pd();
911             fjy1             = _mm_setzero_pd();
912             fjz1             = _mm_setzero_pd();
913             fjx2             = _mm_setzero_pd();
914             fjy2             = _mm_setzero_pd();
915             fjz2             = _mm_setzero_pd();
916             fjx3             = _mm_setzero_pd();
917             fjy3             = _mm_setzero_pd();
918             fjz3             = _mm_setzero_pd();
919
920             /**************************
921              * CALCULATE INTERACTIONS *
922              **************************/
923
924             if (gmx_mm_any_lt(rsq00,rcutoff2))
925             {
926
927             /* LENNARD-JONES DISPERSION/REPULSION */
928
929             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
930             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
931             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
932             vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
933                                            _mm_mul_pd(_mm_nmacc_pd( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
934             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
935
936             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
937
938             /* Update potential sum for this i atom from the interaction with this j atom. */
939             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
940             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
941             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
942
943             fscal            = fvdw;
944
945             fscal            = _mm_and_pd(fscal,cutoff_mask);
946
947             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
948
949             /* Update vectorial force */
950             fix0             = _mm_macc_pd(dx00,fscal,fix0);
951             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
952             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
953             
954             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
955             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
956             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
957
958             }
959
960             /**************************
961              * CALCULATE INTERACTIONS *
962              **************************/
963
964             if (gmx_mm_any_lt(rsq11,rcutoff2))
965             {
966
967             r11              = _mm_mul_pd(rsq11,rinv11);
968
969             /* EWALD ELECTROSTATICS */
970
971             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
972             ewrt             = _mm_mul_pd(r11,ewtabscale);
973             ewitab           = _mm_cvttpd_epi32(ewrt);
974 #ifdef __XOP__
975             eweps            = _mm_frcz_pd(ewrt);
976 #else
977             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
978 #endif
979             twoeweps         = _mm_add_pd(eweps,eweps);
980             ewitab           = _mm_slli_epi32(ewitab,2);
981             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
982             ewtabD           = _mm_setzero_pd();
983             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
984             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
985             ewtabFn          = _mm_setzero_pd();
986             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
987             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
988             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
989             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
990             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
991
992             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
993
994             /* Update potential sum for this i atom from the interaction with this j atom. */
995             velec            = _mm_and_pd(velec,cutoff_mask);
996             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
997             velecsum         = _mm_add_pd(velecsum,velec);
998
999             fscal            = felec;
1000
1001             fscal            = _mm_and_pd(fscal,cutoff_mask);
1002
1003             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1004
1005             /* Update vectorial force */
1006             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1007             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1008             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1009             
1010             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1011             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1012             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1013
1014             }
1015
1016             /**************************
1017              * CALCULATE INTERACTIONS *
1018              **************************/
1019
1020             if (gmx_mm_any_lt(rsq12,rcutoff2))
1021             {
1022
1023             r12              = _mm_mul_pd(rsq12,rinv12);
1024
1025             /* EWALD ELECTROSTATICS */
1026
1027             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1028             ewrt             = _mm_mul_pd(r12,ewtabscale);
1029             ewitab           = _mm_cvttpd_epi32(ewrt);
1030 #ifdef __XOP__
1031             eweps            = _mm_frcz_pd(ewrt);
1032 #else
1033             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1034 #endif
1035             twoeweps         = _mm_add_pd(eweps,eweps);
1036             ewitab           = _mm_slli_epi32(ewitab,2);
1037             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1038             ewtabD           = _mm_setzero_pd();
1039             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1040             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1041             ewtabFn          = _mm_setzero_pd();
1042             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1043             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1044             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1045             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
1046             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1047
1048             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1049
1050             /* Update potential sum for this i atom from the interaction with this j atom. */
1051             velec            = _mm_and_pd(velec,cutoff_mask);
1052             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1053             velecsum         = _mm_add_pd(velecsum,velec);
1054
1055             fscal            = felec;
1056
1057             fscal            = _mm_and_pd(fscal,cutoff_mask);
1058
1059             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1060
1061             /* Update vectorial force */
1062             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1063             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1064             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1065             
1066             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1067             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1068             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1069
1070             }
1071
1072             /**************************
1073              * CALCULATE INTERACTIONS *
1074              **************************/
1075
1076             if (gmx_mm_any_lt(rsq13,rcutoff2))
1077             {
1078
1079             r13              = _mm_mul_pd(rsq13,rinv13);
1080
1081             /* EWALD ELECTROSTATICS */
1082
1083             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1084             ewrt             = _mm_mul_pd(r13,ewtabscale);
1085             ewitab           = _mm_cvttpd_epi32(ewrt);
1086 #ifdef __XOP__
1087             eweps            = _mm_frcz_pd(ewrt);
1088 #else
1089             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1090 #endif
1091             twoeweps         = _mm_add_pd(eweps,eweps);
1092             ewitab           = _mm_slli_epi32(ewitab,2);
1093             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1094             ewtabD           = _mm_setzero_pd();
1095             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1096             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1097             ewtabFn          = _mm_setzero_pd();
1098             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1099             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1100             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1101             velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
1102             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1103
1104             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
1105
1106             /* Update potential sum for this i atom from the interaction with this j atom. */
1107             velec            = _mm_and_pd(velec,cutoff_mask);
1108             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1109             velecsum         = _mm_add_pd(velecsum,velec);
1110
1111             fscal            = felec;
1112
1113             fscal            = _mm_and_pd(fscal,cutoff_mask);
1114
1115             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1116
1117             /* Update vectorial force */
1118             fix1             = _mm_macc_pd(dx13,fscal,fix1);
1119             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
1120             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
1121             
1122             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
1123             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
1124             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
1125
1126             }
1127
1128             /**************************
1129              * CALCULATE INTERACTIONS *
1130              **************************/
1131
1132             if (gmx_mm_any_lt(rsq21,rcutoff2))
1133             {
1134
1135             r21              = _mm_mul_pd(rsq21,rinv21);
1136
1137             /* EWALD ELECTROSTATICS */
1138
1139             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1140             ewrt             = _mm_mul_pd(r21,ewtabscale);
1141             ewitab           = _mm_cvttpd_epi32(ewrt);
1142 #ifdef __XOP__
1143             eweps            = _mm_frcz_pd(ewrt);
1144 #else
1145             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1146 #endif
1147             twoeweps         = _mm_add_pd(eweps,eweps);
1148             ewitab           = _mm_slli_epi32(ewitab,2);
1149             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1150             ewtabD           = _mm_setzero_pd();
1151             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1152             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1153             ewtabFn          = _mm_setzero_pd();
1154             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1155             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1156             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1157             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
1158             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1159
1160             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1161
1162             /* Update potential sum for this i atom from the interaction with this j atom. */
1163             velec            = _mm_and_pd(velec,cutoff_mask);
1164             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1165             velecsum         = _mm_add_pd(velecsum,velec);
1166
1167             fscal            = felec;
1168
1169             fscal            = _mm_and_pd(fscal,cutoff_mask);
1170
1171             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1172
1173             /* Update vectorial force */
1174             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1175             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1176             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1177             
1178             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1179             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1180             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1181
1182             }
1183
1184             /**************************
1185              * CALCULATE INTERACTIONS *
1186              **************************/
1187
1188             if (gmx_mm_any_lt(rsq22,rcutoff2))
1189             {
1190
1191             r22              = _mm_mul_pd(rsq22,rinv22);
1192
1193             /* EWALD ELECTROSTATICS */
1194
1195             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1196             ewrt             = _mm_mul_pd(r22,ewtabscale);
1197             ewitab           = _mm_cvttpd_epi32(ewrt);
1198 #ifdef __XOP__
1199             eweps            = _mm_frcz_pd(ewrt);
1200 #else
1201             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1202 #endif
1203             twoeweps         = _mm_add_pd(eweps,eweps);
1204             ewitab           = _mm_slli_epi32(ewitab,2);
1205             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1206             ewtabD           = _mm_setzero_pd();
1207             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1208             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1209             ewtabFn          = _mm_setzero_pd();
1210             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1211             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1212             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1213             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
1214             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1215
1216             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1217
1218             /* Update potential sum for this i atom from the interaction with this j atom. */
1219             velec            = _mm_and_pd(velec,cutoff_mask);
1220             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1221             velecsum         = _mm_add_pd(velecsum,velec);
1222
1223             fscal            = felec;
1224
1225             fscal            = _mm_and_pd(fscal,cutoff_mask);
1226
1227             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1228
1229             /* Update vectorial force */
1230             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1231             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1232             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1233             
1234             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1235             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1236             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1237
1238             }
1239
1240             /**************************
1241              * CALCULATE INTERACTIONS *
1242              **************************/
1243
1244             if (gmx_mm_any_lt(rsq23,rcutoff2))
1245             {
1246
1247             r23              = _mm_mul_pd(rsq23,rinv23);
1248
1249             /* EWALD ELECTROSTATICS */
1250
1251             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1252             ewrt             = _mm_mul_pd(r23,ewtabscale);
1253             ewitab           = _mm_cvttpd_epi32(ewrt);
1254 #ifdef __XOP__
1255             eweps            = _mm_frcz_pd(ewrt);
1256 #else
1257             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1258 #endif
1259             twoeweps         = _mm_add_pd(eweps,eweps);
1260             ewitab           = _mm_slli_epi32(ewitab,2);
1261             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1262             ewtabD           = _mm_setzero_pd();
1263             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1264             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1265             ewtabFn          = _mm_setzero_pd();
1266             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1267             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1268             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1269             velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
1270             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1271
1272             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
1273
1274             /* Update potential sum for this i atom from the interaction with this j atom. */
1275             velec            = _mm_and_pd(velec,cutoff_mask);
1276             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1277             velecsum         = _mm_add_pd(velecsum,velec);
1278
1279             fscal            = felec;
1280
1281             fscal            = _mm_and_pd(fscal,cutoff_mask);
1282
1283             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1284
1285             /* Update vectorial force */
1286             fix2             = _mm_macc_pd(dx23,fscal,fix2);
1287             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
1288             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
1289             
1290             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
1291             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
1292             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
1293
1294             }
1295
1296             /**************************
1297              * CALCULATE INTERACTIONS *
1298              **************************/
1299
1300             if (gmx_mm_any_lt(rsq31,rcutoff2))
1301             {
1302
1303             r31              = _mm_mul_pd(rsq31,rinv31);
1304
1305             /* EWALD ELECTROSTATICS */
1306
1307             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1308             ewrt             = _mm_mul_pd(r31,ewtabscale);
1309             ewitab           = _mm_cvttpd_epi32(ewrt);
1310 #ifdef __XOP__
1311             eweps            = _mm_frcz_pd(ewrt);
1312 #else
1313             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1314 #endif
1315             twoeweps         = _mm_add_pd(eweps,eweps);
1316             ewitab           = _mm_slli_epi32(ewitab,2);
1317             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1318             ewtabD           = _mm_setzero_pd();
1319             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1320             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1321             ewtabFn          = _mm_setzero_pd();
1322             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1323             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1324             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1325             velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
1326             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1327
1328             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
1329
1330             /* Update potential sum for this i atom from the interaction with this j atom. */
1331             velec            = _mm_and_pd(velec,cutoff_mask);
1332             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1333             velecsum         = _mm_add_pd(velecsum,velec);
1334
1335             fscal            = felec;
1336
1337             fscal            = _mm_and_pd(fscal,cutoff_mask);
1338
1339             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1340
1341             /* Update vectorial force */
1342             fix3             = _mm_macc_pd(dx31,fscal,fix3);
1343             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
1344             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
1345             
1346             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
1347             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
1348             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
1349
1350             }
1351
1352             /**************************
1353              * CALCULATE INTERACTIONS *
1354              **************************/
1355
1356             if (gmx_mm_any_lt(rsq32,rcutoff2))
1357             {
1358
1359             r32              = _mm_mul_pd(rsq32,rinv32);
1360
1361             /* EWALD ELECTROSTATICS */
1362
1363             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1364             ewrt             = _mm_mul_pd(r32,ewtabscale);
1365             ewitab           = _mm_cvttpd_epi32(ewrt);
1366 #ifdef __XOP__
1367             eweps            = _mm_frcz_pd(ewrt);
1368 #else
1369             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1370 #endif
1371             twoeweps         = _mm_add_pd(eweps,eweps);
1372             ewitab           = _mm_slli_epi32(ewitab,2);
1373             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1374             ewtabD           = _mm_setzero_pd();
1375             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1376             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1377             ewtabFn          = _mm_setzero_pd();
1378             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1379             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1380             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1381             velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
1382             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1383
1384             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
1385
1386             /* Update potential sum for this i atom from the interaction with this j atom. */
1387             velec            = _mm_and_pd(velec,cutoff_mask);
1388             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1389             velecsum         = _mm_add_pd(velecsum,velec);
1390
1391             fscal            = felec;
1392
1393             fscal            = _mm_and_pd(fscal,cutoff_mask);
1394
1395             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1396
1397             /* Update vectorial force */
1398             fix3             = _mm_macc_pd(dx32,fscal,fix3);
1399             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
1400             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
1401             
1402             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
1403             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
1404             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
1405
1406             }
1407
1408             /**************************
1409              * CALCULATE INTERACTIONS *
1410              **************************/
1411
1412             if (gmx_mm_any_lt(rsq33,rcutoff2))
1413             {
1414
1415             r33              = _mm_mul_pd(rsq33,rinv33);
1416
1417             /* EWALD ELECTROSTATICS */
1418
1419             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1420             ewrt             = _mm_mul_pd(r33,ewtabscale);
1421             ewitab           = _mm_cvttpd_epi32(ewrt);
1422 #ifdef __XOP__
1423             eweps            = _mm_frcz_pd(ewrt);
1424 #else
1425             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1426 #endif
1427             twoeweps         = _mm_add_pd(eweps,eweps);
1428             ewitab           = _mm_slli_epi32(ewitab,2);
1429             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1430             ewtabD           = _mm_setzero_pd();
1431             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1432             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1433             ewtabFn          = _mm_setzero_pd();
1434             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1435             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1436             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1437             velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
1438             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1439
1440             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
1441
1442             /* Update potential sum for this i atom from the interaction with this j atom. */
1443             velec            = _mm_and_pd(velec,cutoff_mask);
1444             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1445             velecsum         = _mm_add_pd(velecsum,velec);
1446
1447             fscal            = felec;
1448
1449             fscal            = _mm_and_pd(fscal,cutoff_mask);
1450
1451             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1452
1453             /* Update vectorial force */
1454             fix3             = _mm_macc_pd(dx33,fscal,fix3);
1455             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
1456             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
1457             
1458             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
1459             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
1460             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
1461
1462             }
1463
1464             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1465
1466             /* Inner loop uses 488 flops */
1467         }
1468
1469         /* End of innermost loop */
1470
1471         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1472                                               f+i_coord_offset,fshift+i_shift_offset);
1473
1474         ggid                        = gid[iidx];
1475         /* Update potential energies */
1476         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1477         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1478
1479         /* Increment number of inner iterations */
1480         inneriter                  += j_index_end - j_index_start;
1481
1482         /* Outer loop uses 26 flops */
1483     }
1484
1485     /* Increment number of outer iterations */
1486     outeriter        += nri;
1487
1488     /* Update outer/inner flops */
1489
1490     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*488);
1491 }
1492 /*
1493  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_128_fma_double
1494  * Electrostatics interaction: Ewald
1495  * VdW interaction:            LennardJones
1496  * Geometry:                   Water4-Water4
1497  * Calculate force/pot:        Force
1498  */
1499 void
1500 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_128_fma_double
1501                     (t_nblist                    * gmx_restrict       nlist,
1502                      rvec                        * gmx_restrict          xx,
1503                      rvec                        * gmx_restrict          ff,
1504                      t_forcerec                  * gmx_restrict          fr,
1505                      t_mdatoms                   * gmx_restrict     mdatoms,
1506                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1507                      t_nrnb                      * gmx_restrict        nrnb)
1508 {
1509     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1510      * just 0 for non-waters.
1511      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1512      * jnr indices corresponding to data put in the four positions in the SIMD register.
1513      */
1514     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1515     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1516     int              jnrA,jnrB;
1517     int              j_coord_offsetA,j_coord_offsetB;
1518     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1519     real             rcutoff_scalar;
1520     real             *shiftvec,*fshift,*x,*f;
1521     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1522     int              vdwioffset0;
1523     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1524     int              vdwioffset1;
1525     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1526     int              vdwioffset2;
1527     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1528     int              vdwioffset3;
1529     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1530     int              vdwjidx0A,vdwjidx0B;
1531     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1532     int              vdwjidx1A,vdwjidx1B;
1533     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1534     int              vdwjidx2A,vdwjidx2B;
1535     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1536     int              vdwjidx3A,vdwjidx3B;
1537     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1538     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1539     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1540     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1541     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1542     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1543     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1544     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1545     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1546     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1547     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1548     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1549     real             *charge;
1550     int              nvdwtype;
1551     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1552     int              *vdwtype;
1553     real             *vdwparam;
1554     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1555     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1556     __m128i          ewitab;
1557     __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1558     real             *ewtab;
1559     __m128d          dummy_mask,cutoff_mask;
1560     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1561     __m128d          one     = _mm_set1_pd(1.0);
1562     __m128d          two     = _mm_set1_pd(2.0);
1563     x                = xx[0];
1564     f                = ff[0];
1565
1566     nri              = nlist->nri;
1567     iinr             = nlist->iinr;
1568     jindex           = nlist->jindex;
1569     jjnr             = nlist->jjnr;
1570     shiftidx         = nlist->shift;
1571     gid              = nlist->gid;
1572     shiftvec         = fr->shift_vec[0];
1573     fshift           = fr->fshift[0];
1574     facel            = _mm_set1_pd(fr->epsfac);
1575     charge           = mdatoms->chargeA;
1576     nvdwtype         = fr->ntype;
1577     vdwparam         = fr->nbfp;
1578     vdwtype          = mdatoms->typeA;
1579
1580     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
1581     ewtab            = fr->ic->tabq_coul_F;
1582     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
1583     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1584
1585     /* Setup water-specific parameters */
1586     inr              = nlist->iinr[0];
1587     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1588     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1589     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1590     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1591
1592     jq1              = _mm_set1_pd(charge[inr+1]);
1593     jq2              = _mm_set1_pd(charge[inr+2]);
1594     jq3              = _mm_set1_pd(charge[inr+3]);
1595     vdwjidx0A        = 2*vdwtype[inr+0];
1596     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1597     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1598     qq11             = _mm_mul_pd(iq1,jq1);
1599     qq12             = _mm_mul_pd(iq1,jq2);
1600     qq13             = _mm_mul_pd(iq1,jq3);
1601     qq21             = _mm_mul_pd(iq2,jq1);
1602     qq22             = _mm_mul_pd(iq2,jq2);
1603     qq23             = _mm_mul_pd(iq2,jq3);
1604     qq31             = _mm_mul_pd(iq3,jq1);
1605     qq32             = _mm_mul_pd(iq3,jq2);
1606     qq33             = _mm_mul_pd(iq3,jq3);
1607
1608     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1609     rcutoff_scalar   = fr->rcoulomb;
1610     rcutoff          = _mm_set1_pd(rcutoff_scalar);
1611     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
1612
1613     sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
1614     rvdw             = _mm_set1_pd(fr->rvdw);
1615
1616     /* Avoid stupid compiler warnings */
1617     jnrA = jnrB = 0;
1618     j_coord_offsetA = 0;
1619     j_coord_offsetB = 0;
1620
1621     outeriter        = 0;
1622     inneriter        = 0;
1623
1624     /* Start outer loop over neighborlists */
1625     for(iidx=0; iidx<nri; iidx++)
1626     {
1627         /* Load shift vector for this list */
1628         i_shift_offset   = DIM*shiftidx[iidx];
1629
1630         /* Load limits for loop over neighbors */
1631         j_index_start    = jindex[iidx];
1632         j_index_end      = jindex[iidx+1];
1633
1634         /* Get outer coordinate index */
1635         inr              = iinr[iidx];
1636         i_coord_offset   = DIM*inr;
1637
1638         /* Load i particle coords and add shift vector */
1639         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1640                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1641
1642         fix0             = _mm_setzero_pd();
1643         fiy0             = _mm_setzero_pd();
1644         fiz0             = _mm_setzero_pd();
1645         fix1             = _mm_setzero_pd();
1646         fiy1             = _mm_setzero_pd();
1647         fiz1             = _mm_setzero_pd();
1648         fix2             = _mm_setzero_pd();
1649         fiy2             = _mm_setzero_pd();
1650         fiz2             = _mm_setzero_pd();
1651         fix3             = _mm_setzero_pd();
1652         fiy3             = _mm_setzero_pd();
1653         fiz3             = _mm_setzero_pd();
1654
1655         /* Start inner kernel loop */
1656         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1657         {
1658
1659             /* Get j neighbor index, and coordinate index */
1660             jnrA             = jjnr[jidx];
1661             jnrB             = jjnr[jidx+1];
1662             j_coord_offsetA  = DIM*jnrA;
1663             j_coord_offsetB  = DIM*jnrB;
1664
1665             /* load j atom coordinates */
1666             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1667                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1668                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1669
1670             /* Calculate displacement vector */
1671             dx00             = _mm_sub_pd(ix0,jx0);
1672             dy00             = _mm_sub_pd(iy0,jy0);
1673             dz00             = _mm_sub_pd(iz0,jz0);
1674             dx11             = _mm_sub_pd(ix1,jx1);
1675             dy11             = _mm_sub_pd(iy1,jy1);
1676             dz11             = _mm_sub_pd(iz1,jz1);
1677             dx12             = _mm_sub_pd(ix1,jx2);
1678             dy12             = _mm_sub_pd(iy1,jy2);
1679             dz12             = _mm_sub_pd(iz1,jz2);
1680             dx13             = _mm_sub_pd(ix1,jx3);
1681             dy13             = _mm_sub_pd(iy1,jy3);
1682             dz13             = _mm_sub_pd(iz1,jz3);
1683             dx21             = _mm_sub_pd(ix2,jx1);
1684             dy21             = _mm_sub_pd(iy2,jy1);
1685             dz21             = _mm_sub_pd(iz2,jz1);
1686             dx22             = _mm_sub_pd(ix2,jx2);
1687             dy22             = _mm_sub_pd(iy2,jy2);
1688             dz22             = _mm_sub_pd(iz2,jz2);
1689             dx23             = _mm_sub_pd(ix2,jx3);
1690             dy23             = _mm_sub_pd(iy2,jy3);
1691             dz23             = _mm_sub_pd(iz2,jz3);
1692             dx31             = _mm_sub_pd(ix3,jx1);
1693             dy31             = _mm_sub_pd(iy3,jy1);
1694             dz31             = _mm_sub_pd(iz3,jz1);
1695             dx32             = _mm_sub_pd(ix3,jx2);
1696             dy32             = _mm_sub_pd(iy3,jy2);
1697             dz32             = _mm_sub_pd(iz3,jz2);
1698             dx33             = _mm_sub_pd(ix3,jx3);
1699             dy33             = _mm_sub_pd(iy3,jy3);
1700             dz33             = _mm_sub_pd(iz3,jz3);
1701
1702             /* Calculate squared distance and things based on it */
1703             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1704             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1705             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1706             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1707             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1708             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1709             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1710             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1711             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1712             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1713
1714             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1715             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1716             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1717             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1718             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1719             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1720             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1721             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1722             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1723
1724             rinvsq00         = gmx_mm_inv_pd(rsq00);
1725             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1726             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1727             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1728             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1729             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1730             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1731             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1732             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1733             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1734
1735             fjx0             = _mm_setzero_pd();
1736             fjy0             = _mm_setzero_pd();
1737             fjz0             = _mm_setzero_pd();
1738             fjx1             = _mm_setzero_pd();
1739             fjy1             = _mm_setzero_pd();
1740             fjz1             = _mm_setzero_pd();
1741             fjx2             = _mm_setzero_pd();
1742             fjy2             = _mm_setzero_pd();
1743             fjz2             = _mm_setzero_pd();
1744             fjx3             = _mm_setzero_pd();
1745             fjy3             = _mm_setzero_pd();
1746             fjz3             = _mm_setzero_pd();
1747
1748             /**************************
1749              * CALCULATE INTERACTIONS *
1750              **************************/
1751
1752             if (gmx_mm_any_lt(rsq00,rcutoff2))
1753             {
1754
1755             /* LENNARD-JONES DISPERSION/REPULSION */
1756
1757             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1758             fvdw             = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1759
1760             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
1761
1762             fscal            = fvdw;
1763
1764             fscal            = _mm_and_pd(fscal,cutoff_mask);
1765
1766             /* Update vectorial force */
1767             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1768             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1769             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1770             
1771             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1772             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1773             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1774
1775             }
1776
1777             /**************************
1778              * CALCULATE INTERACTIONS *
1779              **************************/
1780
1781             if (gmx_mm_any_lt(rsq11,rcutoff2))
1782             {
1783
1784             r11              = _mm_mul_pd(rsq11,rinv11);
1785
1786             /* EWALD ELECTROSTATICS */
1787
1788             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1789             ewrt             = _mm_mul_pd(r11,ewtabscale);
1790             ewitab           = _mm_cvttpd_epi32(ewrt);
1791 #ifdef __XOP__
1792             eweps            = _mm_frcz_pd(ewrt);
1793 #else
1794             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1795 #endif
1796             twoeweps         = _mm_add_pd(eweps,eweps);
1797             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1798                                          &ewtabF,&ewtabFn);
1799             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1800             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1801
1802             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1803
1804             fscal            = felec;
1805
1806             fscal            = _mm_and_pd(fscal,cutoff_mask);
1807
1808             /* Update vectorial force */
1809             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1810             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1811             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1812             
1813             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1814             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1815             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1816
1817             }
1818
1819             /**************************
1820              * CALCULATE INTERACTIONS *
1821              **************************/
1822
1823             if (gmx_mm_any_lt(rsq12,rcutoff2))
1824             {
1825
1826             r12              = _mm_mul_pd(rsq12,rinv12);
1827
1828             /* EWALD ELECTROSTATICS */
1829
1830             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1831             ewrt             = _mm_mul_pd(r12,ewtabscale);
1832             ewitab           = _mm_cvttpd_epi32(ewrt);
1833 #ifdef __XOP__
1834             eweps            = _mm_frcz_pd(ewrt);
1835 #else
1836             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1837 #endif
1838             twoeweps         = _mm_add_pd(eweps,eweps);
1839             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1840                                          &ewtabF,&ewtabFn);
1841             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1842             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1843
1844             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1845
1846             fscal            = felec;
1847
1848             fscal            = _mm_and_pd(fscal,cutoff_mask);
1849
1850             /* Update vectorial force */
1851             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1852             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1853             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1854             
1855             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1856             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1857             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1858
1859             }
1860
1861             /**************************
1862              * CALCULATE INTERACTIONS *
1863              **************************/
1864
1865             if (gmx_mm_any_lt(rsq13,rcutoff2))
1866             {
1867
1868             r13              = _mm_mul_pd(rsq13,rinv13);
1869
1870             /* EWALD ELECTROSTATICS */
1871
1872             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1873             ewrt             = _mm_mul_pd(r13,ewtabscale);
1874             ewitab           = _mm_cvttpd_epi32(ewrt);
1875 #ifdef __XOP__
1876             eweps            = _mm_frcz_pd(ewrt);
1877 #else
1878             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1879 #endif
1880             twoeweps         = _mm_add_pd(eweps,eweps);
1881             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1882                                          &ewtabF,&ewtabFn);
1883             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1884             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1885
1886             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
1887
1888             fscal            = felec;
1889
1890             fscal            = _mm_and_pd(fscal,cutoff_mask);
1891
1892             /* Update vectorial force */
1893             fix1             = _mm_macc_pd(dx13,fscal,fix1);
1894             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
1895             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
1896             
1897             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
1898             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
1899             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
1900
1901             }
1902
1903             /**************************
1904              * CALCULATE INTERACTIONS *
1905              **************************/
1906
1907             if (gmx_mm_any_lt(rsq21,rcutoff2))
1908             {
1909
1910             r21              = _mm_mul_pd(rsq21,rinv21);
1911
1912             /* EWALD ELECTROSTATICS */
1913
1914             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1915             ewrt             = _mm_mul_pd(r21,ewtabscale);
1916             ewitab           = _mm_cvttpd_epi32(ewrt);
1917 #ifdef __XOP__
1918             eweps            = _mm_frcz_pd(ewrt);
1919 #else
1920             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1921 #endif
1922             twoeweps         = _mm_add_pd(eweps,eweps);
1923             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1924                                          &ewtabF,&ewtabFn);
1925             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1926             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1927
1928             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1929
1930             fscal            = felec;
1931
1932             fscal            = _mm_and_pd(fscal,cutoff_mask);
1933
1934             /* Update vectorial force */
1935             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1936             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1937             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1938             
1939             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1940             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1941             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1942
1943             }
1944
1945             /**************************
1946              * CALCULATE INTERACTIONS *
1947              **************************/
1948
1949             if (gmx_mm_any_lt(rsq22,rcutoff2))
1950             {
1951
1952             r22              = _mm_mul_pd(rsq22,rinv22);
1953
1954             /* EWALD ELECTROSTATICS */
1955
1956             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1957             ewrt             = _mm_mul_pd(r22,ewtabscale);
1958             ewitab           = _mm_cvttpd_epi32(ewrt);
1959 #ifdef __XOP__
1960             eweps            = _mm_frcz_pd(ewrt);
1961 #else
1962             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1963 #endif
1964             twoeweps         = _mm_add_pd(eweps,eweps);
1965             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1966                                          &ewtabF,&ewtabFn);
1967             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1968             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1969
1970             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1971
1972             fscal            = felec;
1973
1974             fscal            = _mm_and_pd(fscal,cutoff_mask);
1975
1976             /* Update vectorial force */
1977             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1978             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1979             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1980             
1981             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1982             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1983             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1984
1985             }
1986
1987             /**************************
1988              * CALCULATE INTERACTIONS *
1989              **************************/
1990
1991             if (gmx_mm_any_lt(rsq23,rcutoff2))
1992             {
1993
1994             r23              = _mm_mul_pd(rsq23,rinv23);
1995
1996             /* EWALD ELECTROSTATICS */
1997
1998             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1999             ewrt             = _mm_mul_pd(r23,ewtabscale);
2000             ewitab           = _mm_cvttpd_epi32(ewrt);
2001 #ifdef __XOP__
2002             eweps            = _mm_frcz_pd(ewrt);
2003 #else
2004             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2005 #endif
2006             twoeweps         = _mm_add_pd(eweps,eweps);
2007             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
2008                                          &ewtabF,&ewtabFn);
2009             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2010             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2011
2012             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
2013
2014             fscal            = felec;
2015
2016             fscal            = _mm_and_pd(fscal,cutoff_mask);
2017
2018             /* Update vectorial force */
2019             fix2             = _mm_macc_pd(dx23,fscal,fix2);
2020             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
2021             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
2022             
2023             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
2024             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
2025             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
2026
2027             }
2028
2029             /**************************
2030              * CALCULATE INTERACTIONS *
2031              **************************/
2032
2033             if (gmx_mm_any_lt(rsq31,rcutoff2))
2034             {
2035
2036             r31              = _mm_mul_pd(rsq31,rinv31);
2037
2038             /* EWALD ELECTROSTATICS */
2039
2040             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2041             ewrt             = _mm_mul_pd(r31,ewtabscale);
2042             ewitab           = _mm_cvttpd_epi32(ewrt);
2043 #ifdef __XOP__
2044             eweps            = _mm_frcz_pd(ewrt);
2045 #else
2046             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2047 #endif
2048             twoeweps         = _mm_add_pd(eweps,eweps);
2049             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
2050                                          &ewtabF,&ewtabFn);
2051             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2052             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2053
2054             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
2055
2056             fscal            = felec;
2057
2058             fscal            = _mm_and_pd(fscal,cutoff_mask);
2059
2060             /* Update vectorial force */
2061             fix3             = _mm_macc_pd(dx31,fscal,fix3);
2062             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
2063             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
2064             
2065             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
2066             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
2067             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
2068
2069             }
2070
2071             /**************************
2072              * CALCULATE INTERACTIONS *
2073              **************************/
2074
2075             if (gmx_mm_any_lt(rsq32,rcutoff2))
2076             {
2077
2078             r32              = _mm_mul_pd(rsq32,rinv32);
2079
2080             /* EWALD ELECTROSTATICS */
2081
2082             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2083             ewrt             = _mm_mul_pd(r32,ewtabscale);
2084             ewitab           = _mm_cvttpd_epi32(ewrt);
2085 #ifdef __XOP__
2086             eweps            = _mm_frcz_pd(ewrt);
2087 #else
2088             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2089 #endif
2090             twoeweps         = _mm_add_pd(eweps,eweps);
2091             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
2092                                          &ewtabF,&ewtabFn);
2093             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2094             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2095
2096             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
2097
2098             fscal            = felec;
2099
2100             fscal            = _mm_and_pd(fscal,cutoff_mask);
2101
2102             /* Update vectorial force */
2103             fix3             = _mm_macc_pd(dx32,fscal,fix3);
2104             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
2105             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
2106             
2107             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
2108             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
2109             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
2110
2111             }
2112
2113             /**************************
2114              * CALCULATE INTERACTIONS *
2115              **************************/
2116
2117             if (gmx_mm_any_lt(rsq33,rcutoff2))
2118             {
2119
2120             r33              = _mm_mul_pd(rsq33,rinv33);
2121
2122             /* EWALD ELECTROSTATICS */
2123
2124             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2125             ewrt             = _mm_mul_pd(r33,ewtabscale);
2126             ewitab           = _mm_cvttpd_epi32(ewrt);
2127 #ifdef __XOP__
2128             eweps            = _mm_frcz_pd(ewrt);
2129 #else
2130             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2131 #endif
2132             twoeweps         = _mm_add_pd(eweps,eweps);
2133             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
2134                                          &ewtabF,&ewtabFn);
2135             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2136             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2137
2138             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
2139
2140             fscal            = felec;
2141
2142             fscal            = _mm_and_pd(fscal,cutoff_mask);
2143
2144             /* Update vectorial force */
2145             fix3             = _mm_macc_pd(dx33,fscal,fix3);
2146             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
2147             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
2148             
2149             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
2150             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
2151             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
2152
2153             }
2154
2155             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2156
2157             /* Inner loop uses 414 flops */
2158         }
2159
2160         if(jidx<j_index_end)
2161         {
2162
2163             jnrA             = jjnr[jidx];
2164             j_coord_offsetA  = DIM*jnrA;
2165
2166             /* load j atom coordinates */
2167             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2168                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2169                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2170
2171             /* Calculate displacement vector */
2172             dx00             = _mm_sub_pd(ix0,jx0);
2173             dy00             = _mm_sub_pd(iy0,jy0);
2174             dz00             = _mm_sub_pd(iz0,jz0);
2175             dx11             = _mm_sub_pd(ix1,jx1);
2176             dy11             = _mm_sub_pd(iy1,jy1);
2177             dz11             = _mm_sub_pd(iz1,jz1);
2178             dx12             = _mm_sub_pd(ix1,jx2);
2179             dy12             = _mm_sub_pd(iy1,jy2);
2180             dz12             = _mm_sub_pd(iz1,jz2);
2181             dx13             = _mm_sub_pd(ix1,jx3);
2182             dy13             = _mm_sub_pd(iy1,jy3);
2183             dz13             = _mm_sub_pd(iz1,jz3);
2184             dx21             = _mm_sub_pd(ix2,jx1);
2185             dy21             = _mm_sub_pd(iy2,jy1);
2186             dz21             = _mm_sub_pd(iz2,jz1);
2187             dx22             = _mm_sub_pd(ix2,jx2);
2188             dy22             = _mm_sub_pd(iy2,jy2);
2189             dz22             = _mm_sub_pd(iz2,jz2);
2190             dx23             = _mm_sub_pd(ix2,jx3);
2191             dy23             = _mm_sub_pd(iy2,jy3);
2192             dz23             = _mm_sub_pd(iz2,jz3);
2193             dx31             = _mm_sub_pd(ix3,jx1);
2194             dy31             = _mm_sub_pd(iy3,jy1);
2195             dz31             = _mm_sub_pd(iz3,jz1);
2196             dx32             = _mm_sub_pd(ix3,jx2);
2197             dy32             = _mm_sub_pd(iy3,jy2);
2198             dz32             = _mm_sub_pd(iz3,jz2);
2199             dx33             = _mm_sub_pd(ix3,jx3);
2200             dy33             = _mm_sub_pd(iy3,jy3);
2201             dz33             = _mm_sub_pd(iz3,jz3);
2202
2203             /* Calculate squared distance and things based on it */
2204             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2205             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2206             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2207             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2208             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2209             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2210             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2211             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2212             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2213             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2214
2215             rinv11           = gmx_mm_invsqrt_pd(rsq11);
2216             rinv12           = gmx_mm_invsqrt_pd(rsq12);
2217             rinv13           = gmx_mm_invsqrt_pd(rsq13);
2218             rinv21           = gmx_mm_invsqrt_pd(rsq21);
2219             rinv22           = gmx_mm_invsqrt_pd(rsq22);
2220             rinv23           = gmx_mm_invsqrt_pd(rsq23);
2221             rinv31           = gmx_mm_invsqrt_pd(rsq31);
2222             rinv32           = gmx_mm_invsqrt_pd(rsq32);
2223             rinv33           = gmx_mm_invsqrt_pd(rsq33);
2224
2225             rinvsq00         = gmx_mm_inv_pd(rsq00);
2226             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
2227             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
2228             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
2229             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
2230             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
2231             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
2232             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
2233             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
2234             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
2235
2236             fjx0             = _mm_setzero_pd();
2237             fjy0             = _mm_setzero_pd();
2238             fjz0             = _mm_setzero_pd();
2239             fjx1             = _mm_setzero_pd();
2240             fjy1             = _mm_setzero_pd();
2241             fjz1             = _mm_setzero_pd();
2242             fjx2             = _mm_setzero_pd();
2243             fjy2             = _mm_setzero_pd();
2244             fjz2             = _mm_setzero_pd();
2245             fjx3             = _mm_setzero_pd();
2246             fjy3             = _mm_setzero_pd();
2247             fjz3             = _mm_setzero_pd();
2248
2249             /**************************
2250              * CALCULATE INTERACTIONS *
2251              **************************/
2252
2253             if (gmx_mm_any_lt(rsq00,rcutoff2))
2254             {
2255
2256             /* LENNARD-JONES DISPERSION/REPULSION */
2257
2258             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2259             fvdw             = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
2260
2261             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
2262
2263             fscal            = fvdw;
2264
2265             fscal            = _mm_and_pd(fscal,cutoff_mask);
2266
2267             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2268
2269             /* Update vectorial force */
2270             fix0             = _mm_macc_pd(dx00,fscal,fix0);
2271             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
2272             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
2273             
2274             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
2275             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
2276             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
2277
2278             }
2279
2280             /**************************
2281              * CALCULATE INTERACTIONS *
2282              **************************/
2283
2284             if (gmx_mm_any_lt(rsq11,rcutoff2))
2285             {
2286
2287             r11              = _mm_mul_pd(rsq11,rinv11);
2288
2289             /* EWALD ELECTROSTATICS */
2290
2291             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2292             ewrt             = _mm_mul_pd(r11,ewtabscale);
2293             ewitab           = _mm_cvttpd_epi32(ewrt);
2294 #ifdef __XOP__
2295             eweps            = _mm_frcz_pd(ewrt);
2296 #else
2297             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2298 #endif
2299             twoeweps         = _mm_add_pd(eweps,eweps);
2300             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2301             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2302             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2303
2304             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2305
2306             fscal            = felec;
2307
2308             fscal            = _mm_and_pd(fscal,cutoff_mask);
2309
2310             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2311
2312             /* Update vectorial force */
2313             fix1             = _mm_macc_pd(dx11,fscal,fix1);
2314             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
2315             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
2316             
2317             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
2318             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
2319             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
2320
2321             }
2322
2323             /**************************
2324              * CALCULATE INTERACTIONS *
2325              **************************/
2326
2327             if (gmx_mm_any_lt(rsq12,rcutoff2))
2328             {
2329
2330             r12              = _mm_mul_pd(rsq12,rinv12);
2331
2332             /* EWALD ELECTROSTATICS */
2333
2334             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2335             ewrt             = _mm_mul_pd(r12,ewtabscale);
2336             ewitab           = _mm_cvttpd_epi32(ewrt);
2337 #ifdef __XOP__
2338             eweps            = _mm_frcz_pd(ewrt);
2339 #else
2340             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2341 #endif
2342             twoeweps         = _mm_add_pd(eweps,eweps);
2343             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2344             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2345             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2346
2347             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2348
2349             fscal            = felec;
2350
2351             fscal            = _mm_and_pd(fscal,cutoff_mask);
2352
2353             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2354
2355             /* Update vectorial force */
2356             fix1             = _mm_macc_pd(dx12,fscal,fix1);
2357             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
2358             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
2359             
2360             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
2361             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
2362             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
2363
2364             }
2365
2366             /**************************
2367              * CALCULATE INTERACTIONS *
2368              **************************/
2369
2370             if (gmx_mm_any_lt(rsq13,rcutoff2))
2371             {
2372
2373             r13              = _mm_mul_pd(rsq13,rinv13);
2374
2375             /* EWALD ELECTROSTATICS */
2376
2377             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2378             ewrt             = _mm_mul_pd(r13,ewtabscale);
2379             ewitab           = _mm_cvttpd_epi32(ewrt);
2380 #ifdef __XOP__
2381             eweps            = _mm_frcz_pd(ewrt);
2382 #else
2383             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2384 #endif
2385             twoeweps         = _mm_add_pd(eweps,eweps);
2386             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2387             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2388             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2389
2390             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
2391
2392             fscal            = felec;
2393
2394             fscal            = _mm_and_pd(fscal,cutoff_mask);
2395
2396             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2397
2398             /* Update vectorial force */
2399             fix1             = _mm_macc_pd(dx13,fscal,fix1);
2400             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
2401             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
2402             
2403             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
2404             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
2405             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
2406
2407             }
2408
2409             /**************************
2410              * CALCULATE INTERACTIONS *
2411              **************************/
2412
2413             if (gmx_mm_any_lt(rsq21,rcutoff2))
2414             {
2415
2416             r21              = _mm_mul_pd(rsq21,rinv21);
2417
2418             /* EWALD ELECTROSTATICS */
2419
2420             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2421             ewrt             = _mm_mul_pd(r21,ewtabscale);
2422             ewitab           = _mm_cvttpd_epi32(ewrt);
2423 #ifdef __XOP__
2424             eweps            = _mm_frcz_pd(ewrt);
2425 #else
2426             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2427 #endif
2428             twoeweps         = _mm_add_pd(eweps,eweps);
2429             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2430             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2431             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2432
2433             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2434
2435             fscal            = felec;
2436
2437             fscal            = _mm_and_pd(fscal,cutoff_mask);
2438
2439             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2440
2441             /* Update vectorial force */
2442             fix2             = _mm_macc_pd(dx21,fscal,fix2);
2443             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
2444             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
2445             
2446             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
2447             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
2448             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
2449
2450             }
2451
2452             /**************************
2453              * CALCULATE INTERACTIONS *
2454              **************************/
2455
2456             if (gmx_mm_any_lt(rsq22,rcutoff2))
2457             {
2458
2459             r22              = _mm_mul_pd(rsq22,rinv22);
2460
2461             /* EWALD ELECTROSTATICS */
2462
2463             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2464             ewrt             = _mm_mul_pd(r22,ewtabscale);
2465             ewitab           = _mm_cvttpd_epi32(ewrt);
2466 #ifdef __XOP__
2467             eweps            = _mm_frcz_pd(ewrt);
2468 #else
2469             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2470 #endif
2471             twoeweps         = _mm_add_pd(eweps,eweps);
2472             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2473             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2474             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2475
2476             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2477
2478             fscal            = felec;
2479
2480             fscal            = _mm_and_pd(fscal,cutoff_mask);
2481
2482             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2483
2484             /* Update vectorial force */
2485             fix2             = _mm_macc_pd(dx22,fscal,fix2);
2486             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
2487             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
2488             
2489             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
2490             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
2491             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
2492
2493             }
2494
2495             /**************************
2496              * CALCULATE INTERACTIONS *
2497              **************************/
2498
2499             if (gmx_mm_any_lt(rsq23,rcutoff2))
2500             {
2501
2502             r23              = _mm_mul_pd(rsq23,rinv23);
2503
2504             /* EWALD ELECTROSTATICS */
2505
2506             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2507             ewrt             = _mm_mul_pd(r23,ewtabscale);
2508             ewitab           = _mm_cvttpd_epi32(ewrt);
2509 #ifdef __XOP__
2510             eweps            = _mm_frcz_pd(ewrt);
2511 #else
2512             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2513 #endif
2514             twoeweps         = _mm_add_pd(eweps,eweps);
2515             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2516             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2517             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2518
2519             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
2520
2521             fscal            = felec;
2522
2523             fscal            = _mm_and_pd(fscal,cutoff_mask);
2524
2525             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2526
2527             /* Update vectorial force */
2528             fix2             = _mm_macc_pd(dx23,fscal,fix2);
2529             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
2530             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
2531             
2532             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
2533             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
2534             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
2535
2536             }
2537
2538             /**************************
2539              * CALCULATE INTERACTIONS *
2540              **************************/
2541
2542             if (gmx_mm_any_lt(rsq31,rcutoff2))
2543             {
2544
2545             r31              = _mm_mul_pd(rsq31,rinv31);
2546
2547             /* EWALD ELECTROSTATICS */
2548
2549             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2550             ewrt             = _mm_mul_pd(r31,ewtabscale);
2551             ewitab           = _mm_cvttpd_epi32(ewrt);
2552 #ifdef __XOP__
2553             eweps            = _mm_frcz_pd(ewrt);
2554 #else
2555             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2556 #endif
2557             twoeweps         = _mm_add_pd(eweps,eweps);
2558             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2559             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2560             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2561
2562             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
2563
2564             fscal            = felec;
2565
2566             fscal            = _mm_and_pd(fscal,cutoff_mask);
2567
2568             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2569
2570             /* Update vectorial force */
2571             fix3             = _mm_macc_pd(dx31,fscal,fix3);
2572             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
2573             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
2574             
2575             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
2576             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
2577             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
2578
2579             }
2580
2581             /**************************
2582              * CALCULATE INTERACTIONS *
2583              **************************/
2584
2585             if (gmx_mm_any_lt(rsq32,rcutoff2))
2586             {
2587
2588             r32              = _mm_mul_pd(rsq32,rinv32);
2589
2590             /* EWALD ELECTROSTATICS */
2591
2592             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2593             ewrt             = _mm_mul_pd(r32,ewtabscale);
2594             ewitab           = _mm_cvttpd_epi32(ewrt);
2595 #ifdef __XOP__
2596             eweps            = _mm_frcz_pd(ewrt);
2597 #else
2598             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2599 #endif
2600             twoeweps         = _mm_add_pd(eweps,eweps);
2601             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2602             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2603             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2604
2605             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
2606
2607             fscal            = felec;
2608
2609             fscal            = _mm_and_pd(fscal,cutoff_mask);
2610
2611             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2612
2613             /* Update vectorial force */
2614             fix3             = _mm_macc_pd(dx32,fscal,fix3);
2615             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
2616             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
2617             
2618             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
2619             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
2620             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
2621
2622             }
2623
2624             /**************************
2625              * CALCULATE INTERACTIONS *
2626              **************************/
2627
2628             if (gmx_mm_any_lt(rsq33,rcutoff2))
2629             {
2630
2631             r33              = _mm_mul_pd(rsq33,rinv33);
2632
2633             /* EWALD ELECTROSTATICS */
2634
2635             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2636             ewrt             = _mm_mul_pd(r33,ewtabscale);
2637             ewitab           = _mm_cvttpd_epi32(ewrt);
2638 #ifdef __XOP__
2639             eweps            = _mm_frcz_pd(ewrt);
2640 #else
2641             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2642 #endif
2643             twoeweps         = _mm_add_pd(eweps,eweps);
2644             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2645             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2646             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2647
2648             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
2649
2650             fscal            = felec;
2651
2652             fscal            = _mm_and_pd(fscal,cutoff_mask);
2653
2654             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2655
2656             /* Update vectorial force */
2657             fix3             = _mm_macc_pd(dx33,fscal,fix3);
2658             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
2659             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
2660             
2661             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
2662             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
2663             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
2664
2665             }
2666
2667             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2668
2669             /* Inner loop uses 414 flops */
2670         }
2671
2672         /* End of innermost loop */
2673
2674         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2675                                               f+i_coord_offset,fshift+i_shift_offset);
2676
2677         /* Increment number of inner iterations */
2678         inneriter                  += j_index_end - j_index_start;
2679
2680         /* Outer loop uses 24 flops */
2681     }
2682
2683     /* Increment number of outer iterations */
2684     outeriter        += nri;
2685
2686     /* Update outer/inner flops */
2687
2688     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*414);
2689 }