made errors during GPU detection non-fatal
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_avx_128_fma_double.c
1 /*
2  * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_double
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB;
61     int              j_coord_offsetA,j_coord_offsetB;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwjidx0A,vdwjidx0B;
73     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74     int              vdwjidx1A,vdwjidx1B;
75     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76     int              vdwjidx2A,vdwjidx2B;
77     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
88     real             *charge;
89     int              nvdwtype;
90     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
91     int              *vdwtype;
92     real             *vdwparam;
93     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
94     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
95     __m128i          ewitab;
96     __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
97     real             *ewtab;
98     __m128d          dummy_mask,cutoff_mask;
99     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
100     __m128d          one     = _mm_set1_pd(1.0);
101     __m128d          two     = _mm_set1_pd(2.0);
102     x                = xx[0];
103     f                = ff[0];
104
105     nri              = nlist->nri;
106     iinr             = nlist->iinr;
107     jindex           = nlist->jindex;
108     jjnr             = nlist->jjnr;
109     shiftidx         = nlist->shift;
110     gid              = nlist->gid;
111     shiftvec         = fr->shift_vec[0];
112     fshift           = fr->fshift[0];
113     facel            = _mm_set1_pd(fr->epsfac);
114     charge           = mdatoms->chargeA;
115     nvdwtype         = fr->ntype;
116     vdwparam         = fr->nbfp;
117     vdwtype          = mdatoms->typeA;
118
119     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
120     ewtab            = fr->ic->tabq_coul_FDV0;
121     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
122     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
123
124     /* Setup water-specific parameters */
125     inr              = nlist->iinr[0];
126     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
127     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
128     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
129     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
130
131     jq0              = _mm_set1_pd(charge[inr+0]);
132     jq1              = _mm_set1_pd(charge[inr+1]);
133     jq2              = _mm_set1_pd(charge[inr+2]);
134     vdwjidx0A        = 2*vdwtype[inr+0];
135     qq00             = _mm_mul_pd(iq0,jq0);
136     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
137     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
138     qq01             = _mm_mul_pd(iq0,jq1);
139     qq02             = _mm_mul_pd(iq0,jq2);
140     qq10             = _mm_mul_pd(iq1,jq0);
141     qq11             = _mm_mul_pd(iq1,jq1);
142     qq12             = _mm_mul_pd(iq1,jq2);
143     qq20             = _mm_mul_pd(iq2,jq0);
144     qq21             = _mm_mul_pd(iq2,jq1);
145     qq22             = _mm_mul_pd(iq2,jq2);
146
147     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
148     rcutoff_scalar   = fr->rcoulomb;
149     rcutoff          = _mm_set1_pd(rcutoff_scalar);
150     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
151
152     sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
153     rvdw             = _mm_set1_pd(fr->rvdw);
154
155     /* Avoid stupid compiler warnings */
156     jnrA = jnrB = 0;
157     j_coord_offsetA = 0;
158     j_coord_offsetB = 0;
159
160     outeriter        = 0;
161     inneriter        = 0;
162
163     /* Start outer loop over neighborlists */
164     for(iidx=0; iidx<nri; iidx++)
165     {
166         /* Load shift vector for this list */
167         i_shift_offset   = DIM*shiftidx[iidx];
168
169         /* Load limits for loop over neighbors */
170         j_index_start    = jindex[iidx];
171         j_index_end      = jindex[iidx+1];
172
173         /* Get outer coordinate index */
174         inr              = iinr[iidx];
175         i_coord_offset   = DIM*inr;
176
177         /* Load i particle coords and add shift vector */
178         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
179                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
180
181         fix0             = _mm_setzero_pd();
182         fiy0             = _mm_setzero_pd();
183         fiz0             = _mm_setzero_pd();
184         fix1             = _mm_setzero_pd();
185         fiy1             = _mm_setzero_pd();
186         fiz1             = _mm_setzero_pd();
187         fix2             = _mm_setzero_pd();
188         fiy2             = _mm_setzero_pd();
189         fiz2             = _mm_setzero_pd();
190
191         /* Reset potential sums */
192         velecsum         = _mm_setzero_pd();
193         vvdwsum          = _mm_setzero_pd();
194
195         /* Start inner kernel loop */
196         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
197         {
198
199             /* Get j neighbor index, and coordinate index */
200             jnrA             = jjnr[jidx];
201             jnrB             = jjnr[jidx+1];
202             j_coord_offsetA  = DIM*jnrA;
203             j_coord_offsetB  = DIM*jnrB;
204
205             /* load j atom coordinates */
206             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
207                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
208
209             /* Calculate displacement vector */
210             dx00             = _mm_sub_pd(ix0,jx0);
211             dy00             = _mm_sub_pd(iy0,jy0);
212             dz00             = _mm_sub_pd(iz0,jz0);
213             dx01             = _mm_sub_pd(ix0,jx1);
214             dy01             = _mm_sub_pd(iy0,jy1);
215             dz01             = _mm_sub_pd(iz0,jz1);
216             dx02             = _mm_sub_pd(ix0,jx2);
217             dy02             = _mm_sub_pd(iy0,jy2);
218             dz02             = _mm_sub_pd(iz0,jz2);
219             dx10             = _mm_sub_pd(ix1,jx0);
220             dy10             = _mm_sub_pd(iy1,jy0);
221             dz10             = _mm_sub_pd(iz1,jz0);
222             dx11             = _mm_sub_pd(ix1,jx1);
223             dy11             = _mm_sub_pd(iy1,jy1);
224             dz11             = _mm_sub_pd(iz1,jz1);
225             dx12             = _mm_sub_pd(ix1,jx2);
226             dy12             = _mm_sub_pd(iy1,jy2);
227             dz12             = _mm_sub_pd(iz1,jz2);
228             dx20             = _mm_sub_pd(ix2,jx0);
229             dy20             = _mm_sub_pd(iy2,jy0);
230             dz20             = _mm_sub_pd(iz2,jz0);
231             dx21             = _mm_sub_pd(ix2,jx1);
232             dy21             = _mm_sub_pd(iy2,jy1);
233             dz21             = _mm_sub_pd(iz2,jz1);
234             dx22             = _mm_sub_pd(ix2,jx2);
235             dy22             = _mm_sub_pd(iy2,jy2);
236             dz22             = _mm_sub_pd(iz2,jz2);
237
238             /* Calculate squared distance and things based on it */
239             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
240             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
241             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
242             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
243             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
244             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
245             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
246             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
247             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
248
249             rinv00           = gmx_mm_invsqrt_pd(rsq00);
250             rinv01           = gmx_mm_invsqrt_pd(rsq01);
251             rinv02           = gmx_mm_invsqrt_pd(rsq02);
252             rinv10           = gmx_mm_invsqrt_pd(rsq10);
253             rinv11           = gmx_mm_invsqrt_pd(rsq11);
254             rinv12           = gmx_mm_invsqrt_pd(rsq12);
255             rinv20           = gmx_mm_invsqrt_pd(rsq20);
256             rinv21           = gmx_mm_invsqrt_pd(rsq21);
257             rinv22           = gmx_mm_invsqrt_pd(rsq22);
258
259             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
260             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
261             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
262             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
263             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
264             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
265             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
266             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
267             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
268
269             fjx0             = _mm_setzero_pd();
270             fjy0             = _mm_setzero_pd();
271             fjz0             = _mm_setzero_pd();
272             fjx1             = _mm_setzero_pd();
273             fjy1             = _mm_setzero_pd();
274             fjz1             = _mm_setzero_pd();
275             fjx2             = _mm_setzero_pd();
276             fjy2             = _mm_setzero_pd();
277             fjz2             = _mm_setzero_pd();
278
279             /**************************
280              * CALCULATE INTERACTIONS *
281              **************************/
282
283             if (gmx_mm_any_lt(rsq00,rcutoff2))
284             {
285
286             r00              = _mm_mul_pd(rsq00,rinv00);
287
288             /* EWALD ELECTROSTATICS */
289
290             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
291             ewrt             = _mm_mul_pd(r00,ewtabscale);
292             ewitab           = _mm_cvttpd_epi32(ewrt);
293 #ifdef __XOP__
294             eweps            = _mm_frcz_pd(ewrt);
295 #else
296             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
297 #endif
298             twoeweps         = _mm_add_pd(eweps,eweps);
299             ewitab           = _mm_slli_epi32(ewitab,2);
300             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
301             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
302             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
303             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
304             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
305             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
306             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
307             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
308             velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
309             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
310
311             /* LENNARD-JONES DISPERSION/REPULSION */
312
313             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
314             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
315             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
316             vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
317                                            _mm_mul_pd(_mm_nmacc_pd( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
318             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
319
320             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
321
322             /* Update potential sum for this i atom from the interaction with this j atom. */
323             velec            = _mm_and_pd(velec,cutoff_mask);
324             velecsum         = _mm_add_pd(velecsum,velec);
325             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
326             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
327
328             fscal            = _mm_add_pd(felec,fvdw);
329
330             fscal            = _mm_and_pd(fscal,cutoff_mask);
331
332             /* Update vectorial force */
333             fix0             = _mm_macc_pd(dx00,fscal,fix0);
334             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
335             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
336             
337             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
338             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
339             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
340
341             }
342
343             /**************************
344              * CALCULATE INTERACTIONS *
345              **************************/
346
347             if (gmx_mm_any_lt(rsq01,rcutoff2))
348             {
349
350             r01              = _mm_mul_pd(rsq01,rinv01);
351
352             /* EWALD ELECTROSTATICS */
353
354             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
355             ewrt             = _mm_mul_pd(r01,ewtabscale);
356             ewitab           = _mm_cvttpd_epi32(ewrt);
357 #ifdef __XOP__
358             eweps            = _mm_frcz_pd(ewrt);
359 #else
360             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
361 #endif
362             twoeweps         = _mm_add_pd(eweps,eweps);
363             ewitab           = _mm_slli_epi32(ewitab,2);
364             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
365             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
366             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
367             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
368             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
369             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
370             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
371             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
372             velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
373             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
374
375             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
376
377             /* Update potential sum for this i atom from the interaction with this j atom. */
378             velec            = _mm_and_pd(velec,cutoff_mask);
379             velecsum         = _mm_add_pd(velecsum,velec);
380
381             fscal            = felec;
382
383             fscal            = _mm_and_pd(fscal,cutoff_mask);
384
385             /* Update vectorial force */
386             fix0             = _mm_macc_pd(dx01,fscal,fix0);
387             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
388             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
389             
390             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
391             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
392             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
393
394             }
395
396             /**************************
397              * CALCULATE INTERACTIONS *
398              **************************/
399
400             if (gmx_mm_any_lt(rsq02,rcutoff2))
401             {
402
403             r02              = _mm_mul_pd(rsq02,rinv02);
404
405             /* EWALD ELECTROSTATICS */
406
407             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
408             ewrt             = _mm_mul_pd(r02,ewtabscale);
409             ewitab           = _mm_cvttpd_epi32(ewrt);
410 #ifdef __XOP__
411             eweps            = _mm_frcz_pd(ewrt);
412 #else
413             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
414 #endif
415             twoeweps         = _mm_add_pd(eweps,eweps);
416             ewitab           = _mm_slli_epi32(ewitab,2);
417             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
418             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
419             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
420             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
421             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
422             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
423             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
424             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
425             velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
426             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
427
428             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
429
430             /* Update potential sum for this i atom from the interaction with this j atom. */
431             velec            = _mm_and_pd(velec,cutoff_mask);
432             velecsum         = _mm_add_pd(velecsum,velec);
433
434             fscal            = felec;
435
436             fscal            = _mm_and_pd(fscal,cutoff_mask);
437
438             /* Update vectorial force */
439             fix0             = _mm_macc_pd(dx02,fscal,fix0);
440             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
441             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
442             
443             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
444             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
445             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
446
447             }
448
449             /**************************
450              * CALCULATE INTERACTIONS *
451              **************************/
452
453             if (gmx_mm_any_lt(rsq10,rcutoff2))
454             {
455
456             r10              = _mm_mul_pd(rsq10,rinv10);
457
458             /* EWALD ELECTROSTATICS */
459
460             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
461             ewrt             = _mm_mul_pd(r10,ewtabscale);
462             ewitab           = _mm_cvttpd_epi32(ewrt);
463 #ifdef __XOP__
464             eweps            = _mm_frcz_pd(ewrt);
465 #else
466             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
467 #endif
468             twoeweps         = _mm_add_pd(eweps,eweps);
469             ewitab           = _mm_slli_epi32(ewitab,2);
470             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
471             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
472             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
473             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
474             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
475             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
476             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
477             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
478             velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
479             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
480
481             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
482
483             /* Update potential sum for this i atom from the interaction with this j atom. */
484             velec            = _mm_and_pd(velec,cutoff_mask);
485             velecsum         = _mm_add_pd(velecsum,velec);
486
487             fscal            = felec;
488
489             fscal            = _mm_and_pd(fscal,cutoff_mask);
490
491             /* Update vectorial force */
492             fix1             = _mm_macc_pd(dx10,fscal,fix1);
493             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
494             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
495             
496             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
497             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
498             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
499
500             }
501
502             /**************************
503              * CALCULATE INTERACTIONS *
504              **************************/
505
506             if (gmx_mm_any_lt(rsq11,rcutoff2))
507             {
508
509             r11              = _mm_mul_pd(rsq11,rinv11);
510
511             /* EWALD ELECTROSTATICS */
512
513             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
514             ewrt             = _mm_mul_pd(r11,ewtabscale);
515             ewitab           = _mm_cvttpd_epi32(ewrt);
516 #ifdef __XOP__
517             eweps            = _mm_frcz_pd(ewrt);
518 #else
519             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
520 #endif
521             twoeweps         = _mm_add_pd(eweps,eweps);
522             ewitab           = _mm_slli_epi32(ewitab,2);
523             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
524             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
525             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
526             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
527             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
528             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
529             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
530             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
531             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
532             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
533
534             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
535
536             /* Update potential sum for this i atom from the interaction with this j atom. */
537             velec            = _mm_and_pd(velec,cutoff_mask);
538             velecsum         = _mm_add_pd(velecsum,velec);
539
540             fscal            = felec;
541
542             fscal            = _mm_and_pd(fscal,cutoff_mask);
543
544             /* Update vectorial force */
545             fix1             = _mm_macc_pd(dx11,fscal,fix1);
546             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
547             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
548             
549             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
550             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
551             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
552
553             }
554
555             /**************************
556              * CALCULATE INTERACTIONS *
557              **************************/
558
559             if (gmx_mm_any_lt(rsq12,rcutoff2))
560             {
561
562             r12              = _mm_mul_pd(rsq12,rinv12);
563
564             /* EWALD ELECTROSTATICS */
565
566             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
567             ewrt             = _mm_mul_pd(r12,ewtabscale);
568             ewitab           = _mm_cvttpd_epi32(ewrt);
569 #ifdef __XOP__
570             eweps            = _mm_frcz_pd(ewrt);
571 #else
572             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
573 #endif
574             twoeweps         = _mm_add_pd(eweps,eweps);
575             ewitab           = _mm_slli_epi32(ewitab,2);
576             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
577             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
578             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
579             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
580             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
581             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
582             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
583             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
584             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
585             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
586
587             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
588
589             /* Update potential sum for this i atom from the interaction with this j atom. */
590             velec            = _mm_and_pd(velec,cutoff_mask);
591             velecsum         = _mm_add_pd(velecsum,velec);
592
593             fscal            = felec;
594
595             fscal            = _mm_and_pd(fscal,cutoff_mask);
596
597             /* Update vectorial force */
598             fix1             = _mm_macc_pd(dx12,fscal,fix1);
599             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
600             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
601             
602             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
603             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
604             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
605
606             }
607
608             /**************************
609              * CALCULATE INTERACTIONS *
610              **************************/
611
612             if (gmx_mm_any_lt(rsq20,rcutoff2))
613             {
614
615             r20              = _mm_mul_pd(rsq20,rinv20);
616
617             /* EWALD ELECTROSTATICS */
618
619             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
620             ewrt             = _mm_mul_pd(r20,ewtabscale);
621             ewitab           = _mm_cvttpd_epi32(ewrt);
622 #ifdef __XOP__
623             eweps            = _mm_frcz_pd(ewrt);
624 #else
625             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
626 #endif
627             twoeweps         = _mm_add_pd(eweps,eweps);
628             ewitab           = _mm_slli_epi32(ewitab,2);
629             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
630             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
631             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
632             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
633             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
634             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
635             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
636             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
637             velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
638             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
639
640             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
641
642             /* Update potential sum for this i atom from the interaction with this j atom. */
643             velec            = _mm_and_pd(velec,cutoff_mask);
644             velecsum         = _mm_add_pd(velecsum,velec);
645
646             fscal            = felec;
647
648             fscal            = _mm_and_pd(fscal,cutoff_mask);
649
650             /* Update vectorial force */
651             fix2             = _mm_macc_pd(dx20,fscal,fix2);
652             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
653             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
654             
655             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
656             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
657             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
658
659             }
660
661             /**************************
662              * CALCULATE INTERACTIONS *
663              **************************/
664
665             if (gmx_mm_any_lt(rsq21,rcutoff2))
666             {
667
668             r21              = _mm_mul_pd(rsq21,rinv21);
669
670             /* EWALD ELECTROSTATICS */
671
672             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
673             ewrt             = _mm_mul_pd(r21,ewtabscale);
674             ewitab           = _mm_cvttpd_epi32(ewrt);
675 #ifdef __XOP__
676             eweps            = _mm_frcz_pd(ewrt);
677 #else
678             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
679 #endif
680             twoeweps         = _mm_add_pd(eweps,eweps);
681             ewitab           = _mm_slli_epi32(ewitab,2);
682             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
683             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
684             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
685             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
686             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
687             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
688             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
689             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
690             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
691             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
692
693             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
694
695             /* Update potential sum for this i atom from the interaction with this j atom. */
696             velec            = _mm_and_pd(velec,cutoff_mask);
697             velecsum         = _mm_add_pd(velecsum,velec);
698
699             fscal            = felec;
700
701             fscal            = _mm_and_pd(fscal,cutoff_mask);
702
703             /* Update vectorial force */
704             fix2             = _mm_macc_pd(dx21,fscal,fix2);
705             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
706             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
707             
708             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
709             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
710             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
711
712             }
713
714             /**************************
715              * CALCULATE INTERACTIONS *
716              **************************/
717
718             if (gmx_mm_any_lt(rsq22,rcutoff2))
719             {
720
721             r22              = _mm_mul_pd(rsq22,rinv22);
722
723             /* EWALD ELECTROSTATICS */
724
725             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
726             ewrt             = _mm_mul_pd(r22,ewtabscale);
727             ewitab           = _mm_cvttpd_epi32(ewrt);
728 #ifdef __XOP__
729             eweps            = _mm_frcz_pd(ewrt);
730 #else
731             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
732 #endif
733             twoeweps         = _mm_add_pd(eweps,eweps);
734             ewitab           = _mm_slli_epi32(ewitab,2);
735             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
736             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
737             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
738             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
739             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
740             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
741             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
742             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
743             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
744             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
745
746             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
747
748             /* Update potential sum for this i atom from the interaction with this j atom. */
749             velec            = _mm_and_pd(velec,cutoff_mask);
750             velecsum         = _mm_add_pd(velecsum,velec);
751
752             fscal            = felec;
753
754             fscal            = _mm_and_pd(fscal,cutoff_mask);
755
756             /* Update vectorial force */
757             fix2             = _mm_macc_pd(dx22,fscal,fix2);
758             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
759             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
760             
761             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
762             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
763             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
764
765             }
766
767             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
768
769             /* Inner loop uses 459 flops */
770         }
771
772         if(jidx<j_index_end)
773         {
774
775             jnrA             = jjnr[jidx];
776             j_coord_offsetA  = DIM*jnrA;
777
778             /* load j atom coordinates */
779             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
780                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
781
782             /* Calculate displacement vector */
783             dx00             = _mm_sub_pd(ix0,jx0);
784             dy00             = _mm_sub_pd(iy0,jy0);
785             dz00             = _mm_sub_pd(iz0,jz0);
786             dx01             = _mm_sub_pd(ix0,jx1);
787             dy01             = _mm_sub_pd(iy0,jy1);
788             dz01             = _mm_sub_pd(iz0,jz1);
789             dx02             = _mm_sub_pd(ix0,jx2);
790             dy02             = _mm_sub_pd(iy0,jy2);
791             dz02             = _mm_sub_pd(iz0,jz2);
792             dx10             = _mm_sub_pd(ix1,jx0);
793             dy10             = _mm_sub_pd(iy1,jy0);
794             dz10             = _mm_sub_pd(iz1,jz0);
795             dx11             = _mm_sub_pd(ix1,jx1);
796             dy11             = _mm_sub_pd(iy1,jy1);
797             dz11             = _mm_sub_pd(iz1,jz1);
798             dx12             = _mm_sub_pd(ix1,jx2);
799             dy12             = _mm_sub_pd(iy1,jy2);
800             dz12             = _mm_sub_pd(iz1,jz2);
801             dx20             = _mm_sub_pd(ix2,jx0);
802             dy20             = _mm_sub_pd(iy2,jy0);
803             dz20             = _mm_sub_pd(iz2,jz0);
804             dx21             = _mm_sub_pd(ix2,jx1);
805             dy21             = _mm_sub_pd(iy2,jy1);
806             dz21             = _mm_sub_pd(iz2,jz1);
807             dx22             = _mm_sub_pd(ix2,jx2);
808             dy22             = _mm_sub_pd(iy2,jy2);
809             dz22             = _mm_sub_pd(iz2,jz2);
810
811             /* Calculate squared distance and things based on it */
812             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
813             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
814             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
815             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
816             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
817             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
818             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
819             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
820             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
821
822             rinv00           = gmx_mm_invsqrt_pd(rsq00);
823             rinv01           = gmx_mm_invsqrt_pd(rsq01);
824             rinv02           = gmx_mm_invsqrt_pd(rsq02);
825             rinv10           = gmx_mm_invsqrt_pd(rsq10);
826             rinv11           = gmx_mm_invsqrt_pd(rsq11);
827             rinv12           = gmx_mm_invsqrt_pd(rsq12);
828             rinv20           = gmx_mm_invsqrt_pd(rsq20);
829             rinv21           = gmx_mm_invsqrt_pd(rsq21);
830             rinv22           = gmx_mm_invsqrt_pd(rsq22);
831
832             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
833             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
834             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
835             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
836             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
837             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
838             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
839             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
840             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
841
842             fjx0             = _mm_setzero_pd();
843             fjy0             = _mm_setzero_pd();
844             fjz0             = _mm_setzero_pd();
845             fjx1             = _mm_setzero_pd();
846             fjy1             = _mm_setzero_pd();
847             fjz1             = _mm_setzero_pd();
848             fjx2             = _mm_setzero_pd();
849             fjy2             = _mm_setzero_pd();
850             fjz2             = _mm_setzero_pd();
851
852             /**************************
853              * CALCULATE INTERACTIONS *
854              **************************/
855
856             if (gmx_mm_any_lt(rsq00,rcutoff2))
857             {
858
859             r00              = _mm_mul_pd(rsq00,rinv00);
860
861             /* EWALD ELECTROSTATICS */
862
863             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
864             ewrt             = _mm_mul_pd(r00,ewtabscale);
865             ewitab           = _mm_cvttpd_epi32(ewrt);
866 #ifdef __XOP__
867             eweps            = _mm_frcz_pd(ewrt);
868 #else
869             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
870 #endif
871             twoeweps         = _mm_add_pd(eweps,eweps);
872             ewitab           = _mm_slli_epi32(ewitab,2);
873             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
874             ewtabD           = _mm_setzero_pd();
875             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
876             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
877             ewtabFn          = _mm_setzero_pd();
878             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
879             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
880             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
881             velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
882             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
883
884             /* LENNARD-JONES DISPERSION/REPULSION */
885
886             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
887             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
888             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
889             vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
890                                            _mm_mul_pd(_mm_nmacc_pd( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
891             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
892
893             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
894
895             /* Update potential sum for this i atom from the interaction with this j atom. */
896             velec            = _mm_and_pd(velec,cutoff_mask);
897             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
898             velecsum         = _mm_add_pd(velecsum,velec);
899             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
900             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
901             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
902
903             fscal            = _mm_add_pd(felec,fvdw);
904
905             fscal            = _mm_and_pd(fscal,cutoff_mask);
906
907             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
908
909             /* Update vectorial force */
910             fix0             = _mm_macc_pd(dx00,fscal,fix0);
911             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
912             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
913             
914             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
915             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
916             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
917
918             }
919
920             /**************************
921              * CALCULATE INTERACTIONS *
922              **************************/
923
924             if (gmx_mm_any_lt(rsq01,rcutoff2))
925             {
926
927             r01              = _mm_mul_pd(rsq01,rinv01);
928
929             /* EWALD ELECTROSTATICS */
930
931             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
932             ewrt             = _mm_mul_pd(r01,ewtabscale);
933             ewitab           = _mm_cvttpd_epi32(ewrt);
934 #ifdef __XOP__
935             eweps            = _mm_frcz_pd(ewrt);
936 #else
937             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
938 #endif
939             twoeweps         = _mm_add_pd(eweps,eweps);
940             ewitab           = _mm_slli_epi32(ewitab,2);
941             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
942             ewtabD           = _mm_setzero_pd();
943             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
944             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
945             ewtabFn          = _mm_setzero_pd();
946             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
947             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
948             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
949             velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
950             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
951
952             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
953
954             /* Update potential sum for this i atom from the interaction with this j atom. */
955             velec            = _mm_and_pd(velec,cutoff_mask);
956             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
957             velecsum         = _mm_add_pd(velecsum,velec);
958
959             fscal            = felec;
960
961             fscal            = _mm_and_pd(fscal,cutoff_mask);
962
963             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
964
965             /* Update vectorial force */
966             fix0             = _mm_macc_pd(dx01,fscal,fix0);
967             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
968             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
969             
970             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
971             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
972             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
973
974             }
975
976             /**************************
977              * CALCULATE INTERACTIONS *
978              **************************/
979
980             if (gmx_mm_any_lt(rsq02,rcutoff2))
981             {
982
983             r02              = _mm_mul_pd(rsq02,rinv02);
984
985             /* EWALD ELECTROSTATICS */
986
987             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
988             ewrt             = _mm_mul_pd(r02,ewtabscale);
989             ewitab           = _mm_cvttpd_epi32(ewrt);
990 #ifdef __XOP__
991             eweps            = _mm_frcz_pd(ewrt);
992 #else
993             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
994 #endif
995             twoeweps         = _mm_add_pd(eweps,eweps);
996             ewitab           = _mm_slli_epi32(ewitab,2);
997             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
998             ewtabD           = _mm_setzero_pd();
999             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1000             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1001             ewtabFn          = _mm_setzero_pd();
1002             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1003             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1004             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1005             velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
1006             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1007
1008             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
1009
1010             /* Update potential sum for this i atom from the interaction with this j atom. */
1011             velec            = _mm_and_pd(velec,cutoff_mask);
1012             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1013             velecsum         = _mm_add_pd(velecsum,velec);
1014
1015             fscal            = felec;
1016
1017             fscal            = _mm_and_pd(fscal,cutoff_mask);
1018
1019             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1020
1021             /* Update vectorial force */
1022             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1023             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1024             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1025             
1026             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1027             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1028             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1029
1030             }
1031
1032             /**************************
1033              * CALCULATE INTERACTIONS *
1034              **************************/
1035
1036             if (gmx_mm_any_lt(rsq10,rcutoff2))
1037             {
1038
1039             r10              = _mm_mul_pd(rsq10,rinv10);
1040
1041             /* EWALD ELECTROSTATICS */
1042
1043             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1044             ewrt             = _mm_mul_pd(r10,ewtabscale);
1045             ewitab           = _mm_cvttpd_epi32(ewrt);
1046 #ifdef __XOP__
1047             eweps            = _mm_frcz_pd(ewrt);
1048 #else
1049             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1050 #endif
1051             twoeweps         = _mm_add_pd(eweps,eweps);
1052             ewitab           = _mm_slli_epi32(ewitab,2);
1053             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1054             ewtabD           = _mm_setzero_pd();
1055             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1056             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1057             ewtabFn          = _mm_setzero_pd();
1058             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1059             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1060             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1061             velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
1062             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1063
1064             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
1065
1066             /* Update potential sum for this i atom from the interaction with this j atom. */
1067             velec            = _mm_and_pd(velec,cutoff_mask);
1068             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1069             velecsum         = _mm_add_pd(velecsum,velec);
1070
1071             fscal            = felec;
1072
1073             fscal            = _mm_and_pd(fscal,cutoff_mask);
1074
1075             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1076
1077             /* Update vectorial force */
1078             fix1             = _mm_macc_pd(dx10,fscal,fix1);
1079             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
1080             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
1081             
1082             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
1083             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
1084             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
1085
1086             }
1087
1088             /**************************
1089              * CALCULATE INTERACTIONS *
1090              **************************/
1091
1092             if (gmx_mm_any_lt(rsq11,rcutoff2))
1093             {
1094
1095             r11              = _mm_mul_pd(rsq11,rinv11);
1096
1097             /* EWALD ELECTROSTATICS */
1098
1099             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1100             ewrt             = _mm_mul_pd(r11,ewtabscale);
1101             ewitab           = _mm_cvttpd_epi32(ewrt);
1102 #ifdef __XOP__
1103             eweps            = _mm_frcz_pd(ewrt);
1104 #else
1105             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1106 #endif
1107             twoeweps         = _mm_add_pd(eweps,eweps);
1108             ewitab           = _mm_slli_epi32(ewitab,2);
1109             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1110             ewtabD           = _mm_setzero_pd();
1111             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1112             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1113             ewtabFn          = _mm_setzero_pd();
1114             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1115             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1116             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1117             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
1118             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1119
1120             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1121
1122             /* Update potential sum for this i atom from the interaction with this j atom. */
1123             velec            = _mm_and_pd(velec,cutoff_mask);
1124             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1125             velecsum         = _mm_add_pd(velecsum,velec);
1126
1127             fscal            = felec;
1128
1129             fscal            = _mm_and_pd(fscal,cutoff_mask);
1130
1131             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1132
1133             /* Update vectorial force */
1134             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1135             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1136             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1137             
1138             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1139             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1140             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1141
1142             }
1143
1144             /**************************
1145              * CALCULATE INTERACTIONS *
1146              **************************/
1147
1148             if (gmx_mm_any_lt(rsq12,rcutoff2))
1149             {
1150
1151             r12              = _mm_mul_pd(rsq12,rinv12);
1152
1153             /* EWALD ELECTROSTATICS */
1154
1155             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1156             ewrt             = _mm_mul_pd(r12,ewtabscale);
1157             ewitab           = _mm_cvttpd_epi32(ewrt);
1158 #ifdef __XOP__
1159             eweps            = _mm_frcz_pd(ewrt);
1160 #else
1161             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1162 #endif
1163             twoeweps         = _mm_add_pd(eweps,eweps);
1164             ewitab           = _mm_slli_epi32(ewitab,2);
1165             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1166             ewtabD           = _mm_setzero_pd();
1167             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1168             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1169             ewtabFn          = _mm_setzero_pd();
1170             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1171             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1172             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1173             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
1174             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1175
1176             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1177
1178             /* Update potential sum for this i atom from the interaction with this j atom. */
1179             velec            = _mm_and_pd(velec,cutoff_mask);
1180             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1181             velecsum         = _mm_add_pd(velecsum,velec);
1182
1183             fscal            = felec;
1184
1185             fscal            = _mm_and_pd(fscal,cutoff_mask);
1186
1187             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1188
1189             /* Update vectorial force */
1190             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1191             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1192             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1193             
1194             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1195             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1196             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1197
1198             }
1199
1200             /**************************
1201              * CALCULATE INTERACTIONS *
1202              **************************/
1203
1204             if (gmx_mm_any_lt(rsq20,rcutoff2))
1205             {
1206
1207             r20              = _mm_mul_pd(rsq20,rinv20);
1208
1209             /* EWALD ELECTROSTATICS */
1210
1211             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1212             ewrt             = _mm_mul_pd(r20,ewtabscale);
1213             ewitab           = _mm_cvttpd_epi32(ewrt);
1214 #ifdef __XOP__
1215             eweps            = _mm_frcz_pd(ewrt);
1216 #else
1217             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1218 #endif
1219             twoeweps         = _mm_add_pd(eweps,eweps);
1220             ewitab           = _mm_slli_epi32(ewitab,2);
1221             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1222             ewtabD           = _mm_setzero_pd();
1223             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1224             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1225             ewtabFn          = _mm_setzero_pd();
1226             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1227             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1228             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1229             velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
1230             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1231
1232             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
1233
1234             /* Update potential sum for this i atom from the interaction with this j atom. */
1235             velec            = _mm_and_pd(velec,cutoff_mask);
1236             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1237             velecsum         = _mm_add_pd(velecsum,velec);
1238
1239             fscal            = felec;
1240
1241             fscal            = _mm_and_pd(fscal,cutoff_mask);
1242
1243             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1244
1245             /* Update vectorial force */
1246             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1247             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1248             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1249             
1250             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1251             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1252             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1253
1254             }
1255
1256             /**************************
1257              * CALCULATE INTERACTIONS *
1258              **************************/
1259
1260             if (gmx_mm_any_lt(rsq21,rcutoff2))
1261             {
1262
1263             r21              = _mm_mul_pd(rsq21,rinv21);
1264
1265             /* EWALD ELECTROSTATICS */
1266
1267             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1268             ewrt             = _mm_mul_pd(r21,ewtabscale);
1269             ewitab           = _mm_cvttpd_epi32(ewrt);
1270 #ifdef __XOP__
1271             eweps            = _mm_frcz_pd(ewrt);
1272 #else
1273             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1274 #endif
1275             twoeweps         = _mm_add_pd(eweps,eweps);
1276             ewitab           = _mm_slli_epi32(ewitab,2);
1277             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1278             ewtabD           = _mm_setzero_pd();
1279             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1280             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1281             ewtabFn          = _mm_setzero_pd();
1282             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1283             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1284             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1285             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
1286             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1287
1288             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1289
1290             /* Update potential sum for this i atom from the interaction with this j atom. */
1291             velec            = _mm_and_pd(velec,cutoff_mask);
1292             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1293             velecsum         = _mm_add_pd(velecsum,velec);
1294
1295             fscal            = felec;
1296
1297             fscal            = _mm_and_pd(fscal,cutoff_mask);
1298
1299             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1300
1301             /* Update vectorial force */
1302             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1303             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1304             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1305             
1306             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1307             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1308             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1309
1310             }
1311
1312             /**************************
1313              * CALCULATE INTERACTIONS *
1314              **************************/
1315
1316             if (gmx_mm_any_lt(rsq22,rcutoff2))
1317             {
1318
1319             r22              = _mm_mul_pd(rsq22,rinv22);
1320
1321             /* EWALD ELECTROSTATICS */
1322
1323             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1324             ewrt             = _mm_mul_pd(r22,ewtabscale);
1325             ewitab           = _mm_cvttpd_epi32(ewrt);
1326 #ifdef __XOP__
1327             eweps            = _mm_frcz_pd(ewrt);
1328 #else
1329             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1330 #endif
1331             twoeweps         = _mm_add_pd(eweps,eweps);
1332             ewitab           = _mm_slli_epi32(ewitab,2);
1333             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1334             ewtabD           = _mm_setzero_pd();
1335             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1336             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1337             ewtabFn          = _mm_setzero_pd();
1338             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1339             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1340             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1341             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
1342             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1343
1344             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1345
1346             /* Update potential sum for this i atom from the interaction with this j atom. */
1347             velec            = _mm_and_pd(velec,cutoff_mask);
1348             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1349             velecsum         = _mm_add_pd(velecsum,velec);
1350
1351             fscal            = felec;
1352
1353             fscal            = _mm_and_pd(fscal,cutoff_mask);
1354
1355             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1356
1357             /* Update vectorial force */
1358             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1359             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1360             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1361             
1362             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1363             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1364             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1365
1366             }
1367
1368             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1369
1370             /* Inner loop uses 459 flops */
1371         }
1372
1373         /* End of innermost loop */
1374
1375         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1376                                               f+i_coord_offset,fshift+i_shift_offset);
1377
1378         ggid                        = gid[iidx];
1379         /* Update potential energies */
1380         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1381         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1382
1383         /* Increment number of inner iterations */
1384         inneriter                  += j_index_end - j_index_start;
1385
1386         /* Outer loop uses 20 flops */
1387     }
1388
1389     /* Increment number of outer iterations */
1390     outeriter        += nri;
1391
1392     /* Update outer/inner flops */
1393
1394     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*459);
1395 }
1396 /*
1397  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_double
1398  * Electrostatics interaction: Ewald
1399  * VdW interaction:            LennardJones
1400  * Geometry:                   Water3-Water3
1401  * Calculate force/pot:        Force
1402  */
1403 void
1404 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_double
1405                     (t_nblist * gmx_restrict                nlist,
1406                      rvec * gmx_restrict                    xx,
1407                      rvec * gmx_restrict                    ff,
1408                      t_forcerec * gmx_restrict              fr,
1409                      t_mdatoms * gmx_restrict               mdatoms,
1410                      nb_kernel_data_t * gmx_restrict        kernel_data,
1411                      t_nrnb * gmx_restrict                  nrnb)
1412 {
1413     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1414      * just 0 for non-waters.
1415      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1416      * jnr indices corresponding to data put in the four positions in the SIMD register.
1417      */
1418     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1419     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1420     int              jnrA,jnrB;
1421     int              j_coord_offsetA,j_coord_offsetB;
1422     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1423     real             rcutoff_scalar;
1424     real             *shiftvec,*fshift,*x,*f;
1425     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1426     int              vdwioffset0;
1427     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1428     int              vdwioffset1;
1429     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1430     int              vdwioffset2;
1431     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1432     int              vdwjidx0A,vdwjidx0B;
1433     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1434     int              vdwjidx1A,vdwjidx1B;
1435     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1436     int              vdwjidx2A,vdwjidx2B;
1437     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1438     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1439     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1440     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1441     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1442     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1443     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1444     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1445     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1446     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1447     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1448     real             *charge;
1449     int              nvdwtype;
1450     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1451     int              *vdwtype;
1452     real             *vdwparam;
1453     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1454     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1455     __m128i          ewitab;
1456     __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1457     real             *ewtab;
1458     __m128d          dummy_mask,cutoff_mask;
1459     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1460     __m128d          one     = _mm_set1_pd(1.0);
1461     __m128d          two     = _mm_set1_pd(2.0);
1462     x                = xx[0];
1463     f                = ff[0];
1464
1465     nri              = nlist->nri;
1466     iinr             = nlist->iinr;
1467     jindex           = nlist->jindex;
1468     jjnr             = nlist->jjnr;
1469     shiftidx         = nlist->shift;
1470     gid              = nlist->gid;
1471     shiftvec         = fr->shift_vec[0];
1472     fshift           = fr->fshift[0];
1473     facel            = _mm_set1_pd(fr->epsfac);
1474     charge           = mdatoms->chargeA;
1475     nvdwtype         = fr->ntype;
1476     vdwparam         = fr->nbfp;
1477     vdwtype          = mdatoms->typeA;
1478
1479     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
1480     ewtab            = fr->ic->tabq_coul_F;
1481     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
1482     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1483
1484     /* Setup water-specific parameters */
1485     inr              = nlist->iinr[0];
1486     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1487     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1488     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1489     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1490
1491     jq0              = _mm_set1_pd(charge[inr+0]);
1492     jq1              = _mm_set1_pd(charge[inr+1]);
1493     jq2              = _mm_set1_pd(charge[inr+2]);
1494     vdwjidx0A        = 2*vdwtype[inr+0];
1495     qq00             = _mm_mul_pd(iq0,jq0);
1496     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1497     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1498     qq01             = _mm_mul_pd(iq0,jq1);
1499     qq02             = _mm_mul_pd(iq0,jq2);
1500     qq10             = _mm_mul_pd(iq1,jq0);
1501     qq11             = _mm_mul_pd(iq1,jq1);
1502     qq12             = _mm_mul_pd(iq1,jq2);
1503     qq20             = _mm_mul_pd(iq2,jq0);
1504     qq21             = _mm_mul_pd(iq2,jq1);
1505     qq22             = _mm_mul_pd(iq2,jq2);
1506
1507     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1508     rcutoff_scalar   = fr->rcoulomb;
1509     rcutoff          = _mm_set1_pd(rcutoff_scalar);
1510     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
1511
1512     sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
1513     rvdw             = _mm_set1_pd(fr->rvdw);
1514
1515     /* Avoid stupid compiler warnings */
1516     jnrA = jnrB = 0;
1517     j_coord_offsetA = 0;
1518     j_coord_offsetB = 0;
1519
1520     outeriter        = 0;
1521     inneriter        = 0;
1522
1523     /* Start outer loop over neighborlists */
1524     for(iidx=0; iidx<nri; iidx++)
1525     {
1526         /* Load shift vector for this list */
1527         i_shift_offset   = DIM*shiftidx[iidx];
1528
1529         /* Load limits for loop over neighbors */
1530         j_index_start    = jindex[iidx];
1531         j_index_end      = jindex[iidx+1];
1532
1533         /* Get outer coordinate index */
1534         inr              = iinr[iidx];
1535         i_coord_offset   = DIM*inr;
1536
1537         /* Load i particle coords and add shift vector */
1538         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1539                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1540
1541         fix0             = _mm_setzero_pd();
1542         fiy0             = _mm_setzero_pd();
1543         fiz0             = _mm_setzero_pd();
1544         fix1             = _mm_setzero_pd();
1545         fiy1             = _mm_setzero_pd();
1546         fiz1             = _mm_setzero_pd();
1547         fix2             = _mm_setzero_pd();
1548         fiy2             = _mm_setzero_pd();
1549         fiz2             = _mm_setzero_pd();
1550
1551         /* Start inner kernel loop */
1552         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1553         {
1554
1555             /* Get j neighbor index, and coordinate index */
1556             jnrA             = jjnr[jidx];
1557             jnrB             = jjnr[jidx+1];
1558             j_coord_offsetA  = DIM*jnrA;
1559             j_coord_offsetB  = DIM*jnrB;
1560
1561             /* load j atom coordinates */
1562             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1563                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1564
1565             /* Calculate displacement vector */
1566             dx00             = _mm_sub_pd(ix0,jx0);
1567             dy00             = _mm_sub_pd(iy0,jy0);
1568             dz00             = _mm_sub_pd(iz0,jz0);
1569             dx01             = _mm_sub_pd(ix0,jx1);
1570             dy01             = _mm_sub_pd(iy0,jy1);
1571             dz01             = _mm_sub_pd(iz0,jz1);
1572             dx02             = _mm_sub_pd(ix0,jx2);
1573             dy02             = _mm_sub_pd(iy0,jy2);
1574             dz02             = _mm_sub_pd(iz0,jz2);
1575             dx10             = _mm_sub_pd(ix1,jx0);
1576             dy10             = _mm_sub_pd(iy1,jy0);
1577             dz10             = _mm_sub_pd(iz1,jz0);
1578             dx11             = _mm_sub_pd(ix1,jx1);
1579             dy11             = _mm_sub_pd(iy1,jy1);
1580             dz11             = _mm_sub_pd(iz1,jz1);
1581             dx12             = _mm_sub_pd(ix1,jx2);
1582             dy12             = _mm_sub_pd(iy1,jy2);
1583             dz12             = _mm_sub_pd(iz1,jz2);
1584             dx20             = _mm_sub_pd(ix2,jx0);
1585             dy20             = _mm_sub_pd(iy2,jy0);
1586             dz20             = _mm_sub_pd(iz2,jz0);
1587             dx21             = _mm_sub_pd(ix2,jx1);
1588             dy21             = _mm_sub_pd(iy2,jy1);
1589             dz21             = _mm_sub_pd(iz2,jz1);
1590             dx22             = _mm_sub_pd(ix2,jx2);
1591             dy22             = _mm_sub_pd(iy2,jy2);
1592             dz22             = _mm_sub_pd(iz2,jz2);
1593
1594             /* Calculate squared distance and things based on it */
1595             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1596             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1597             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1598             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1599             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1600             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1601             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1602             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1603             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1604
1605             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1606             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1607             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1608             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1609             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1610             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1611             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1612             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1613             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1614
1615             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1616             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1617             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1618             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1619             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1620             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1621             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1622             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1623             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1624
1625             fjx0             = _mm_setzero_pd();
1626             fjy0             = _mm_setzero_pd();
1627             fjz0             = _mm_setzero_pd();
1628             fjx1             = _mm_setzero_pd();
1629             fjy1             = _mm_setzero_pd();
1630             fjz1             = _mm_setzero_pd();
1631             fjx2             = _mm_setzero_pd();
1632             fjy2             = _mm_setzero_pd();
1633             fjz2             = _mm_setzero_pd();
1634
1635             /**************************
1636              * CALCULATE INTERACTIONS *
1637              **************************/
1638
1639             if (gmx_mm_any_lt(rsq00,rcutoff2))
1640             {
1641
1642             r00              = _mm_mul_pd(rsq00,rinv00);
1643
1644             /* EWALD ELECTROSTATICS */
1645
1646             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1647             ewrt             = _mm_mul_pd(r00,ewtabscale);
1648             ewitab           = _mm_cvttpd_epi32(ewrt);
1649 #ifdef __XOP__
1650             eweps            = _mm_frcz_pd(ewrt);
1651 #else
1652             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1653 #endif
1654             twoeweps         = _mm_add_pd(eweps,eweps);
1655             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1656                                          &ewtabF,&ewtabFn);
1657             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1658             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1659
1660             /* LENNARD-JONES DISPERSION/REPULSION */
1661
1662             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1663             fvdw             = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1664
1665             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
1666
1667             fscal            = _mm_add_pd(felec,fvdw);
1668
1669             fscal            = _mm_and_pd(fscal,cutoff_mask);
1670
1671             /* Update vectorial force */
1672             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1673             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1674             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1675             
1676             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1677             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1678             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1679
1680             }
1681
1682             /**************************
1683              * CALCULATE INTERACTIONS *
1684              **************************/
1685
1686             if (gmx_mm_any_lt(rsq01,rcutoff2))
1687             {
1688
1689             r01              = _mm_mul_pd(rsq01,rinv01);
1690
1691             /* EWALD ELECTROSTATICS */
1692
1693             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1694             ewrt             = _mm_mul_pd(r01,ewtabscale);
1695             ewitab           = _mm_cvttpd_epi32(ewrt);
1696 #ifdef __XOP__
1697             eweps            = _mm_frcz_pd(ewrt);
1698 #else
1699             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1700 #endif
1701             twoeweps         = _mm_add_pd(eweps,eweps);
1702             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1703                                          &ewtabF,&ewtabFn);
1704             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1705             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1706
1707             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
1708
1709             fscal            = felec;
1710
1711             fscal            = _mm_and_pd(fscal,cutoff_mask);
1712
1713             /* Update vectorial force */
1714             fix0             = _mm_macc_pd(dx01,fscal,fix0);
1715             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
1716             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
1717             
1718             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
1719             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
1720             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
1721
1722             }
1723
1724             /**************************
1725              * CALCULATE INTERACTIONS *
1726              **************************/
1727
1728             if (gmx_mm_any_lt(rsq02,rcutoff2))
1729             {
1730
1731             r02              = _mm_mul_pd(rsq02,rinv02);
1732
1733             /* EWALD ELECTROSTATICS */
1734
1735             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1736             ewrt             = _mm_mul_pd(r02,ewtabscale);
1737             ewitab           = _mm_cvttpd_epi32(ewrt);
1738 #ifdef __XOP__
1739             eweps            = _mm_frcz_pd(ewrt);
1740 #else
1741             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1742 #endif
1743             twoeweps         = _mm_add_pd(eweps,eweps);
1744             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1745                                          &ewtabF,&ewtabFn);
1746             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1747             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1748
1749             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
1750
1751             fscal            = felec;
1752
1753             fscal            = _mm_and_pd(fscal,cutoff_mask);
1754
1755             /* Update vectorial force */
1756             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1757             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1758             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1759             
1760             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1761             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1762             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1763
1764             }
1765
1766             /**************************
1767              * CALCULATE INTERACTIONS *
1768              **************************/
1769
1770             if (gmx_mm_any_lt(rsq10,rcutoff2))
1771             {
1772
1773             r10              = _mm_mul_pd(rsq10,rinv10);
1774
1775             /* EWALD ELECTROSTATICS */
1776
1777             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1778             ewrt             = _mm_mul_pd(r10,ewtabscale);
1779             ewitab           = _mm_cvttpd_epi32(ewrt);
1780 #ifdef __XOP__
1781             eweps            = _mm_frcz_pd(ewrt);
1782 #else
1783             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1784 #endif
1785             twoeweps         = _mm_add_pd(eweps,eweps);
1786             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1787                                          &ewtabF,&ewtabFn);
1788             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1789             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1790
1791             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
1792
1793             fscal            = felec;
1794
1795             fscal            = _mm_and_pd(fscal,cutoff_mask);
1796
1797             /* Update vectorial force */
1798             fix1             = _mm_macc_pd(dx10,fscal,fix1);
1799             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
1800             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
1801             
1802             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
1803             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
1804             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
1805
1806             }
1807
1808             /**************************
1809              * CALCULATE INTERACTIONS *
1810              **************************/
1811
1812             if (gmx_mm_any_lt(rsq11,rcutoff2))
1813             {
1814
1815             r11              = _mm_mul_pd(rsq11,rinv11);
1816
1817             /* EWALD ELECTROSTATICS */
1818
1819             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1820             ewrt             = _mm_mul_pd(r11,ewtabscale);
1821             ewitab           = _mm_cvttpd_epi32(ewrt);
1822 #ifdef __XOP__
1823             eweps            = _mm_frcz_pd(ewrt);
1824 #else
1825             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1826 #endif
1827             twoeweps         = _mm_add_pd(eweps,eweps);
1828             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1829                                          &ewtabF,&ewtabFn);
1830             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1831             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1832
1833             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1834
1835             fscal            = felec;
1836
1837             fscal            = _mm_and_pd(fscal,cutoff_mask);
1838
1839             /* Update vectorial force */
1840             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1841             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1842             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1843             
1844             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1845             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1846             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1847
1848             }
1849
1850             /**************************
1851              * CALCULATE INTERACTIONS *
1852              **************************/
1853
1854             if (gmx_mm_any_lt(rsq12,rcutoff2))
1855             {
1856
1857             r12              = _mm_mul_pd(rsq12,rinv12);
1858
1859             /* EWALD ELECTROSTATICS */
1860
1861             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1862             ewrt             = _mm_mul_pd(r12,ewtabscale);
1863             ewitab           = _mm_cvttpd_epi32(ewrt);
1864 #ifdef __XOP__
1865             eweps            = _mm_frcz_pd(ewrt);
1866 #else
1867             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1868 #endif
1869             twoeweps         = _mm_add_pd(eweps,eweps);
1870             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1871                                          &ewtabF,&ewtabFn);
1872             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1873             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1874
1875             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1876
1877             fscal            = felec;
1878
1879             fscal            = _mm_and_pd(fscal,cutoff_mask);
1880
1881             /* Update vectorial force */
1882             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1883             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1884             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1885             
1886             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1887             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1888             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1889
1890             }
1891
1892             /**************************
1893              * CALCULATE INTERACTIONS *
1894              **************************/
1895
1896             if (gmx_mm_any_lt(rsq20,rcutoff2))
1897             {
1898
1899             r20              = _mm_mul_pd(rsq20,rinv20);
1900
1901             /* EWALD ELECTROSTATICS */
1902
1903             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1904             ewrt             = _mm_mul_pd(r20,ewtabscale);
1905             ewitab           = _mm_cvttpd_epi32(ewrt);
1906 #ifdef __XOP__
1907             eweps            = _mm_frcz_pd(ewrt);
1908 #else
1909             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1910 #endif
1911             twoeweps         = _mm_add_pd(eweps,eweps);
1912             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1913                                          &ewtabF,&ewtabFn);
1914             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1915             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1916
1917             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
1918
1919             fscal            = felec;
1920
1921             fscal            = _mm_and_pd(fscal,cutoff_mask);
1922
1923             /* Update vectorial force */
1924             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1925             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1926             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1927             
1928             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1929             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1930             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1931
1932             }
1933
1934             /**************************
1935              * CALCULATE INTERACTIONS *
1936              **************************/
1937
1938             if (gmx_mm_any_lt(rsq21,rcutoff2))
1939             {
1940
1941             r21              = _mm_mul_pd(rsq21,rinv21);
1942
1943             /* EWALD ELECTROSTATICS */
1944
1945             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1946             ewrt             = _mm_mul_pd(r21,ewtabscale);
1947             ewitab           = _mm_cvttpd_epi32(ewrt);
1948 #ifdef __XOP__
1949             eweps            = _mm_frcz_pd(ewrt);
1950 #else
1951             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1952 #endif
1953             twoeweps         = _mm_add_pd(eweps,eweps);
1954             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1955                                          &ewtabF,&ewtabFn);
1956             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1957             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1958
1959             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1960
1961             fscal            = felec;
1962
1963             fscal            = _mm_and_pd(fscal,cutoff_mask);
1964
1965             /* Update vectorial force */
1966             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1967             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1968             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1969             
1970             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1971             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1972             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1973
1974             }
1975
1976             /**************************
1977              * CALCULATE INTERACTIONS *
1978              **************************/
1979
1980             if (gmx_mm_any_lt(rsq22,rcutoff2))
1981             {
1982
1983             r22              = _mm_mul_pd(rsq22,rinv22);
1984
1985             /* EWALD ELECTROSTATICS */
1986
1987             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1988             ewrt             = _mm_mul_pd(r22,ewtabscale);
1989             ewitab           = _mm_cvttpd_epi32(ewrt);
1990 #ifdef __XOP__
1991             eweps            = _mm_frcz_pd(ewrt);
1992 #else
1993             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1994 #endif
1995             twoeweps         = _mm_add_pd(eweps,eweps);
1996             gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1997                                          &ewtabF,&ewtabFn);
1998             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1999             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2000
2001             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2002
2003             fscal            = felec;
2004
2005             fscal            = _mm_and_pd(fscal,cutoff_mask);
2006
2007             /* Update vectorial force */
2008             fix2             = _mm_macc_pd(dx22,fscal,fix2);
2009             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
2010             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
2011             
2012             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
2013             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
2014             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
2015
2016             }
2017
2018             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2019
2020             /* Inner loop uses 385 flops */
2021         }
2022
2023         if(jidx<j_index_end)
2024         {
2025
2026             jnrA             = jjnr[jidx];
2027             j_coord_offsetA  = DIM*jnrA;
2028
2029             /* load j atom coordinates */
2030             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2031                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2032
2033             /* Calculate displacement vector */
2034             dx00             = _mm_sub_pd(ix0,jx0);
2035             dy00             = _mm_sub_pd(iy0,jy0);
2036             dz00             = _mm_sub_pd(iz0,jz0);
2037             dx01             = _mm_sub_pd(ix0,jx1);
2038             dy01             = _mm_sub_pd(iy0,jy1);
2039             dz01             = _mm_sub_pd(iz0,jz1);
2040             dx02             = _mm_sub_pd(ix0,jx2);
2041             dy02             = _mm_sub_pd(iy0,jy2);
2042             dz02             = _mm_sub_pd(iz0,jz2);
2043             dx10             = _mm_sub_pd(ix1,jx0);
2044             dy10             = _mm_sub_pd(iy1,jy0);
2045             dz10             = _mm_sub_pd(iz1,jz0);
2046             dx11             = _mm_sub_pd(ix1,jx1);
2047             dy11             = _mm_sub_pd(iy1,jy1);
2048             dz11             = _mm_sub_pd(iz1,jz1);
2049             dx12             = _mm_sub_pd(ix1,jx2);
2050             dy12             = _mm_sub_pd(iy1,jy2);
2051             dz12             = _mm_sub_pd(iz1,jz2);
2052             dx20             = _mm_sub_pd(ix2,jx0);
2053             dy20             = _mm_sub_pd(iy2,jy0);
2054             dz20             = _mm_sub_pd(iz2,jz0);
2055             dx21             = _mm_sub_pd(ix2,jx1);
2056             dy21             = _mm_sub_pd(iy2,jy1);
2057             dz21             = _mm_sub_pd(iz2,jz1);
2058             dx22             = _mm_sub_pd(ix2,jx2);
2059             dy22             = _mm_sub_pd(iy2,jy2);
2060             dz22             = _mm_sub_pd(iz2,jz2);
2061
2062             /* Calculate squared distance and things based on it */
2063             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2064             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
2065             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
2066             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
2067             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2068             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2069             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
2070             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2071             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2072
2073             rinv00           = gmx_mm_invsqrt_pd(rsq00);
2074             rinv01           = gmx_mm_invsqrt_pd(rsq01);
2075             rinv02           = gmx_mm_invsqrt_pd(rsq02);
2076             rinv10           = gmx_mm_invsqrt_pd(rsq10);
2077             rinv11           = gmx_mm_invsqrt_pd(rsq11);
2078             rinv12           = gmx_mm_invsqrt_pd(rsq12);
2079             rinv20           = gmx_mm_invsqrt_pd(rsq20);
2080             rinv21           = gmx_mm_invsqrt_pd(rsq21);
2081             rinv22           = gmx_mm_invsqrt_pd(rsq22);
2082
2083             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
2084             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
2085             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
2086             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
2087             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
2088             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
2089             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
2090             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
2091             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
2092
2093             fjx0             = _mm_setzero_pd();
2094             fjy0             = _mm_setzero_pd();
2095             fjz0             = _mm_setzero_pd();
2096             fjx1             = _mm_setzero_pd();
2097             fjy1             = _mm_setzero_pd();
2098             fjz1             = _mm_setzero_pd();
2099             fjx2             = _mm_setzero_pd();
2100             fjy2             = _mm_setzero_pd();
2101             fjz2             = _mm_setzero_pd();
2102
2103             /**************************
2104              * CALCULATE INTERACTIONS *
2105              **************************/
2106
2107             if (gmx_mm_any_lt(rsq00,rcutoff2))
2108             {
2109
2110             r00              = _mm_mul_pd(rsq00,rinv00);
2111
2112             /* EWALD ELECTROSTATICS */
2113
2114             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2115             ewrt             = _mm_mul_pd(r00,ewtabscale);
2116             ewitab           = _mm_cvttpd_epi32(ewrt);
2117 #ifdef __XOP__
2118             eweps            = _mm_frcz_pd(ewrt);
2119 #else
2120             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2121 #endif
2122             twoeweps         = _mm_add_pd(eweps,eweps);
2123             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2124             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2125             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
2126
2127             /* LENNARD-JONES DISPERSION/REPULSION */
2128
2129             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2130             fvdw             = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
2131
2132             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
2133
2134             fscal            = _mm_add_pd(felec,fvdw);
2135
2136             fscal            = _mm_and_pd(fscal,cutoff_mask);
2137
2138             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2139
2140             /* Update vectorial force */
2141             fix0             = _mm_macc_pd(dx00,fscal,fix0);
2142             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
2143             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
2144             
2145             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
2146             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
2147             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
2148
2149             }
2150
2151             /**************************
2152              * CALCULATE INTERACTIONS *
2153              **************************/
2154
2155             if (gmx_mm_any_lt(rsq01,rcutoff2))
2156             {
2157
2158             r01              = _mm_mul_pd(rsq01,rinv01);
2159
2160             /* EWALD ELECTROSTATICS */
2161
2162             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2163             ewrt             = _mm_mul_pd(r01,ewtabscale);
2164             ewitab           = _mm_cvttpd_epi32(ewrt);
2165 #ifdef __XOP__
2166             eweps            = _mm_frcz_pd(ewrt);
2167 #else
2168             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2169 #endif
2170             twoeweps         = _mm_add_pd(eweps,eweps);
2171             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2172             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2173             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2174
2175             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
2176
2177             fscal            = felec;
2178
2179             fscal            = _mm_and_pd(fscal,cutoff_mask);
2180
2181             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2182
2183             /* Update vectorial force */
2184             fix0             = _mm_macc_pd(dx01,fscal,fix0);
2185             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
2186             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
2187             
2188             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
2189             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
2190             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
2191
2192             }
2193
2194             /**************************
2195              * CALCULATE INTERACTIONS *
2196              **************************/
2197
2198             if (gmx_mm_any_lt(rsq02,rcutoff2))
2199             {
2200
2201             r02              = _mm_mul_pd(rsq02,rinv02);
2202
2203             /* EWALD ELECTROSTATICS */
2204
2205             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2206             ewrt             = _mm_mul_pd(r02,ewtabscale);
2207             ewitab           = _mm_cvttpd_epi32(ewrt);
2208 #ifdef __XOP__
2209             eweps            = _mm_frcz_pd(ewrt);
2210 #else
2211             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2212 #endif
2213             twoeweps         = _mm_add_pd(eweps,eweps);
2214             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2215             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2216             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2217
2218             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
2219
2220             fscal            = felec;
2221
2222             fscal            = _mm_and_pd(fscal,cutoff_mask);
2223
2224             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2225
2226             /* Update vectorial force */
2227             fix0             = _mm_macc_pd(dx02,fscal,fix0);
2228             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
2229             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
2230             
2231             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
2232             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
2233             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
2234
2235             }
2236
2237             /**************************
2238              * CALCULATE INTERACTIONS *
2239              **************************/
2240
2241             if (gmx_mm_any_lt(rsq10,rcutoff2))
2242             {
2243
2244             r10              = _mm_mul_pd(rsq10,rinv10);
2245
2246             /* EWALD ELECTROSTATICS */
2247
2248             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2249             ewrt             = _mm_mul_pd(r10,ewtabscale);
2250             ewitab           = _mm_cvttpd_epi32(ewrt);
2251 #ifdef __XOP__
2252             eweps            = _mm_frcz_pd(ewrt);
2253 #else
2254             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2255 #endif
2256             twoeweps         = _mm_add_pd(eweps,eweps);
2257             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2258             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2259             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2260
2261             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
2262
2263             fscal            = felec;
2264
2265             fscal            = _mm_and_pd(fscal,cutoff_mask);
2266
2267             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2268
2269             /* Update vectorial force */
2270             fix1             = _mm_macc_pd(dx10,fscal,fix1);
2271             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
2272             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
2273             
2274             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
2275             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
2276             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
2277
2278             }
2279
2280             /**************************
2281              * CALCULATE INTERACTIONS *
2282              **************************/
2283
2284             if (gmx_mm_any_lt(rsq11,rcutoff2))
2285             {
2286
2287             r11              = _mm_mul_pd(rsq11,rinv11);
2288
2289             /* EWALD ELECTROSTATICS */
2290
2291             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2292             ewrt             = _mm_mul_pd(r11,ewtabscale);
2293             ewitab           = _mm_cvttpd_epi32(ewrt);
2294 #ifdef __XOP__
2295             eweps            = _mm_frcz_pd(ewrt);
2296 #else
2297             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2298 #endif
2299             twoeweps         = _mm_add_pd(eweps,eweps);
2300             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2301             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2302             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2303
2304             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2305
2306             fscal            = felec;
2307
2308             fscal            = _mm_and_pd(fscal,cutoff_mask);
2309
2310             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2311
2312             /* Update vectorial force */
2313             fix1             = _mm_macc_pd(dx11,fscal,fix1);
2314             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
2315             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
2316             
2317             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
2318             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
2319             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
2320
2321             }
2322
2323             /**************************
2324              * CALCULATE INTERACTIONS *
2325              **************************/
2326
2327             if (gmx_mm_any_lt(rsq12,rcutoff2))
2328             {
2329
2330             r12              = _mm_mul_pd(rsq12,rinv12);
2331
2332             /* EWALD ELECTROSTATICS */
2333
2334             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2335             ewrt             = _mm_mul_pd(r12,ewtabscale);
2336             ewitab           = _mm_cvttpd_epi32(ewrt);
2337 #ifdef __XOP__
2338             eweps            = _mm_frcz_pd(ewrt);
2339 #else
2340             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2341 #endif
2342             twoeweps         = _mm_add_pd(eweps,eweps);
2343             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2344             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2345             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2346
2347             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2348
2349             fscal            = felec;
2350
2351             fscal            = _mm_and_pd(fscal,cutoff_mask);
2352
2353             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2354
2355             /* Update vectorial force */
2356             fix1             = _mm_macc_pd(dx12,fscal,fix1);
2357             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
2358             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
2359             
2360             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
2361             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
2362             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
2363
2364             }
2365
2366             /**************************
2367              * CALCULATE INTERACTIONS *
2368              **************************/
2369
2370             if (gmx_mm_any_lt(rsq20,rcutoff2))
2371             {
2372
2373             r20              = _mm_mul_pd(rsq20,rinv20);
2374
2375             /* EWALD ELECTROSTATICS */
2376
2377             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2378             ewrt             = _mm_mul_pd(r20,ewtabscale);
2379             ewitab           = _mm_cvttpd_epi32(ewrt);
2380 #ifdef __XOP__
2381             eweps            = _mm_frcz_pd(ewrt);
2382 #else
2383             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2384 #endif
2385             twoeweps         = _mm_add_pd(eweps,eweps);
2386             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2387             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2388             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2389
2390             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
2391
2392             fscal            = felec;
2393
2394             fscal            = _mm_and_pd(fscal,cutoff_mask);
2395
2396             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2397
2398             /* Update vectorial force */
2399             fix2             = _mm_macc_pd(dx20,fscal,fix2);
2400             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
2401             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
2402             
2403             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
2404             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
2405             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
2406
2407             }
2408
2409             /**************************
2410              * CALCULATE INTERACTIONS *
2411              **************************/
2412
2413             if (gmx_mm_any_lt(rsq21,rcutoff2))
2414             {
2415
2416             r21              = _mm_mul_pd(rsq21,rinv21);
2417
2418             /* EWALD ELECTROSTATICS */
2419
2420             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2421             ewrt             = _mm_mul_pd(r21,ewtabscale);
2422             ewitab           = _mm_cvttpd_epi32(ewrt);
2423 #ifdef __XOP__
2424             eweps            = _mm_frcz_pd(ewrt);
2425 #else
2426             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2427 #endif
2428             twoeweps         = _mm_add_pd(eweps,eweps);
2429             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2430             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2431             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2432
2433             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2434
2435             fscal            = felec;
2436
2437             fscal            = _mm_and_pd(fscal,cutoff_mask);
2438
2439             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2440
2441             /* Update vectorial force */
2442             fix2             = _mm_macc_pd(dx21,fscal,fix2);
2443             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
2444             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
2445             
2446             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
2447             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
2448             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
2449
2450             }
2451
2452             /**************************
2453              * CALCULATE INTERACTIONS *
2454              **************************/
2455
2456             if (gmx_mm_any_lt(rsq22,rcutoff2))
2457             {
2458
2459             r22              = _mm_mul_pd(rsq22,rinv22);
2460
2461             /* EWALD ELECTROSTATICS */
2462
2463             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2464             ewrt             = _mm_mul_pd(r22,ewtabscale);
2465             ewitab           = _mm_cvttpd_epi32(ewrt);
2466 #ifdef __XOP__
2467             eweps            = _mm_frcz_pd(ewrt);
2468 #else
2469             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2470 #endif
2471             twoeweps         = _mm_add_pd(eweps,eweps);
2472             gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2473             felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2474             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2475
2476             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2477
2478             fscal            = felec;
2479
2480             fscal            = _mm_and_pd(fscal,cutoff_mask);
2481
2482             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2483
2484             /* Update vectorial force */
2485             fix2             = _mm_macc_pd(dx22,fscal,fix2);
2486             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
2487             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
2488             
2489             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
2490             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
2491             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
2492
2493             }
2494
2495             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2496
2497             /* Inner loop uses 385 flops */
2498         }
2499
2500         /* End of innermost loop */
2501
2502         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2503                                               f+i_coord_offset,fshift+i_shift_offset);
2504
2505         /* Increment number of inner iterations */
2506         inneriter                  += j_index_end - j_index_start;
2507
2508         /* Outer loop uses 18 flops */
2509     }
2510
2511     /* Increment number of outer iterations */
2512     outeriter        += nri;
2513
2514     /* Update outer/inner flops */
2515
2516     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*385);
2517 }