d3af9afe4bc7f11eb4add0cc4a55cf421b033513
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_avx_128_fma_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
48 #include "kernelutil_x86_avx_128_fma_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_double
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB;
75     int              j_coord_offsetA,j_coord_offsetB;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80     int              vdwioffset0;
81     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82     int              vdwioffset1;
83     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84     int              vdwioffset2;
85     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86     int              vdwioffset3;
87     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88     int              vdwjidx0A,vdwjidx0B;
89     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B;
91     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B;
93     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     int              vdwjidx3A,vdwjidx3B;
95     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
96     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     int              nvdwtype;
109     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110     int              *vdwtype;
111     real             *vdwparam;
112     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
113     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
114     __m128i          ewitab;
115     __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
116     real             *ewtab;
117     __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
118     real             rswitch_scalar,d_scalar;
119     __m128d          dummy_mask,cutoff_mask;
120     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
121     __m128d          one     = _mm_set1_pd(1.0);
122     __m128d          two     = _mm_set1_pd(2.0);
123     x                = xx[0];
124     f                = ff[0];
125
126     nri              = nlist->nri;
127     iinr             = nlist->iinr;
128     jindex           = nlist->jindex;
129     jjnr             = nlist->jjnr;
130     shiftidx         = nlist->shift;
131     gid              = nlist->gid;
132     shiftvec         = fr->shift_vec[0];
133     fshift           = fr->fshift[0];
134     facel            = _mm_set1_pd(fr->epsfac);
135     charge           = mdatoms->chargeA;
136     nvdwtype         = fr->ntype;
137     vdwparam         = fr->nbfp;
138     vdwtype          = mdatoms->typeA;
139
140     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
141     ewtab            = fr->ic->tabq_coul_FDV0;
142     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
143     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
144
145     /* Setup water-specific parameters */
146     inr              = nlist->iinr[0];
147     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
148     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
149     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
150     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
151
152     jq1              = _mm_set1_pd(charge[inr+1]);
153     jq2              = _mm_set1_pd(charge[inr+2]);
154     jq3              = _mm_set1_pd(charge[inr+3]);
155     vdwjidx0A        = 2*vdwtype[inr+0];
156     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
157     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
158     qq11             = _mm_mul_pd(iq1,jq1);
159     qq12             = _mm_mul_pd(iq1,jq2);
160     qq13             = _mm_mul_pd(iq1,jq3);
161     qq21             = _mm_mul_pd(iq2,jq1);
162     qq22             = _mm_mul_pd(iq2,jq2);
163     qq23             = _mm_mul_pd(iq2,jq3);
164     qq31             = _mm_mul_pd(iq3,jq1);
165     qq32             = _mm_mul_pd(iq3,jq2);
166     qq33             = _mm_mul_pd(iq3,jq3);
167
168     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
169     rcutoff_scalar   = fr->rcoulomb;
170     rcutoff          = _mm_set1_pd(rcutoff_scalar);
171     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
172
173     rswitch_scalar   = fr->rcoulomb_switch;
174     rswitch          = _mm_set1_pd(rswitch_scalar);
175     /* Setup switch parameters */
176     d_scalar         = rcutoff_scalar-rswitch_scalar;
177     d                = _mm_set1_pd(d_scalar);
178     swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
179     swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
180     swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
181     swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
182     swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
183     swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
184
185     /* Avoid stupid compiler warnings */
186     jnrA = jnrB = 0;
187     j_coord_offsetA = 0;
188     j_coord_offsetB = 0;
189
190     outeriter        = 0;
191     inneriter        = 0;
192
193     /* Start outer loop over neighborlists */
194     for(iidx=0; iidx<nri; iidx++)
195     {
196         /* Load shift vector for this list */
197         i_shift_offset   = DIM*shiftidx[iidx];
198
199         /* Load limits for loop over neighbors */
200         j_index_start    = jindex[iidx];
201         j_index_end      = jindex[iidx+1];
202
203         /* Get outer coordinate index */
204         inr              = iinr[iidx];
205         i_coord_offset   = DIM*inr;
206
207         /* Load i particle coords and add shift vector */
208         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
209                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
210
211         fix0             = _mm_setzero_pd();
212         fiy0             = _mm_setzero_pd();
213         fiz0             = _mm_setzero_pd();
214         fix1             = _mm_setzero_pd();
215         fiy1             = _mm_setzero_pd();
216         fiz1             = _mm_setzero_pd();
217         fix2             = _mm_setzero_pd();
218         fiy2             = _mm_setzero_pd();
219         fiz2             = _mm_setzero_pd();
220         fix3             = _mm_setzero_pd();
221         fiy3             = _mm_setzero_pd();
222         fiz3             = _mm_setzero_pd();
223
224         /* Reset potential sums */
225         velecsum         = _mm_setzero_pd();
226         vvdwsum          = _mm_setzero_pd();
227
228         /* Start inner kernel loop */
229         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
230         {
231
232             /* Get j neighbor index, and coordinate index */
233             jnrA             = jjnr[jidx];
234             jnrB             = jjnr[jidx+1];
235             j_coord_offsetA  = DIM*jnrA;
236             j_coord_offsetB  = DIM*jnrB;
237
238             /* load j atom coordinates */
239             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
240                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
241                                               &jy2,&jz2,&jx3,&jy3,&jz3);
242
243             /* Calculate displacement vector */
244             dx00             = _mm_sub_pd(ix0,jx0);
245             dy00             = _mm_sub_pd(iy0,jy0);
246             dz00             = _mm_sub_pd(iz0,jz0);
247             dx11             = _mm_sub_pd(ix1,jx1);
248             dy11             = _mm_sub_pd(iy1,jy1);
249             dz11             = _mm_sub_pd(iz1,jz1);
250             dx12             = _mm_sub_pd(ix1,jx2);
251             dy12             = _mm_sub_pd(iy1,jy2);
252             dz12             = _mm_sub_pd(iz1,jz2);
253             dx13             = _mm_sub_pd(ix1,jx3);
254             dy13             = _mm_sub_pd(iy1,jy3);
255             dz13             = _mm_sub_pd(iz1,jz3);
256             dx21             = _mm_sub_pd(ix2,jx1);
257             dy21             = _mm_sub_pd(iy2,jy1);
258             dz21             = _mm_sub_pd(iz2,jz1);
259             dx22             = _mm_sub_pd(ix2,jx2);
260             dy22             = _mm_sub_pd(iy2,jy2);
261             dz22             = _mm_sub_pd(iz2,jz2);
262             dx23             = _mm_sub_pd(ix2,jx3);
263             dy23             = _mm_sub_pd(iy2,jy3);
264             dz23             = _mm_sub_pd(iz2,jz3);
265             dx31             = _mm_sub_pd(ix3,jx1);
266             dy31             = _mm_sub_pd(iy3,jy1);
267             dz31             = _mm_sub_pd(iz3,jz1);
268             dx32             = _mm_sub_pd(ix3,jx2);
269             dy32             = _mm_sub_pd(iy3,jy2);
270             dz32             = _mm_sub_pd(iz3,jz2);
271             dx33             = _mm_sub_pd(ix3,jx3);
272             dy33             = _mm_sub_pd(iy3,jy3);
273             dz33             = _mm_sub_pd(iz3,jz3);
274
275             /* Calculate squared distance and things based on it */
276             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
277             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
278             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
279             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
280             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
281             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
282             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
283             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
284             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
285             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
286
287             rinv00           = gmx_mm_invsqrt_pd(rsq00);
288             rinv11           = gmx_mm_invsqrt_pd(rsq11);
289             rinv12           = gmx_mm_invsqrt_pd(rsq12);
290             rinv13           = gmx_mm_invsqrt_pd(rsq13);
291             rinv21           = gmx_mm_invsqrt_pd(rsq21);
292             rinv22           = gmx_mm_invsqrt_pd(rsq22);
293             rinv23           = gmx_mm_invsqrt_pd(rsq23);
294             rinv31           = gmx_mm_invsqrt_pd(rsq31);
295             rinv32           = gmx_mm_invsqrt_pd(rsq32);
296             rinv33           = gmx_mm_invsqrt_pd(rsq33);
297
298             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
299             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
300             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
301             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
302             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
303             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
304             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
305             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
306             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
307             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
308
309             fjx0             = _mm_setzero_pd();
310             fjy0             = _mm_setzero_pd();
311             fjz0             = _mm_setzero_pd();
312             fjx1             = _mm_setzero_pd();
313             fjy1             = _mm_setzero_pd();
314             fjz1             = _mm_setzero_pd();
315             fjx2             = _mm_setzero_pd();
316             fjy2             = _mm_setzero_pd();
317             fjz2             = _mm_setzero_pd();
318             fjx3             = _mm_setzero_pd();
319             fjy3             = _mm_setzero_pd();
320             fjz3             = _mm_setzero_pd();
321
322             /**************************
323              * CALCULATE INTERACTIONS *
324              **************************/
325
326             if (gmx_mm_any_lt(rsq00,rcutoff2))
327             {
328
329             r00              = _mm_mul_pd(rsq00,rinv00);
330
331             /* LENNARD-JONES DISPERSION/REPULSION */
332
333             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
334             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
335             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
336             vvdw             = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
337             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
338
339             d                = _mm_sub_pd(r00,rswitch);
340             d                = _mm_max_pd(d,_mm_setzero_pd());
341             d2               = _mm_mul_pd(d,d);
342             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
343
344             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
345
346             /* Evaluate switch function */
347             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
348             fvdw             = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
349             vvdw             = _mm_mul_pd(vvdw,sw);
350             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
351
352             /* Update potential sum for this i atom from the interaction with this j atom. */
353             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
354             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
355
356             fscal            = fvdw;
357
358             fscal            = _mm_and_pd(fscal,cutoff_mask);
359
360             /* Update vectorial force */
361             fix0             = _mm_macc_pd(dx00,fscal,fix0);
362             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
363             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
364             
365             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
366             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
367             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
368
369             }
370
371             /**************************
372              * CALCULATE INTERACTIONS *
373              **************************/
374
375             if (gmx_mm_any_lt(rsq11,rcutoff2))
376             {
377
378             r11              = _mm_mul_pd(rsq11,rinv11);
379
380             /* EWALD ELECTROSTATICS */
381
382             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
383             ewrt             = _mm_mul_pd(r11,ewtabscale);
384             ewitab           = _mm_cvttpd_epi32(ewrt);
385 #ifdef __XOP__
386             eweps            = _mm_frcz_pd(ewrt);
387 #else
388             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
389 #endif
390             twoeweps         = _mm_add_pd(eweps,eweps);
391             ewitab           = _mm_slli_epi32(ewitab,2);
392             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
393             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
394             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
395             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
396             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
397             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
398             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
399             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
400             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
401             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
402
403             d                = _mm_sub_pd(r11,rswitch);
404             d                = _mm_max_pd(d,_mm_setzero_pd());
405             d2               = _mm_mul_pd(d,d);
406             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
407
408             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
409
410             /* Evaluate switch function */
411             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
412             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
413             velec            = _mm_mul_pd(velec,sw);
414             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
415
416             /* Update potential sum for this i atom from the interaction with this j atom. */
417             velec            = _mm_and_pd(velec,cutoff_mask);
418             velecsum         = _mm_add_pd(velecsum,velec);
419
420             fscal            = felec;
421
422             fscal            = _mm_and_pd(fscal,cutoff_mask);
423
424             /* Update vectorial force */
425             fix1             = _mm_macc_pd(dx11,fscal,fix1);
426             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
427             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
428             
429             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
430             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
431             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
432
433             }
434
435             /**************************
436              * CALCULATE INTERACTIONS *
437              **************************/
438
439             if (gmx_mm_any_lt(rsq12,rcutoff2))
440             {
441
442             r12              = _mm_mul_pd(rsq12,rinv12);
443
444             /* EWALD ELECTROSTATICS */
445
446             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
447             ewrt             = _mm_mul_pd(r12,ewtabscale);
448             ewitab           = _mm_cvttpd_epi32(ewrt);
449 #ifdef __XOP__
450             eweps            = _mm_frcz_pd(ewrt);
451 #else
452             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
453 #endif
454             twoeweps         = _mm_add_pd(eweps,eweps);
455             ewitab           = _mm_slli_epi32(ewitab,2);
456             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
457             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
458             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
459             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
460             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
461             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
462             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
463             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
464             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
465             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
466
467             d                = _mm_sub_pd(r12,rswitch);
468             d                = _mm_max_pd(d,_mm_setzero_pd());
469             d2               = _mm_mul_pd(d,d);
470             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
471
472             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
473
474             /* Evaluate switch function */
475             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
476             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
477             velec            = _mm_mul_pd(velec,sw);
478             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
479
480             /* Update potential sum for this i atom from the interaction with this j atom. */
481             velec            = _mm_and_pd(velec,cutoff_mask);
482             velecsum         = _mm_add_pd(velecsum,velec);
483
484             fscal            = felec;
485
486             fscal            = _mm_and_pd(fscal,cutoff_mask);
487
488             /* Update vectorial force */
489             fix1             = _mm_macc_pd(dx12,fscal,fix1);
490             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
491             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
492             
493             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
494             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
495             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
496
497             }
498
499             /**************************
500              * CALCULATE INTERACTIONS *
501              **************************/
502
503             if (gmx_mm_any_lt(rsq13,rcutoff2))
504             {
505
506             r13              = _mm_mul_pd(rsq13,rinv13);
507
508             /* EWALD ELECTROSTATICS */
509
510             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
511             ewrt             = _mm_mul_pd(r13,ewtabscale);
512             ewitab           = _mm_cvttpd_epi32(ewrt);
513 #ifdef __XOP__
514             eweps            = _mm_frcz_pd(ewrt);
515 #else
516             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
517 #endif
518             twoeweps         = _mm_add_pd(eweps,eweps);
519             ewitab           = _mm_slli_epi32(ewitab,2);
520             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
521             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
522             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
523             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
524             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
525             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
526             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
527             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
528             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
529             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
530
531             d                = _mm_sub_pd(r13,rswitch);
532             d                = _mm_max_pd(d,_mm_setzero_pd());
533             d2               = _mm_mul_pd(d,d);
534             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
535
536             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
537
538             /* Evaluate switch function */
539             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
540             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
541             velec            = _mm_mul_pd(velec,sw);
542             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
543
544             /* Update potential sum for this i atom from the interaction with this j atom. */
545             velec            = _mm_and_pd(velec,cutoff_mask);
546             velecsum         = _mm_add_pd(velecsum,velec);
547
548             fscal            = felec;
549
550             fscal            = _mm_and_pd(fscal,cutoff_mask);
551
552             /* Update vectorial force */
553             fix1             = _mm_macc_pd(dx13,fscal,fix1);
554             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
555             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
556             
557             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
558             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
559             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
560
561             }
562
563             /**************************
564              * CALCULATE INTERACTIONS *
565              **************************/
566
567             if (gmx_mm_any_lt(rsq21,rcutoff2))
568             {
569
570             r21              = _mm_mul_pd(rsq21,rinv21);
571
572             /* EWALD ELECTROSTATICS */
573
574             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
575             ewrt             = _mm_mul_pd(r21,ewtabscale);
576             ewitab           = _mm_cvttpd_epi32(ewrt);
577 #ifdef __XOP__
578             eweps            = _mm_frcz_pd(ewrt);
579 #else
580             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
581 #endif
582             twoeweps         = _mm_add_pd(eweps,eweps);
583             ewitab           = _mm_slli_epi32(ewitab,2);
584             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
585             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
586             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
587             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
588             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
589             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
590             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
591             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
592             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
593             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
594
595             d                = _mm_sub_pd(r21,rswitch);
596             d                = _mm_max_pd(d,_mm_setzero_pd());
597             d2               = _mm_mul_pd(d,d);
598             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
599
600             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
601
602             /* Evaluate switch function */
603             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
604             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
605             velec            = _mm_mul_pd(velec,sw);
606             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
607
608             /* Update potential sum for this i atom from the interaction with this j atom. */
609             velec            = _mm_and_pd(velec,cutoff_mask);
610             velecsum         = _mm_add_pd(velecsum,velec);
611
612             fscal            = felec;
613
614             fscal            = _mm_and_pd(fscal,cutoff_mask);
615
616             /* Update vectorial force */
617             fix2             = _mm_macc_pd(dx21,fscal,fix2);
618             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
619             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
620             
621             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
622             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
623             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
624
625             }
626
627             /**************************
628              * CALCULATE INTERACTIONS *
629              **************************/
630
631             if (gmx_mm_any_lt(rsq22,rcutoff2))
632             {
633
634             r22              = _mm_mul_pd(rsq22,rinv22);
635
636             /* EWALD ELECTROSTATICS */
637
638             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
639             ewrt             = _mm_mul_pd(r22,ewtabscale);
640             ewitab           = _mm_cvttpd_epi32(ewrt);
641 #ifdef __XOP__
642             eweps            = _mm_frcz_pd(ewrt);
643 #else
644             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
645 #endif
646             twoeweps         = _mm_add_pd(eweps,eweps);
647             ewitab           = _mm_slli_epi32(ewitab,2);
648             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
649             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
650             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
651             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
652             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
653             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
654             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
655             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
656             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
657             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
658
659             d                = _mm_sub_pd(r22,rswitch);
660             d                = _mm_max_pd(d,_mm_setzero_pd());
661             d2               = _mm_mul_pd(d,d);
662             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
663
664             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
665
666             /* Evaluate switch function */
667             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
668             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
669             velec            = _mm_mul_pd(velec,sw);
670             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
671
672             /* Update potential sum for this i atom from the interaction with this j atom. */
673             velec            = _mm_and_pd(velec,cutoff_mask);
674             velecsum         = _mm_add_pd(velecsum,velec);
675
676             fscal            = felec;
677
678             fscal            = _mm_and_pd(fscal,cutoff_mask);
679
680             /* Update vectorial force */
681             fix2             = _mm_macc_pd(dx22,fscal,fix2);
682             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
683             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
684             
685             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
686             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
687             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
688
689             }
690
691             /**************************
692              * CALCULATE INTERACTIONS *
693              **************************/
694
695             if (gmx_mm_any_lt(rsq23,rcutoff2))
696             {
697
698             r23              = _mm_mul_pd(rsq23,rinv23);
699
700             /* EWALD ELECTROSTATICS */
701
702             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
703             ewrt             = _mm_mul_pd(r23,ewtabscale);
704             ewitab           = _mm_cvttpd_epi32(ewrt);
705 #ifdef __XOP__
706             eweps            = _mm_frcz_pd(ewrt);
707 #else
708             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
709 #endif
710             twoeweps         = _mm_add_pd(eweps,eweps);
711             ewitab           = _mm_slli_epi32(ewitab,2);
712             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
713             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
714             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
715             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
716             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
717             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
718             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
719             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
720             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
721             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
722
723             d                = _mm_sub_pd(r23,rswitch);
724             d                = _mm_max_pd(d,_mm_setzero_pd());
725             d2               = _mm_mul_pd(d,d);
726             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
727
728             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
729
730             /* Evaluate switch function */
731             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
732             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
733             velec            = _mm_mul_pd(velec,sw);
734             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
735
736             /* Update potential sum for this i atom from the interaction with this j atom. */
737             velec            = _mm_and_pd(velec,cutoff_mask);
738             velecsum         = _mm_add_pd(velecsum,velec);
739
740             fscal            = felec;
741
742             fscal            = _mm_and_pd(fscal,cutoff_mask);
743
744             /* Update vectorial force */
745             fix2             = _mm_macc_pd(dx23,fscal,fix2);
746             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
747             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
748             
749             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
750             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
751             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
752
753             }
754
755             /**************************
756              * CALCULATE INTERACTIONS *
757              **************************/
758
759             if (gmx_mm_any_lt(rsq31,rcutoff2))
760             {
761
762             r31              = _mm_mul_pd(rsq31,rinv31);
763
764             /* EWALD ELECTROSTATICS */
765
766             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
767             ewrt             = _mm_mul_pd(r31,ewtabscale);
768             ewitab           = _mm_cvttpd_epi32(ewrt);
769 #ifdef __XOP__
770             eweps            = _mm_frcz_pd(ewrt);
771 #else
772             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
773 #endif
774             twoeweps         = _mm_add_pd(eweps,eweps);
775             ewitab           = _mm_slli_epi32(ewitab,2);
776             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
777             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
778             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
779             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
780             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
781             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
782             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
783             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
784             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
785             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
786
787             d                = _mm_sub_pd(r31,rswitch);
788             d                = _mm_max_pd(d,_mm_setzero_pd());
789             d2               = _mm_mul_pd(d,d);
790             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
791
792             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
793
794             /* Evaluate switch function */
795             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
796             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
797             velec            = _mm_mul_pd(velec,sw);
798             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
799
800             /* Update potential sum for this i atom from the interaction with this j atom. */
801             velec            = _mm_and_pd(velec,cutoff_mask);
802             velecsum         = _mm_add_pd(velecsum,velec);
803
804             fscal            = felec;
805
806             fscal            = _mm_and_pd(fscal,cutoff_mask);
807
808             /* Update vectorial force */
809             fix3             = _mm_macc_pd(dx31,fscal,fix3);
810             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
811             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
812             
813             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
814             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
815             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
816
817             }
818
819             /**************************
820              * CALCULATE INTERACTIONS *
821              **************************/
822
823             if (gmx_mm_any_lt(rsq32,rcutoff2))
824             {
825
826             r32              = _mm_mul_pd(rsq32,rinv32);
827
828             /* EWALD ELECTROSTATICS */
829
830             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
831             ewrt             = _mm_mul_pd(r32,ewtabscale);
832             ewitab           = _mm_cvttpd_epi32(ewrt);
833 #ifdef __XOP__
834             eweps            = _mm_frcz_pd(ewrt);
835 #else
836             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
837 #endif
838             twoeweps         = _mm_add_pd(eweps,eweps);
839             ewitab           = _mm_slli_epi32(ewitab,2);
840             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
841             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
842             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
843             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
844             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
845             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
846             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
847             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
848             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
849             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
850
851             d                = _mm_sub_pd(r32,rswitch);
852             d                = _mm_max_pd(d,_mm_setzero_pd());
853             d2               = _mm_mul_pd(d,d);
854             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
855
856             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
857
858             /* Evaluate switch function */
859             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
860             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
861             velec            = _mm_mul_pd(velec,sw);
862             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
863
864             /* Update potential sum for this i atom from the interaction with this j atom. */
865             velec            = _mm_and_pd(velec,cutoff_mask);
866             velecsum         = _mm_add_pd(velecsum,velec);
867
868             fscal            = felec;
869
870             fscal            = _mm_and_pd(fscal,cutoff_mask);
871
872             /* Update vectorial force */
873             fix3             = _mm_macc_pd(dx32,fscal,fix3);
874             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
875             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
876             
877             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
878             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
879             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
880
881             }
882
883             /**************************
884              * CALCULATE INTERACTIONS *
885              **************************/
886
887             if (gmx_mm_any_lt(rsq33,rcutoff2))
888             {
889
890             r33              = _mm_mul_pd(rsq33,rinv33);
891
892             /* EWALD ELECTROSTATICS */
893
894             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
895             ewrt             = _mm_mul_pd(r33,ewtabscale);
896             ewitab           = _mm_cvttpd_epi32(ewrt);
897 #ifdef __XOP__
898             eweps            = _mm_frcz_pd(ewrt);
899 #else
900             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
901 #endif
902             twoeweps         = _mm_add_pd(eweps,eweps);
903             ewitab           = _mm_slli_epi32(ewitab,2);
904             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
905             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
906             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
907             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
908             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
909             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
910             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
911             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
912             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
913             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
914
915             d                = _mm_sub_pd(r33,rswitch);
916             d                = _mm_max_pd(d,_mm_setzero_pd());
917             d2               = _mm_mul_pd(d,d);
918             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
919
920             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
921
922             /* Evaluate switch function */
923             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
924             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
925             velec            = _mm_mul_pd(velec,sw);
926             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
927
928             /* Update potential sum for this i atom from the interaction with this j atom. */
929             velec            = _mm_and_pd(velec,cutoff_mask);
930             velecsum         = _mm_add_pd(velecsum,velec);
931
932             fscal            = felec;
933
934             fscal            = _mm_and_pd(fscal,cutoff_mask);
935
936             /* Update vectorial force */
937             fix3             = _mm_macc_pd(dx33,fscal,fix3);
938             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
939             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
940             
941             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
942             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
943             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
944
945             }
946
947             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
948
949             /* Inner loop uses 677 flops */
950         }
951
952         if(jidx<j_index_end)
953         {
954
955             jnrA             = jjnr[jidx];
956             j_coord_offsetA  = DIM*jnrA;
957
958             /* load j atom coordinates */
959             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
960                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
961                                               &jy2,&jz2,&jx3,&jy3,&jz3);
962
963             /* Calculate displacement vector */
964             dx00             = _mm_sub_pd(ix0,jx0);
965             dy00             = _mm_sub_pd(iy0,jy0);
966             dz00             = _mm_sub_pd(iz0,jz0);
967             dx11             = _mm_sub_pd(ix1,jx1);
968             dy11             = _mm_sub_pd(iy1,jy1);
969             dz11             = _mm_sub_pd(iz1,jz1);
970             dx12             = _mm_sub_pd(ix1,jx2);
971             dy12             = _mm_sub_pd(iy1,jy2);
972             dz12             = _mm_sub_pd(iz1,jz2);
973             dx13             = _mm_sub_pd(ix1,jx3);
974             dy13             = _mm_sub_pd(iy1,jy3);
975             dz13             = _mm_sub_pd(iz1,jz3);
976             dx21             = _mm_sub_pd(ix2,jx1);
977             dy21             = _mm_sub_pd(iy2,jy1);
978             dz21             = _mm_sub_pd(iz2,jz1);
979             dx22             = _mm_sub_pd(ix2,jx2);
980             dy22             = _mm_sub_pd(iy2,jy2);
981             dz22             = _mm_sub_pd(iz2,jz2);
982             dx23             = _mm_sub_pd(ix2,jx3);
983             dy23             = _mm_sub_pd(iy2,jy3);
984             dz23             = _mm_sub_pd(iz2,jz3);
985             dx31             = _mm_sub_pd(ix3,jx1);
986             dy31             = _mm_sub_pd(iy3,jy1);
987             dz31             = _mm_sub_pd(iz3,jz1);
988             dx32             = _mm_sub_pd(ix3,jx2);
989             dy32             = _mm_sub_pd(iy3,jy2);
990             dz32             = _mm_sub_pd(iz3,jz2);
991             dx33             = _mm_sub_pd(ix3,jx3);
992             dy33             = _mm_sub_pd(iy3,jy3);
993             dz33             = _mm_sub_pd(iz3,jz3);
994
995             /* Calculate squared distance and things based on it */
996             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
997             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
998             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
999             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1000             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1001             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1002             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1003             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1004             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1005             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1006
1007             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1008             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1009             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1010             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1011             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1012             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1013             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1014             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1015             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1016             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1017
1018             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1019             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1020             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1021             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1022             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1023             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1024             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1025             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1026             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1027             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1028
1029             fjx0             = _mm_setzero_pd();
1030             fjy0             = _mm_setzero_pd();
1031             fjz0             = _mm_setzero_pd();
1032             fjx1             = _mm_setzero_pd();
1033             fjy1             = _mm_setzero_pd();
1034             fjz1             = _mm_setzero_pd();
1035             fjx2             = _mm_setzero_pd();
1036             fjy2             = _mm_setzero_pd();
1037             fjz2             = _mm_setzero_pd();
1038             fjx3             = _mm_setzero_pd();
1039             fjy3             = _mm_setzero_pd();
1040             fjz3             = _mm_setzero_pd();
1041
1042             /**************************
1043              * CALCULATE INTERACTIONS *
1044              **************************/
1045
1046             if (gmx_mm_any_lt(rsq00,rcutoff2))
1047             {
1048
1049             r00              = _mm_mul_pd(rsq00,rinv00);
1050
1051             /* LENNARD-JONES DISPERSION/REPULSION */
1052
1053             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1054             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
1055             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1056             vvdw             = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
1057             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1058
1059             d                = _mm_sub_pd(r00,rswitch);
1060             d                = _mm_max_pd(d,_mm_setzero_pd());
1061             d2               = _mm_mul_pd(d,d);
1062             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1063
1064             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1065
1066             /* Evaluate switch function */
1067             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1068             fvdw             = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1069             vvdw             = _mm_mul_pd(vvdw,sw);
1070             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
1071
1072             /* Update potential sum for this i atom from the interaction with this j atom. */
1073             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
1074             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
1075             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
1076
1077             fscal            = fvdw;
1078
1079             fscal            = _mm_and_pd(fscal,cutoff_mask);
1080
1081             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1082
1083             /* Update vectorial force */
1084             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1085             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1086             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1087             
1088             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1089             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1090             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1091
1092             }
1093
1094             /**************************
1095              * CALCULATE INTERACTIONS *
1096              **************************/
1097
1098             if (gmx_mm_any_lt(rsq11,rcutoff2))
1099             {
1100
1101             r11              = _mm_mul_pd(rsq11,rinv11);
1102
1103             /* EWALD ELECTROSTATICS */
1104
1105             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1106             ewrt             = _mm_mul_pd(r11,ewtabscale);
1107             ewitab           = _mm_cvttpd_epi32(ewrt);
1108 #ifdef __XOP__
1109             eweps            = _mm_frcz_pd(ewrt);
1110 #else
1111             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1112 #endif
1113             twoeweps         = _mm_add_pd(eweps,eweps);
1114             ewitab           = _mm_slli_epi32(ewitab,2);
1115             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1116             ewtabD           = _mm_setzero_pd();
1117             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1118             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1119             ewtabFn          = _mm_setzero_pd();
1120             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1121             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1122             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1123             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1124             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1125
1126             d                = _mm_sub_pd(r11,rswitch);
1127             d                = _mm_max_pd(d,_mm_setzero_pd());
1128             d2               = _mm_mul_pd(d,d);
1129             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1130
1131             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1132
1133             /* Evaluate switch function */
1134             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1135             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1136             velec            = _mm_mul_pd(velec,sw);
1137             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1138
1139             /* Update potential sum for this i atom from the interaction with this j atom. */
1140             velec            = _mm_and_pd(velec,cutoff_mask);
1141             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1142             velecsum         = _mm_add_pd(velecsum,velec);
1143
1144             fscal            = felec;
1145
1146             fscal            = _mm_and_pd(fscal,cutoff_mask);
1147
1148             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1149
1150             /* Update vectorial force */
1151             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1152             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1153             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1154             
1155             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1156             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1157             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1158
1159             }
1160
1161             /**************************
1162              * CALCULATE INTERACTIONS *
1163              **************************/
1164
1165             if (gmx_mm_any_lt(rsq12,rcutoff2))
1166             {
1167
1168             r12              = _mm_mul_pd(rsq12,rinv12);
1169
1170             /* EWALD ELECTROSTATICS */
1171
1172             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1173             ewrt             = _mm_mul_pd(r12,ewtabscale);
1174             ewitab           = _mm_cvttpd_epi32(ewrt);
1175 #ifdef __XOP__
1176             eweps            = _mm_frcz_pd(ewrt);
1177 #else
1178             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1179 #endif
1180             twoeweps         = _mm_add_pd(eweps,eweps);
1181             ewitab           = _mm_slli_epi32(ewitab,2);
1182             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1183             ewtabD           = _mm_setzero_pd();
1184             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1185             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1186             ewtabFn          = _mm_setzero_pd();
1187             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1188             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1189             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1190             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1191             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1192
1193             d                = _mm_sub_pd(r12,rswitch);
1194             d                = _mm_max_pd(d,_mm_setzero_pd());
1195             d2               = _mm_mul_pd(d,d);
1196             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1197
1198             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1199
1200             /* Evaluate switch function */
1201             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1202             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1203             velec            = _mm_mul_pd(velec,sw);
1204             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1205
1206             /* Update potential sum for this i atom from the interaction with this j atom. */
1207             velec            = _mm_and_pd(velec,cutoff_mask);
1208             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1209             velecsum         = _mm_add_pd(velecsum,velec);
1210
1211             fscal            = felec;
1212
1213             fscal            = _mm_and_pd(fscal,cutoff_mask);
1214
1215             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1216
1217             /* Update vectorial force */
1218             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1219             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1220             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1221             
1222             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1223             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1224             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1225
1226             }
1227
1228             /**************************
1229              * CALCULATE INTERACTIONS *
1230              **************************/
1231
1232             if (gmx_mm_any_lt(rsq13,rcutoff2))
1233             {
1234
1235             r13              = _mm_mul_pd(rsq13,rinv13);
1236
1237             /* EWALD ELECTROSTATICS */
1238
1239             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1240             ewrt             = _mm_mul_pd(r13,ewtabscale);
1241             ewitab           = _mm_cvttpd_epi32(ewrt);
1242 #ifdef __XOP__
1243             eweps            = _mm_frcz_pd(ewrt);
1244 #else
1245             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1246 #endif
1247             twoeweps         = _mm_add_pd(eweps,eweps);
1248             ewitab           = _mm_slli_epi32(ewitab,2);
1249             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1250             ewtabD           = _mm_setzero_pd();
1251             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1252             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1253             ewtabFn          = _mm_setzero_pd();
1254             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1255             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1256             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1257             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1258             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1259
1260             d                = _mm_sub_pd(r13,rswitch);
1261             d                = _mm_max_pd(d,_mm_setzero_pd());
1262             d2               = _mm_mul_pd(d,d);
1263             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1264
1265             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1266
1267             /* Evaluate switch function */
1268             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1269             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
1270             velec            = _mm_mul_pd(velec,sw);
1271             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
1272
1273             /* Update potential sum for this i atom from the interaction with this j atom. */
1274             velec            = _mm_and_pd(velec,cutoff_mask);
1275             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1276             velecsum         = _mm_add_pd(velecsum,velec);
1277
1278             fscal            = felec;
1279
1280             fscal            = _mm_and_pd(fscal,cutoff_mask);
1281
1282             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1283
1284             /* Update vectorial force */
1285             fix1             = _mm_macc_pd(dx13,fscal,fix1);
1286             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
1287             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
1288             
1289             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
1290             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
1291             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
1292
1293             }
1294
1295             /**************************
1296              * CALCULATE INTERACTIONS *
1297              **************************/
1298
1299             if (gmx_mm_any_lt(rsq21,rcutoff2))
1300             {
1301
1302             r21              = _mm_mul_pd(rsq21,rinv21);
1303
1304             /* EWALD ELECTROSTATICS */
1305
1306             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1307             ewrt             = _mm_mul_pd(r21,ewtabscale);
1308             ewitab           = _mm_cvttpd_epi32(ewrt);
1309 #ifdef __XOP__
1310             eweps            = _mm_frcz_pd(ewrt);
1311 #else
1312             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1313 #endif
1314             twoeweps         = _mm_add_pd(eweps,eweps);
1315             ewitab           = _mm_slli_epi32(ewitab,2);
1316             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1317             ewtabD           = _mm_setzero_pd();
1318             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1319             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1320             ewtabFn          = _mm_setzero_pd();
1321             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1322             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1323             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1324             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1325             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1326
1327             d                = _mm_sub_pd(r21,rswitch);
1328             d                = _mm_max_pd(d,_mm_setzero_pd());
1329             d2               = _mm_mul_pd(d,d);
1330             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1331
1332             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1333
1334             /* Evaluate switch function */
1335             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1336             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1337             velec            = _mm_mul_pd(velec,sw);
1338             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1339
1340             /* Update potential sum for this i atom from the interaction with this j atom. */
1341             velec            = _mm_and_pd(velec,cutoff_mask);
1342             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1343             velecsum         = _mm_add_pd(velecsum,velec);
1344
1345             fscal            = felec;
1346
1347             fscal            = _mm_and_pd(fscal,cutoff_mask);
1348
1349             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1350
1351             /* Update vectorial force */
1352             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1353             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1354             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1355             
1356             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1357             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1358             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1359
1360             }
1361
1362             /**************************
1363              * CALCULATE INTERACTIONS *
1364              **************************/
1365
1366             if (gmx_mm_any_lt(rsq22,rcutoff2))
1367             {
1368
1369             r22              = _mm_mul_pd(rsq22,rinv22);
1370
1371             /* EWALD ELECTROSTATICS */
1372
1373             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1374             ewrt             = _mm_mul_pd(r22,ewtabscale);
1375             ewitab           = _mm_cvttpd_epi32(ewrt);
1376 #ifdef __XOP__
1377             eweps            = _mm_frcz_pd(ewrt);
1378 #else
1379             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1380 #endif
1381             twoeweps         = _mm_add_pd(eweps,eweps);
1382             ewitab           = _mm_slli_epi32(ewitab,2);
1383             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1384             ewtabD           = _mm_setzero_pd();
1385             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1386             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1387             ewtabFn          = _mm_setzero_pd();
1388             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1389             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1390             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1391             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1392             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1393
1394             d                = _mm_sub_pd(r22,rswitch);
1395             d                = _mm_max_pd(d,_mm_setzero_pd());
1396             d2               = _mm_mul_pd(d,d);
1397             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1398
1399             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1400
1401             /* Evaluate switch function */
1402             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1403             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1404             velec            = _mm_mul_pd(velec,sw);
1405             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1406
1407             /* Update potential sum for this i atom from the interaction with this j atom. */
1408             velec            = _mm_and_pd(velec,cutoff_mask);
1409             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1410             velecsum         = _mm_add_pd(velecsum,velec);
1411
1412             fscal            = felec;
1413
1414             fscal            = _mm_and_pd(fscal,cutoff_mask);
1415
1416             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1417
1418             /* Update vectorial force */
1419             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1420             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1421             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1422             
1423             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1424             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1425             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1426
1427             }
1428
1429             /**************************
1430              * CALCULATE INTERACTIONS *
1431              **************************/
1432
1433             if (gmx_mm_any_lt(rsq23,rcutoff2))
1434             {
1435
1436             r23              = _mm_mul_pd(rsq23,rinv23);
1437
1438             /* EWALD ELECTROSTATICS */
1439
1440             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1441             ewrt             = _mm_mul_pd(r23,ewtabscale);
1442             ewitab           = _mm_cvttpd_epi32(ewrt);
1443 #ifdef __XOP__
1444             eweps            = _mm_frcz_pd(ewrt);
1445 #else
1446             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1447 #endif
1448             twoeweps         = _mm_add_pd(eweps,eweps);
1449             ewitab           = _mm_slli_epi32(ewitab,2);
1450             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1451             ewtabD           = _mm_setzero_pd();
1452             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1453             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1454             ewtabFn          = _mm_setzero_pd();
1455             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1456             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1457             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1458             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1459             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1460
1461             d                = _mm_sub_pd(r23,rswitch);
1462             d                = _mm_max_pd(d,_mm_setzero_pd());
1463             d2               = _mm_mul_pd(d,d);
1464             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1465
1466             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1467
1468             /* Evaluate switch function */
1469             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1470             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
1471             velec            = _mm_mul_pd(velec,sw);
1472             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
1473
1474             /* Update potential sum for this i atom from the interaction with this j atom. */
1475             velec            = _mm_and_pd(velec,cutoff_mask);
1476             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1477             velecsum         = _mm_add_pd(velecsum,velec);
1478
1479             fscal            = felec;
1480
1481             fscal            = _mm_and_pd(fscal,cutoff_mask);
1482
1483             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1484
1485             /* Update vectorial force */
1486             fix2             = _mm_macc_pd(dx23,fscal,fix2);
1487             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
1488             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
1489             
1490             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
1491             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
1492             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
1493
1494             }
1495
1496             /**************************
1497              * CALCULATE INTERACTIONS *
1498              **************************/
1499
1500             if (gmx_mm_any_lt(rsq31,rcutoff2))
1501             {
1502
1503             r31              = _mm_mul_pd(rsq31,rinv31);
1504
1505             /* EWALD ELECTROSTATICS */
1506
1507             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1508             ewrt             = _mm_mul_pd(r31,ewtabscale);
1509             ewitab           = _mm_cvttpd_epi32(ewrt);
1510 #ifdef __XOP__
1511             eweps            = _mm_frcz_pd(ewrt);
1512 #else
1513             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1514 #endif
1515             twoeweps         = _mm_add_pd(eweps,eweps);
1516             ewitab           = _mm_slli_epi32(ewitab,2);
1517             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1518             ewtabD           = _mm_setzero_pd();
1519             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1520             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1521             ewtabFn          = _mm_setzero_pd();
1522             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1523             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1524             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1525             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1526             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1527
1528             d                = _mm_sub_pd(r31,rswitch);
1529             d                = _mm_max_pd(d,_mm_setzero_pd());
1530             d2               = _mm_mul_pd(d,d);
1531             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1532
1533             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1534
1535             /* Evaluate switch function */
1536             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1537             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
1538             velec            = _mm_mul_pd(velec,sw);
1539             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
1540
1541             /* Update potential sum for this i atom from the interaction with this j atom. */
1542             velec            = _mm_and_pd(velec,cutoff_mask);
1543             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1544             velecsum         = _mm_add_pd(velecsum,velec);
1545
1546             fscal            = felec;
1547
1548             fscal            = _mm_and_pd(fscal,cutoff_mask);
1549
1550             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1551
1552             /* Update vectorial force */
1553             fix3             = _mm_macc_pd(dx31,fscal,fix3);
1554             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
1555             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
1556             
1557             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
1558             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
1559             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
1560
1561             }
1562
1563             /**************************
1564              * CALCULATE INTERACTIONS *
1565              **************************/
1566
1567             if (gmx_mm_any_lt(rsq32,rcutoff2))
1568             {
1569
1570             r32              = _mm_mul_pd(rsq32,rinv32);
1571
1572             /* EWALD ELECTROSTATICS */
1573
1574             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1575             ewrt             = _mm_mul_pd(r32,ewtabscale);
1576             ewitab           = _mm_cvttpd_epi32(ewrt);
1577 #ifdef __XOP__
1578             eweps            = _mm_frcz_pd(ewrt);
1579 #else
1580             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1581 #endif
1582             twoeweps         = _mm_add_pd(eweps,eweps);
1583             ewitab           = _mm_slli_epi32(ewitab,2);
1584             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1585             ewtabD           = _mm_setzero_pd();
1586             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1587             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1588             ewtabFn          = _mm_setzero_pd();
1589             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1590             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1591             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1592             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1593             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1594
1595             d                = _mm_sub_pd(r32,rswitch);
1596             d                = _mm_max_pd(d,_mm_setzero_pd());
1597             d2               = _mm_mul_pd(d,d);
1598             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1599
1600             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1601
1602             /* Evaluate switch function */
1603             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1604             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
1605             velec            = _mm_mul_pd(velec,sw);
1606             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
1607
1608             /* Update potential sum for this i atom from the interaction with this j atom. */
1609             velec            = _mm_and_pd(velec,cutoff_mask);
1610             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1611             velecsum         = _mm_add_pd(velecsum,velec);
1612
1613             fscal            = felec;
1614
1615             fscal            = _mm_and_pd(fscal,cutoff_mask);
1616
1617             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1618
1619             /* Update vectorial force */
1620             fix3             = _mm_macc_pd(dx32,fscal,fix3);
1621             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
1622             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
1623             
1624             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
1625             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
1626             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
1627
1628             }
1629
1630             /**************************
1631              * CALCULATE INTERACTIONS *
1632              **************************/
1633
1634             if (gmx_mm_any_lt(rsq33,rcutoff2))
1635             {
1636
1637             r33              = _mm_mul_pd(rsq33,rinv33);
1638
1639             /* EWALD ELECTROSTATICS */
1640
1641             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1642             ewrt             = _mm_mul_pd(r33,ewtabscale);
1643             ewitab           = _mm_cvttpd_epi32(ewrt);
1644 #ifdef __XOP__
1645             eweps            = _mm_frcz_pd(ewrt);
1646 #else
1647             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1648 #endif
1649             twoeweps         = _mm_add_pd(eweps,eweps);
1650             ewitab           = _mm_slli_epi32(ewitab,2);
1651             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1652             ewtabD           = _mm_setzero_pd();
1653             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1654             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1655             ewtabFn          = _mm_setzero_pd();
1656             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1657             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
1658             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1659             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1660             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1661
1662             d                = _mm_sub_pd(r33,rswitch);
1663             d                = _mm_max_pd(d,_mm_setzero_pd());
1664             d2               = _mm_mul_pd(d,d);
1665             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1666
1667             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1668
1669             /* Evaluate switch function */
1670             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1671             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
1672             velec            = _mm_mul_pd(velec,sw);
1673             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
1674
1675             /* Update potential sum for this i atom from the interaction with this j atom. */
1676             velec            = _mm_and_pd(velec,cutoff_mask);
1677             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1678             velecsum         = _mm_add_pd(velecsum,velec);
1679
1680             fscal            = felec;
1681
1682             fscal            = _mm_and_pd(fscal,cutoff_mask);
1683
1684             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1685
1686             /* Update vectorial force */
1687             fix3             = _mm_macc_pd(dx33,fscal,fix3);
1688             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
1689             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
1690             
1691             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
1692             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
1693             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
1694
1695             }
1696
1697             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1698
1699             /* Inner loop uses 677 flops */
1700         }
1701
1702         /* End of innermost loop */
1703
1704         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1705                                               f+i_coord_offset,fshift+i_shift_offset);
1706
1707         ggid                        = gid[iidx];
1708         /* Update potential energies */
1709         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1710         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1711
1712         /* Increment number of inner iterations */
1713         inneriter                  += j_index_end - j_index_start;
1714
1715         /* Outer loop uses 26 flops */
1716     }
1717
1718     /* Increment number of outer iterations */
1719     outeriter        += nri;
1720
1721     /* Update outer/inner flops */
1722
1723     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*677);
1724 }
1725 /*
1726  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_double
1727  * Electrostatics interaction: Ewald
1728  * VdW interaction:            LennardJones
1729  * Geometry:                   Water4-Water4
1730  * Calculate force/pot:        Force
1731  */
1732 void
1733 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_double
1734                     (t_nblist                    * gmx_restrict       nlist,
1735                      rvec                        * gmx_restrict          xx,
1736                      rvec                        * gmx_restrict          ff,
1737                      t_forcerec                  * gmx_restrict          fr,
1738                      t_mdatoms                   * gmx_restrict     mdatoms,
1739                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1740                      t_nrnb                      * gmx_restrict        nrnb)
1741 {
1742     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1743      * just 0 for non-waters.
1744      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1745      * jnr indices corresponding to data put in the four positions in the SIMD register.
1746      */
1747     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1748     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1749     int              jnrA,jnrB;
1750     int              j_coord_offsetA,j_coord_offsetB;
1751     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1752     real             rcutoff_scalar;
1753     real             *shiftvec,*fshift,*x,*f;
1754     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1755     int              vdwioffset0;
1756     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1757     int              vdwioffset1;
1758     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1759     int              vdwioffset2;
1760     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1761     int              vdwioffset3;
1762     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1763     int              vdwjidx0A,vdwjidx0B;
1764     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1765     int              vdwjidx1A,vdwjidx1B;
1766     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1767     int              vdwjidx2A,vdwjidx2B;
1768     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1769     int              vdwjidx3A,vdwjidx3B;
1770     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1771     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1772     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1773     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1774     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1775     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1776     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1777     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1778     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1779     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1780     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1781     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1782     real             *charge;
1783     int              nvdwtype;
1784     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1785     int              *vdwtype;
1786     real             *vdwparam;
1787     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1788     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1789     __m128i          ewitab;
1790     __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1791     real             *ewtab;
1792     __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1793     real             rswitch_scalar,d_scalar;
1794     __m128d          dummy_mask,cutoff_mask;
1795     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1796     __m128d          one     = _mm_set1_pd(1.0);
1797     __m128d          two     = _mm_set1_pd(2.0);
1798     x                = xx[0];
1799     f                = ff[0];
1800
1801     nri              = nlist->nri;
1802     iinr             = nlist->iinr;
1803     jindex           = nlist->jindex;
1804     jjnr             = nlist->jjnr;
1805     shiftidx         = nlist->shift;
1806     gid              = nlist->gid;
1807     shiftvec         = fr->shift_vec[0];
1808     fshift           = fr->fshift[0];
1809     facel            = _mm_set1_pd(fr->epsfac);
1810     charge           = mdatoms->chargeA;
1811     nvdwtype         = fr->ntype;
1812     vdwparam         = fr->nbfp;
1813     vdwtype          = mdatoms->typeA;
1814
1815     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
1816     ewtab            = fr->ic->tabq_coul_FDV0;
1817     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
1818     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1819
1820     /* Setup water-specific parameters */
1821     inr              = nlist->iinr[0];
1822     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1823     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1824     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1825     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1826
1827     jq1              = _mm_set1_pd(charge[inr+1]);
1828     jq2              = _mm_set1_pd(charge[inr+2]);
1829     jq3              = _mm_set1_pd(charge[inr+3]);
1830     vdwjidx0A        = 2*vdwtype[inr+0];
1831     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1832     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1833     qq11             = _mm_mul_pd(iq1,jq1);
1834     qq12             = _mm_mul_pd(iq1,jq2);
1835     qq13             = _mm_mul_pd(iq1,jq3);
1836     qq21             = _mm_mul_pd(iq2,jq1);
1837     qq22             = _mm_mul_pd(iq2,jq2);
1838     qq23             = _mm_mul_pd(iq2,jq3);
1839     qq31             = _mm_mul_pd(iq3,jq1);
1840     qq32             = _mm_mul_pd(iq3,jq2);
1841     qq33             = _mm_mul_pd(iq3,jq3);
1842
1843     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1844     rcutoff_scalar   = fr->rcoulomb;
1845     rcutoff          = _mm_set1_pd(rcutoff_scalar);
1846     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
1847
1848     rswitch_scalar   = fr->rcoulomb_switch;
1849     rswitch          = _mm_set1_pd(rswitch_scalar);
1850     /* Setup switch parameters */
1851     d_scalar         = rcutoff_scalar-rswitch_scalar;
1852     d                = _mm_set1_pd(d_scalar);
1853     swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1854     swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1855     swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1856     swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1857     swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1858     swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1859
1860     /* Avoid stupid compiler warnings */
1861     jnrA = jnrB = 0;
1862     j_coord_offsetA = 0;
1863     j_coord_offsetB = 0;
1864
1865     outeriter        = 0;
1866     inneriter        = 0;
1867
1868     /* Start outer loop over neighborlists */
1869     for(iidx=0; iidx<nri; iidx++)
1870     {
1871         /* Load shift vector for this list */
1872         i_shift_offset   = DIM*shiftidx[iidx];
1873
1874         /* Load limits for loop over neighbors */
1875         j_index_start    = jindex[iidx];
1876         j_index_end      = jindex[iidx+1];
1877
1878         /* Get outer coordinate index */
1879         inr              = iinr[iidx];
1880         i_coord_offset   = DIM*inr;
1881
1882         /* Load i particle coords and add shift vector */
1883         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1884                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1885
1886         fix0             = _mm_setzero_pd();
1887         fiy0             = _mm_setzero_pd();
1888         fiz0             = _mm_setzero_pd();
1889         fix1             = _mm_setzero_pd();
1890         fiy1             = _mm_setzero_pd();
1891         fiz1             = _mm_setzero_pd();
1892         fix2             = _mm_setzero_pd();
1893         fiy2             = _mm_setzero_pd();
1894         fiz2             = _mm_setzero_pd();
1895         fix3             = _mm_setzero_pd();
1896         fiy3             = _mm_setzero_pd();
1897         fiz3             = _mm_setzero_pd();
1898
1899         /* Start inner kernel loop */
1900         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1901         {
1902
1903             /* Get j neighbor index, and coordinate index */
1904             jnrA             = jjnr[jidx];
1905             jnrB             = jjnr[jidx+1];
1906             j_coord_offsetA  = DIM*jnrA;
1907             j_coord_offsetB  = DIM*jnrB;
1908
1909             /* load j atom coordinates */
1910             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1911                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1912                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1913
1914             /* Calculate displacement vector */
1915             dx00             = _mm_sub_pd(ix0,jx0);
1916             dy00             = _mm_sub_pd(iy0,jy0);
1917             dz00             = _mm_sub_pd(iz0,jz0);
1918             dx11             = _mm_sub_pd(ix1,jx1);
1919             dy11             = _mm_sub_pd(iy1,jy1);
1920             dz11             = _mm_sub_pd(iz1,jz1);
1921             dx12             = _mm_sub_pd(ix1,jx2);
1922             dy12             = _mm_sub_pd(iy1,jy2);
1923             dz12             = _mm_sub_pd(iz1,jz2);
1924             dx13             = _mm_sub_pd(ix1,jx3);
1925             dy13             = _mm_sub_pd(iy1,jy3);
1926             dz13             = _mm_sub_pd(iz1,jz3);
1927             dx21             = _mm_sub_pd(ix2,jx1);
1928             dy21             = _mm_sub_pd(iy2,jy1);
1929             dz21             = _mm_sub_pd(iz2,jz1);
1930             dx22             = _mm_sub_pd(ix2,jx2);
1931             dy22             = _mm_sub_pd(iy2,jy2);
1932             dz22             = _mm_sub_pd(iz2,jz2);
1933             dx23             = _mm_sub_pd(ix2,jx3);
1934             dy23             = _mm_sub_pd(iy2,jy3);
1935             dz23             = _mm_sub_pd(iz2,jz3);
1936             dx31             = _mm_sub_pd(ix3,jx1);
1937             dy31             = _mm_sub_pd(iy3,jy1);
1938             dz31             = _mm_sub_pd(iz3,jz1);
1939             dx32             = _mm_sub_pd(ix3,jx2);
1940             dy32             = _mm_sub_pd(iy3,jy2);
1941             dz32             = _mm_sub_pd(iz3,jz2);
1942             dx33             = _mm_sub_pd(ix3,jx3);
1943             dy33             = _mm_sub_pd(iy3,jy3);
1944             dz33             = _mm_sub_pd(iz3,jz3);
1945
1946             /* Calculate squared distance and things based on it */
1947             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1948             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1949             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1950             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1951             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1952             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1953             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1954             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1955             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1956             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1957
1958             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1959             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1960             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1961             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1962             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1963             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1964             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1965             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1966             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1967             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1968
1969             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1970             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1971             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1972             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1973             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1974             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1975             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1976             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1977             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1978             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1979
1980             fjx0             = _mm_setzero_pd();
1981             fjy0             = _mm_setzero_pd();
1982             fjz0             = _mm_setzero_pd();
1983             fjx1             = _mm_setzero_pd();
1984             fjy1             = _mm_setzero_pd();
1985             fjz1             = _mm_setzero_pd();
1986             fjx2             = _mm_setzero_pd();
1987             fjy2             = _mm_setzero_pd();
1988             fjz2             = _mm_setzero_pd();
1989             fjx3             = _mm_setzero_pd();
1990             fjy3             = _mm_setzero_pd();
1991             fjz3             = _mm_setzero_pd();
1992
1993             /**************************
1994              * CALCULATE INTERACTIONS *
1995              **************************/
1996
1997             if (gmx_mm_any_lt(rsq00,rcutoff2))
1998             {
1999
2000             r00              = _mm_mul_pd(rsq00,rinv00);
2001
2002             /* LENNARD-JONES DISPERSION/REPULSION */
2003
2004             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2005             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
2006             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2007             vvdw             = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
2008             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2009
2010             d                = _mm_sub_pd(r00,rswitch);
2011             d                = _mm_max_pd(d,_mm_setzero_pd());
2012             d2               = _mm_mul_pd(d,d);
2013             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2014
2015             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2016
2017             /* Evaluate switch function */
2018             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2019             fvdw             = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2020             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
2021
2022             fscal            = fvdw;
2023
2024             fscal            = _mm_and_pd(fscal,cutoff_mask);
2025
2026             /* Update vectorial force */
2027             fix0             = _mm_macc_pd(dx00,fscal,fix0);
2028             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
2029             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
2030             
2031             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
2032             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
2033             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
2034
2035             }
2036
2037             /**************************
2038              * CALCULATE INTERACTIONS *
2039              **************************/
2040
2041             if (gmx_mm_any_lt(rsq11,rcutoff2))
2042             {
2043
2044             r11              = _mm_mul_pd(rsq11,rinv11);
2045
2046             /* EWALD ELECTROSTATICS */
2047
2048             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2049             ewrt             = _mm_mul_pd(r11,ewtabscale);
2050             ewitab           = _mm_cvttpd_epi32(ewrt);
2051 #ifdef __XOP__
2052             eweps            = _mm_frcz_pd(ewrt);
2053 #else
2054             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2055 #endif
2056             twoeweps         = _mm_add_pd(eweps,eweps);
2057             ewitab           = _mm_slli_epi32(ewitab,2);
2058             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2059             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2060             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2061             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2062             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2063             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2064             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2065             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2066             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2067             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2068
2069             d                = _mm_sub_pd(r11,rswitch);
2070             d                = _mm_max_pd(d,_mm_setzero_pd());
2071             d2               = _mm_mul_pd(d,d);
2072             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2073
2074             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2075
2076             /* Evaluate switch function */
2077             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2078             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2079             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2080
2081             fscal            = felec;
2082
2083             fscal            = _mm_and_pd(fscal,cutoff_mask);
2084
2085             /* Update vectorial force */
2086             fix1             = _mm_macc_pd(dx11,fscal,fix1);
2087             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
2088             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
2089             
2090             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
2091             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
2092             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
2093
2094             }
2095
2096             /**************************
2097              * CALCULATE INTERACTIONS *
2098              **************************/
2099
2100             if (gmx_mm_any_lt(rsq12,rcutoff2))
2101             {
2102
2103             r12              = _mm_mul_pd(rsq12,rinv12);
2104
2105             /* EWALD ELECTROSTATICS */
2106
2107             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2108             ewrt             = _mm_mul_pd(r12,ewtabscale);
2109             ewitab           = _mm_cvttpd_epi32(ewrt);
2110 #ifdef __XOP__
2111             eweps            = _mm_frcz_pd(ewrt);
2112 #else
2113             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2114 #endif
2115             twoeweps         = _mm_add_pd(eweps,eweps);
2116             ewitab           = _mm_slli_epi32(ewitab,2);
2117             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2118             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2119             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2120             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2121             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2122             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2123             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2124             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2125             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2126             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2127
2128             d                = _mm_sub_pd(r12,rswitch);
2129             d                = _mm_max_pd(d,_mm_setzero_pd());
2130             d2               = _mm_mul_pd(d,d);
2131             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2132
2133             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2134
2135             /* Evaluate switch function */
2136             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2137             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2138             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2139
2140             fscal            = felec;
2141
2142             fscal            = _mm_and_pd(fscal,cutoff_mask);
2143
2144             /* Update vectorial force */
2145             fix1             = _mm_macc_pd(dx12,fscal,fix1);
2146             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
2147             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
2148             
2149             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
2150             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
2151             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
2152
2153             }
2154
2155             /**************************
2156              * CALCULATE INTERACTIONS *
2157              **************************/
2158
2159             if (gmx_mm_any_lt(rsq13,rcutoff2))
2160             {
2161
2162             r13              = _mm_mul_pd(rsq13,rinv13);
2163
2164             /* EWALD ELECTROSTATICS */
2165
2166             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2167             ewrt             = _mm_mul_pd(r13,ewtabscale);
2168             ewitab           = _mm_cvttpd_epi32(ewrt);
2169 #ifdef __XOP__
2170             eweps            = _mm_frcz_pd(ewrt);
2171 #else
2172             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2173 #endif
2174             twoeweps         = _mm_add_pd(eweps,eweps);
2175             ewitab           = _mm_slli_epi32(ewitab,2);
2176             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2177             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2178             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2179             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2180             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2181             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2182             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2183             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2184             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2185             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2186
2187             d                = _mm_sub_pd(r13,rswitch);
2188             d                = _mm_max_pd(d,_mm_setzero_pd());
2189             d2               = _mm_mul_pd(d,d);
2190             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2191
2192             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2193
2194             /* Evaluate switch function */
2195             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2196             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2197             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
2198
2199             fscal            = felec;
2200
2201             fscal            = _mm_and_pd(fscal,cutoff_mask);
2202
2203             /* Update vectorial force */
2204             fix1             = _mm_macc_pd(dx13,fscal,fix1);
2205             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
2206             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
2207             
2208             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
2209             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
2210             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
2211
2212             }
2213
2214             /**************************
2215              * CALCULATE INTERACTIONS *
2216              **************************/
2217
2218             if (gmx_mm_any_lt(rsq21,rcutoff2))
2219             {
2220
2221             r21              = _mm_mul_pd(rsq21,rinv21);
2222
2223             /* EWALD ELECTROSTATICS */
2224
2225             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2226             ewrt             = _mm_mul_pd(r21,ewtabscale);
2227             ewitab           = _mm_cvttpd_epi32(ewrt);
2228 #ifdef __XOP__
2229             eweps            = _mm_frcz_pd(ewrt);
2230 #else
2231             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2232 #endif
2233             twoeweps         = _mm_add_pd(eweps,eweps);
2234             ewitab           = _mm_slli_epi32(ewitab,2);
2235             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2236             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2237             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2238             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2239             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2240             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2241             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2242             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2243             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2244             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2245
2246             d                = _mm_sub_pd(r21,rswitch);
2247             d                = _mm_max_pd(d,_mm_setzero_pd());
2248             d2               = _mm_mul_pd(d,d);
2249             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2250
2251             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2252
2253             /* Evaluate switch function */
2254             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2255             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2256             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2257
2258             fscal            = felec;
2259
2260             fscal            = _mm_and_pd(fscal,cutoff_mask);
2261
2262             /* Update vectorial force */
2263             fix2             = _mm_macc_pd(dx21,fscal,fix2);
2264             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
2265             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
2266             
2267             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
2268             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
2269             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
2270
2271             }
2272
2273             /**************************
2274              * CALCULATE INTERACTIONS *
2275              **************************/
2276
2277             if (gmx_mm_any_lt(rsq22,rcutoff2))
2278             {
2279
2280             r22              = _mm_mul_pd(rsq22,rinv22);
2281
2282             /* EWALD ELECTROSTATICS */
2283
2284             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2285             ewrt             = _mm_mul_pd(r22,ewtabscale);
2286             ewitab           = _mm_cvttpd_epi32(ewrt);
2287 #ifdef __XOP__
2288             eweps            = _mm_frcz_pd(ewrt);
2289 #else
2290             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2291 #endif
2292             twoeweps         = _mm_add_pd(eweps,eweps);
2293             ewitab           = _mm_slli_epi32(ewitab,2);
2294             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2295             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2296             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2297             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2298             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2299             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2300             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2301             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2302             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2303             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2304
2305             d                = _mm_sub_pd(r22,rswitch);
2306             d                = _mm_max_pd(d,_mm_setzero_pd());
2307             d2               = _mm_mul_pd(d,d);
2308             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2309
2310             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2311
2312             /* Evaluate switch function */
2313             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2314             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2315             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2316
2317             fscal            = felec;
2318
2319             fscal            = _mm_and_pd(fscal,cutoff_mask);
2320
2321             /* Update vectorial force */
2322             fix2             = _mm_macc_pd(dx22,fscal,fix2);
2323             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
2324             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
2325             
2326             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
2327             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
2328             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
2329
2330             }
2331
2332             /**************************
2333              * CALCULATE INTERACTIONS *
2334              **************************/
2335
2336             if (gmx_mm_any_lt(rsq23,rcutoff2))
2337             {
2338
2339             r23              = _mm_mul_pd(rsq23,rinv23);
2340
2341             /* EWALD ELECTROSTATICS */
2342
2343             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2344             ewrt             = _mm_mul_pd(r23,ewtabscale);
2345             ewitab           = _mm_cvttpd_epi32(ewrt);
2346 #ifdef __XOP__
2347             eweps            = _mm_frcz_pd(ewrt);
2348 #else
2349             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2350 #endif
2351             twoeweps         = _mm_add_pd(eweps,eweps);
2352             ewitab           = _mm_slli_epi32(ewitab,2);
2353             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2354             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2355             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2356             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2357             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2358             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2359             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2360             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2361             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
2362             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2363
2364             d                = _mm_sub_pd(r23,rswitch);
2365             d                = _mm_max_pd(d,_mm_setzero_pd());
2366             d2               = _mm_mul_pd(d,d);
2367             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2368
2369             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2370
2371             /* Evaluate switch function */
2372             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2373             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
2374             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
2375
2376             fscal            = felec;
2377
2378             fscal            = _mm_and_pd(fscal,cutoff_mask);
2379
2380             /* Update vectorial force */
2381             fix2             = _mm_macc_pd(dx23,fscal,fix2);
2382             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
2383             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
2384             
2385             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
2386             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
2387             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
2388
2389             }
2390
2391             /**************************
2392              * CALCULATE INTERACTIONS *
2393              **************************/
2394
2395             if (gmx_mm_any_lt(rsq31,rcutoff2))
2396             {
2397
2398             r31              = _mm_mul_pd(rsq31,rinv31);
2399
2400             /* EWALD ELECTROSTATICS */
2401
2402             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2403             ewrt             = _mm_mul_pd(r31,ewtabscale);
2404             ewitab           = _mm_cvttpd_epi32(ewrt);
2405 #ifdef __XOP__
2406             eweps            = _mm_frcz_pd(ewrt);
2407 #else
2408             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2409 #endif
2410             twoeweps         = _mm_add_pd(eweps,eweps);
2411             ewitab           = _mm_slli_epi32(ewitab,2);
2412             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2413             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2414             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2415             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2416             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2417             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2418             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2419             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2420             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
2421             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2422
2423             d                = _mm_sub_pd(r31,rswitch);
2424             d                = _mm_max_pd(d,_mm_setzero_pd());
2425             d2               = _mm_mul_pd(d,d);
2426             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2427
2428             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2429
2430             /* Evaluate switch function */
2431             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2432             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
2433             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
2434
2435             fscal            = felec;
2436
2437             fscal            = _mm_and_pd(fscal,cutoff_mask);
2438
2439             /* Update vectorial force */
2440             fix3             = _mm_macc_pd(dx31,fscal,fix3);
2441             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
2442             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
2443             
2444             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
2445             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
2446             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
2447
2448             }
2449
2450             /**************************
2451              * CALCULATE INTERACTIONS *
2452              **************************/
2453
2454             if (gmx_mm_any_lt(rsq32,rcutoff2))
2455             {
2456
2457             r32              = _mm_mul_pd(rsq32,rinv32);
2458
2459             /* EWALD ELECTROSTATICS */
2460
2461             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2462             ewrt             = _mm_mul_pd(r32,ewtabscale);
2463             ewitab           = _mm_cvttpd_epi32(ewrt);
2464 #ifdef __XOP__
2465             eweps            = _mm_frcz_pd(ewrt);
2466 #else
2467             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2468 #endif
2469             twoeweps         = _mm_add_pd(eweps,eweps);
2470             ewitab           = _mm_slli_epi32(ewitab,2);
2471             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2472             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2473             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2474             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2475             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2476             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2477             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2478             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2479             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
2480             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2481
2482             d                = _mm_sub_pd(r32,rswitch);
2483             d                = _mm_max_pd(d,_mm_setzero_pd());
2484             d2               = _mm_mul_pd(d,d);
2485             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2486
2487             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2488
2489             /* Evaluate switch function */
2490             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2491             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
2492             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
2493
2494             fscal            = felec;
2495
2496             fscal            = _mm_and_pd(fscal,cutoff_mask);
2497
2498             /* Update vectorial force */
2499             fix3             = _mm_macc_pd(dx32,fscal,fix3);
2500             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
2501             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
2502             
2503             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
2504             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
2505             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
2506
2507             }
2508
2509             /**************************
2510              * CALCULATE INTERACTIONS *
2511              **************************/
2512
2513             if (gmx_mm_any_lt(rsq33,rcutoff2))
2514             {
2515
2516             r33              = _mm_mul_pd(rsq33,rinv33);
2517
2518             /* EWALD ELECTROSTATICS */
2519
2520             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2521             ewrt             = _mm_mul_pd(r33,ewtabscale);
2522             ewitab           = _mm_cvttpd_epi32(ewrt);
2523 #ifdef __XOP__
2524             eweps            = _mm_frcz_pd(ewrt);
2525 #else
2526             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2527 #endif
2528             twoeweps         = _mm_add_pd(eweps,eweps);
2529             ewitab           = _mm_slli_epi32(ewitab,2);
2530             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2531             ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2532             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2533             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2534             ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2535             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2536             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2537             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2538             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
2539             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2540
2541             d                = _mm_sub_pd(r33,rswitch);
2542             d                = _mm_max_pd(d,_mm_setzero_pd());
2543             d2               = _mm_mul_pd(d,d);
2544             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2545
2546             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2547
2548             /* Evaluate switch function */
2549             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2550             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
2551             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
2552
2553             fscal            = felec;
2554
2555             fscal            = _mm_and_pd(fscal,cutoff_mask);
2556
2557             /* Update vectorial force */
2558             fix3             = _mm_macc_pd(dx33,fscal,fix3);
2559             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
2560             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
2561             
2562             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
2563             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
2564             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
2565
2566             }
2567
2568             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2569
2570             /* Inner loop uses 647 flops */
2571         }
2572
2573         if(jidx<j_index_end)
2574         {
2575
2576             jnrA             = jjnr[jidx];
2577             j_coord_offsetA  = DIM*jnrA;
2578
2579             /* load j atom coordinates */
2580             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2581                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2582                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2583
2584             /* Calculate displacement vector */
2585             dx00             = _mm_sub_pd(ix0,jx0);
2586             dy00             = _mm_sub_pd(iy0,jy0);
2587             dz00             = _mm_sub_pd(iz0,jz0);
2588             dx11             = _mm_sub_pd(ix1,jx1);
2589             dy11             = _mm_sub_pd(iy1,jy1);
2590             dz11             = _mm_sub_pd(iz1,jz1);
2591             dx12             = _mm_sub_pd(ix1,jx2);
2592             dy12             = _mm_sub_pd(iy1,jy2);
2593             dz12             = _mm_sub_pd(iz1,jz2);
2594             dx13             = _mm_sub_pd(ix1,jx3);
2595             dy13             = _mm_sub_pd(iy1,jy3);
2596             dz13             = _mm_sub_pd(iz1,jz3);
2597             dx21             = _mm_sub_pd(ix2,jx1);
2598             dy21             = _mm_sub_pd(iy2,jy1);
2599             dz21             = _mm_sub_pd(iz2,jz1);
2600             dx22             = _mm_sub_pd(ix2,jx2);
2601             dy22             = _mm_sub_pd(iy2,jy2);
2602             dz22             = _mm_sub_pd(iz2,jz2);
2603             dx23             = _mm_sub_pd(ix2,jx3);
2604             dy23             = _mm_sub_pd(iy2,jy3);
2605             dz23             = _mm_sub_pd(iz2,jz3);
2606             dx31             = _mm_sub_pd(ix3,jx1);
2607             dy31             = _mm_sub_pd(iy3,jy1);
2608             dz31             = _mm_sub_pd(iz3,jz1);
2609             dx32             = _mm_sub_pd(ix3,jx2);
2610             dy32             = _mm_sub_pd(iy3,jy2);
2611             dz32             = _mm_sub_pd(iz3,jz2);
2612             dx33             = _mm_sub_pd(ix3,jx3);
2613             dy33             = _mm_sub_pd(iy3,jy3);
2614             dz33             = _mm_sub_pd(iz3,jz3);
2615
2616             /* Calculate squared distance and things based on it */
2617             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2618             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2619             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2620             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2621             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2622             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2623             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2624             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2625             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2626             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2627
2628             rinv00           = gmx_mm_invsqrt_pd(rsq00);
2629             rinv11           = gmx_mm_invsqrt_pd(rsq11);
2630             rinv12           = gmx_mm_invsqrt_pd(rsq12);
2631             rinv13           = gmx_mm_invsqrt_pd(rsq13);
2632             rinv21           = gmx_mm_invsqrt_pd(rsq21);
2633             rinv22           = gmx_mm_invsqrt_pd(rsq22);
2634             rinv23           = gmx_mm_invsqrt_pd(rsq23);
2635             rinv31           = gmx_mm_invsqrt_pd(rsq31);
2636             rinv32           = gmx_mm_invsqrt_pd(rsq32);
2637             rinv33           = gmx_mm_invsqrt_pd(rsq33);
2638
2639             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
2640             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
2641             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
2642             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
2643             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
2644             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
2645             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
2646             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
2647             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
2648             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
2649
2650             fjx0             = _mm_setzero_pd();
2651             fjy0             = _mm_setzero_pd();
2652             fjz0             = _mm_setzero_pd();
2653             fjx1             = _mm_setzero_pd();
2654             fjy1             = _mm_setzero_pd();
2655             fjz1             = _mm_setzero_pd();
2656             fjx2             = _mm_setzero_pd();
2657             fjy2             = _mm_setzero_pd();
2658             fjz2             = _mm_setzero_pd();
2659             fjx3             = _mm_setzero_pd();
2660             fjy3             = _mm_setzero_pd();
2661             fjz3             = _mm_setzero_pd();
2662
2663             /**************************
2664              * CALCULATE INTERACTIONS *
2665              **************************/
2666
2667             if (gmx_mm_any_lt(rsq00,rcutoff2))
2668             {
2669
2670             r00              = _mm_mul_pd(rsq00,rinv00);
2671
2672             /* LENNARD-JONES DISPERSION/REPULSION */
2673
2674             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2675             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
2676             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2677             vvdw             = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
2678             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2679
2680             d                = _mm_sub_pd(r00,rswitch);
2681             d                = _mm_max_pd(d,_mm_setzero_pd());
2682             d2               = _mm_mul_pd(d,d);
2683             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2684
2685             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2686
2687             /* Evaluate switch function */
2688             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2689             fvdw             = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2690             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
2691
2692             fscal            = fvdw;
2693
2694             fscal            = _mm_and_pd(fscal,cutoff_mask);
2695
2696             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2697
2698             /* Update vectorial force */
2699             fix0             = _mm_macc_pd(dx00,fscal,fix0);
2700             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
2701             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
2702             
2703             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
2704             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
2705             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
2706
2707             }
2708
2709             /**************************
2710              * CALCULATE INTERACTIONS *
2711              **************************/
2712
2713             if (gmx_mm_any_lt(rsq11,rcutoff2))
2714             {
2715
2716             r11              = _mm_mul_pd(rsq11,rinv11);
2717
2718             /* EWALD ELECTROSTATICS */
2719
2720             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2721             ewrt             = _mm_mul_pd(r11,ewtabscale);
2722             ewitab           = _mm_cvttpd_epi32(ewrt);
2723 #ifdef __XOP__
2724             eweps            = _mm_frcz_pd(ewrt);
2725 #else
2726             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2727 #endif
2728             twoeweps         = _mm_add_pd(eweps,eweps);
2729             ewitab           = _mm_slli_epi32(ewitab,2);
2730             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2731             ewtabD           = _mm_setzero_pd();
2732             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2733             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2734             ewtabFn          = _mm_setzero_pd();
2735             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2736             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2737             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2738             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2739             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2740
2741             d                = _mm_sub_pd(r11,rswitch);
2742             d                = _mm_max_pd(d,_mm_setzero_pd());
2743             d2               = _mm_mul_pd(d,d);
2744             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2745
2746             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2747
2748             /* Evaluate switch function */
2749             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2750             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2751             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2752
2753             fscal            = felec;
2754
2755             fscal            = _mm_and_pd(fscal,cutoff_mask);
2756
2757             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2758
2759             /* Update vectorial force */
2760             fix1             = _mm_macc_pd(dx11,fscal,fix1);
2761             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
2762             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
2763             
2764             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
2765             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
2766             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
2767
2768             }
2769
2770             /**************************
2771              * CALCULATE INTERACTIONS *
2772              **************************/
2773
2774             if (gmx_mm_any_lt(rsq12,rcutoff2))
2775             {
2776
2777             r12              = _mm_mul_pd(rsq12,rinv12);
2778
2779             /* EWALD ELECTROSTATICS */
2780
2781             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2782             ewrt             = _mm_mul_pd(r12,ewtabscale);
2783             ewitab           = _mm_cvttpd_epi32(ewrt);
2784 #ifdef __XOP__
2785             eweps            = _mm_frcz_pd(ewrt);
2786 #else
2787             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2788 #endif
2789             twoeweps         = _mm_add_pd(eweps,eweps);
2790             ewitab           = _mm_slli_epi32(ewitab,2);
2791             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2792             ewtabD           = _mm_setzero_pd();
2793             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2794             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2795             ewtabFn          = _mm_setzero_pd();
2796             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2797             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2798             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2799             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2800             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2801
2802             d                = _mm_sub_pd(r12,rswitch);
2803             d                = _mm_max_pd(d,_mm_setzero_pd());
2804             d2               = _mm_mul_pd(d,d);
2805             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2806
2807             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2808
2809             /* Evaluate switch function */
2810             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2811             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2812             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2813
2814             fscal            = felec;
2815
2816             fscal            = _mm_and_pd(fscal,cutoff_mask);
2817
2818             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2819
2820             /* Update vectorial force */
2821             fix1             = _mm_macc_pd(dx12,fscal,fix1);
2822             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
2823             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
2824             
2825             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
2826             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
2827             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
2828
2829             }
2830
2831             /**************************
2832              * CALCULATE INTERACTIONS *
2833              **************************/
2834
2835             if (gmx_mm_any_lt(rsq13,rcutoff2))
2836             {
2837
2838             r13              = _mm_mul_pd(rsq13,rinv13);
2839
2840             /* EWALD ELECTROSTATICS */
2841
2842             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2843             ewrt             = _mm_mul_pd(r13,ewtabscale);
2844             ewitab           = _mm_cvttpd_epi32(ewrt);
2845 #ifdef __XOP__
2846             eweps            = _mm_frcz_pd(ewrt);
2847 #else
2848             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2849 #endif
2850             twoeweps         = _mm_add_pd(eweps,eweps);
2851             ewitab           = _mm_slli_epi32(ewitab,2);
2852             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2853             ewtabD           = _mm_setzero_pd();
2854             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2855             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2856             ewtabFn          = _mm_setzero_pd();
2857             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2858             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2859             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2860             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2861             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2862
2863             d                = _mm_sub_pd(r13,rswitch);
2864             d                = _mm_max_pd(d,_mm_setzero_pd());
2865             d2               = _mm_mul_pd(d,d);
2866             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2867
2868             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2869
2870             /* Evaluate switch function */
2871             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2872             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2873             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
2874
2875             fscal            = felec;
2876
2877             fscal            = _mm_and_pd(fscal,cutoff_mask);
2878
2879             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2880
2881             /* Update vectorial force */
2882             fix1             = _mm_macc_pd(dx13,fscal,fix1);
2883             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
2884             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
2885             
2886             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
2887             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
2888             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
2889
2890             }
2891
2892             /**************************
2893              * CALCULATE INTERACTIONS *
2894              **************************/
2895
2896             if (gmx_mm_any_lt(rsq21,rcutoff2))
2897             {
2898
2899             r21              = _mm_mul_pd(rsq21,rinv21);
2900
2901             /* EWALD ELECTROSTATICS */
2902
2903             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2904             ewrt             = _mm_mul_pd(r21,ewtabscale);
2905             ewitab           = _mm_cvttpd_epi32(ewrt);
2906 #ifdef __XOP__
2907             eweps            = _mm_frcz_pd(ewrt);
2908 #else
2909             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2910 #endif
2911             twoeweps         = _mm_add_pd(eweps,eweps);
2912             ewitab           = _mm_slli_epi32(ewitab,2);
2913             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2914             ewtabD           = _mm_setzero_pd();
2915             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2916             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2917             ewtabFn          = _mm_setzero_pd();
2918             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2919             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2920             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2921             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2922             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2923
2924             d                = _mm_sub_pd(r21,rswitch);
2925             d                = _mm_max_pd(d,_mm_setzero_pd());
2926             d2               = _mm_mul_pd(d,d);
2927             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2928
2929             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2930
2931             /* Evaluate switch function */
2932             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2933             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2934             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2935
2936             fscal            = felec;
2937
2938             fscal            = _mm_and_pd(fscal,cutoff_mask);
2939
2940             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2941
2942             /* Update vectorial force */
2943             fix2             = _mm_macc_pd(dx21,fscal,fix2);
2944             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
2945             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
2946             
2947             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
2948             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
2949             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
2950
2951             }
2952
2953             /**************************
2954              * CALCULATE INTERACTIONS *
2955              **************************/
2956
2957             if (gmx_mm_any_lt(rsq22,rcutoff2))
2958             {
2959
2960             r22              = _mm_mul_pd(rsq22,rinv22);
2961
2962             /* EWALD ELECTROSTATICS */
2963
2964             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2965             ewrt             = _mm_mul_pd(r22,ewtabscale);
2966             ewitab           = _mm_cvttpd_epi32(ewrt);
2967 #ifdef __XOP__
2968             eweps            = _mm_frcz_pd(ewrt);
2969 #else
2970             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2971 #endif
2972             twoeweps         = _mm_add_pd(eweps,eweps);
2973             ewitab           = _mm_slli_epi32(ewitab,2);
2974             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2975             ewtabD           = _mm_setzero_pd();
2976             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2977             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2978             ewtabFn          = _mm_setzero_pd();
2979             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2980             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
2981             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2982             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2983             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2984
2985             d                = _mm_sub_pd(r22,rswitch);
2986             d                = _mm_max_pd(d,_mm_setzero_pd());
2987             d2               = _mm_mul_pd(d,d);
2988             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2989
2990             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2991
2992             /* Evaluate switch function */
2993             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2994             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2995             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2996
2997             fscal            = felec;
2998
2999             fscal            = _mm_and_pd(fscal,cutoff_mask);
3000
3001             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3002
3003             /* Update vectorial force */
3004             fix2             = _mm_macc_pd(dx22,fscal,fix2);
3005             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
3006             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
3007             
3008             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
3009             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
3010             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
3011
3012             }
3013
3014             /**************************
3015              * CALCULATE INTERACTIONS *
3016              **************************/
3017
3018             if (gmx_mm_any_lt(rsq23,rcutoff2))
3019             {
3020
3021             r23              = _mm_mul_pd(rsq23,rinv23);
3022
3023             /* EWALD ELECTROSTATICS */
3024
3025             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3026             ewrt             = _mm_mul_pd(r23,ewtabscale);
3027             ewitab           = _mm_cvttpd_epi32(ewrt);
3028 #ifdef __XOP__
3029             eweps            = _mm_frcz_pd(ewrt);
3030 #else
3031             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3032 #endif
3033             twoeweps         = _mm_add_pd(eweps,eweps);
3034             ewitab           = _mm_slli_epi32(ewitab,2);
3035             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3036             ewtabD           = _mm_setzero_pd();
3037             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3038             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
3039             ewtabFn          = _mm_setzero_pd();
3040             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3041             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
3042             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
3043             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
3044             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
3045
3046             d                = _mm_sub_pd(r23,rswitch);
3047             d                = _mm_max_pd(d,_mm_setzero_pd());
3048             d2               = _mm_mul_pd(d,d);
3049             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
3050
3051             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
3052
3053             /* Evaluate switch function */
3054             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3055             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
3056             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
3057
3058             fscal            = felec;
3059
3060             fscal            = _mm_and_pd(fscal,cutoff_mask);
3061
3062             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3063
3064             /* Update vectorial force */
3065             fix2             = _mm_macc_pd(dx23,fscal,fix2);
3066             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
3067             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
3068             
3069             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
3070             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
3071             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
3072
3073             }
3074
3075             /**************************
3076              * CALCULATE INTERACTIONS *
3077              **************************/
3078
3079             if (gmx_mm_any_lt(rsq31,rcutoff2))
3080             {
3081
3082             r31              = _mm_mul_pd(rsq31,rinv31);
3083
3084             /* EWALD ELECTROSTATICS */
3085
3086             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3087             ewrt             = _mm_mul_pd(r31,ewtabscale);
3088             ewitab           = _mm_cvttpd_epi32(ewrt);
3089 #ifdef __XOP__
3090             eweps            = _mm_frcz_pd(ewrt);
3091 #else
3092             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3093 #endif
3094             twoeweps         = _mm_add_pd(eweps,eweps);
3095             ewitab           = _mm_slli_epi32(ewitab,2);
3096             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3097             ewtabD           = _mm_setzero_pd();
3098             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3099             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
3100             ewtabFn          = _mm_setzero_pd();
3101             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3102             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
3103             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
3104             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
3105             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
3106
3107             d                = _mm_sub_pd(r31,rswitch);
3108             d                = _mm_max_pd(d,_mm_setzero_pd());
3109             d2               = _mm_mul_pd(d,d);
3110             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
3111
3112             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
3113
3114             /* Evaluate switch function */
3115             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3116             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
3117             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
3118
3119             fscal            = felec;
3120
3121             fscal            = _mm_and_pd(fscal,cutoff_mask);
3122
3123             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3124
3125             /* Update vectorial force */
3126             fix3             = _mm_macc_pd(dx31,fscal,fix3);
3127             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
3128             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
3129             
3130             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
3131             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
3132             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
3133
3134             }
3135
3136             /**************************
3137              * CALCULATE INTERACTIONS *
3138              **************************/
3139
3140             if (gmx_mm_any_lt(rsq32,rcutoff2))
3141             {
3142
3143             r32              = _mm_mul_pd(rsq32,rinv32);
3144
3145             /* EWALD ELECTROSTATICS */
3146
3147             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3148             ewrt             = _mm_mul_pd(r32,ewtabscale);
3149             ewitab           = _mm_cvttpd_epi32(ewrt);
3150 #ifdef __XOP__
3151             eweps            = _mm_frcz_pd(ewrt);
3152 #else
3153             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3154 #endif
3155             twoeweps         = _mm_add_pd(eweps,eweps);
3156             ewitab           = _mm_slli_epi32(ewitab,2);
3157             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3158             ewtabD           = _mm_setzero_pd();
3159             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3160             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
3161             ewtabFn          = _mm_setzero_pd();
3162             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3163             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
3164             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
3165             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
3166             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
3167
3168             d                = _mm_sub_pd(r32,rswitch);
3169             d                = _mm_max_pd(d,_mm_setzero_pd());
3170             d2               = _mm_mul_pd(d,d);
3171             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
3172
3173             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
3174
3175             /* Evaluate switch function */
3176             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3177             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
3178             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
3179
3180             fscal            = felec;
3181
3182             fscal            = _mm_and_pd(fscal,cutoff_mask);
3183
3184             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3185
3186             /* Update vectorial force */
3187             fix3             = _mm_macc_pd(dx32,fscal,fix3);
3188             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
3189             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
3190             
3191             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
3192             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
3193             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
3194
3195             }
3196
3197             /**************************
3198              * CALCULATE INTERACTIONS *
3199              **************************/
3200
3201             if (gmx_mm_any_lt(rsq33,rcutoff2))
3202             {
3203
3204             r33              = _mm_mul_pd(rsq33,rinv33);
3205
3206             /* EWALD ELECTROSTATICS */
3207
3208             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3209             ewrt             = _mm_mul_pd(r33,ewtabscale);
3210             ewitab           = _mm_cvttpd_epi32(ewrt);
3211 #ifdef __XOP__
3212             eweps            = _mm_frcz_pd(ewrt);
3213 #else
3214             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3215 #endif
3216             twoeweps         = _mm_add_pd(eweps,eweps);
3217             ewitab           = _mm_slli_epi32(ewitab,2);
3218             ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3219             ewtabD           = _mm_setzero_pd();
3220             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3221             ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
3222             ewtabFn          = _mm_setzero_pd();
3223             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3224             felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
3225             velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
3226             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
3227             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
3228
3229             d                = _mm_sub_pd(r33,rswitch);
3230             d                = _mm_max_pd(d,_mm_setzero_pd());
3231             d2               = _mm_mul_pd(d,d);
3232             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
3233
3234             dsw              = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
3235
3236             /* Evaluate switch function */
3237             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3238             felec            = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
3239             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
3240
3241             fscal            = felec;
3242
3243             fscal            = _mm_and_pd(fscal,cutoff_mask);
3244
3245             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3246
3247             /* Update vectorial force */
3248             fix3             = _mm_macc_pd(dx33,fscal,fix3);
3249             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
3250             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
3251             
3252             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
3253             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
3254             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
3255
3256             }
3257
3258             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3259
3260             /* Inner loop uses 647 flops */
3261         }
3262
3263         /* End of innermost loop */
3264
3265         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3266                                               f+i_coord_offset,fshift+i_shift_offset);
3267
3268         /* Increment number of inner iterations */
3269         inneriter                  += j_index_end - j_index_start;
3270
3271         /* Outer loop uses 24 flops */
3272     }
3273
3274     /* Increment number of outer iterations */
3275     outeriter        += nri;
3276
3277     /* Update outer/inner flops */
3278
3279     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*647);
3280 }