70b5ba5a37ab495849066b8c0a266816995de52f
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_avx_128_fma_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
48 #include "kernelutil_x86_avx_128_fma_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
52  * Electrostatics interaction: CubicSplineTable
53  * VdW interaction:            CubicSplineTable
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB;
75     int              j_coord_offsetA,j_coord_offsetB;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80     int              vdwioffset0;
81     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82     int              vdwioffset1;
83     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84     int              vdwioffset2;
85     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86     int              vdwjidx0A,vdwjidx0B;
87     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
88     int              vdwjidx1A,vdwjidx1B;
89     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
90     int              vdwjidx2A,vdwjidx2B;
91     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
92     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
94     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
95     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
96     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
99     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
102     real             *charge;
103     int              nvdwtype;
104     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
105     int              *vdwtype;
106     real             *vdwparam;
107     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
108     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
109     __m128i          vfitab;
110     __m128i          ifour       = _mm_set1_epi32(4);
111     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
112     real             *vftab;
113     __m128d          dummy_mask,cutoff_mask;
114     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
115     __m128d          one     = _mm_set1_pd(1.0);
116     __m128d          two     = _mm_set1_pd(2.0);
117     x                = xx[0];
118     f                = ff[0];
119
120     nri              = nlist->nri;
121     iinr             = nlist->iinr;
122     jindex           = nlist->jindex;
123     jjnr             = nlist->jjnr;
124     shiftidx         = nlist->shift;
125     gid              = nlist->gid;
126     shiftvec         = fr->shift_vec[0];
127     fshift           = fr->fshift[0];
128     facel            = _mm_set1_pd(fr->epsfac);
129     charge           = mdatoms->chargeA;
130     nvdwtype         = fr->ntype;
131     vdwparam         = fr->nbfp;
132     vdwtype          = mdatoms->typeA;
133
134     vftab            = kernel_data->table_elec_vdw->data;
135     vftabscale       = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
136
137     /* Setup water-specific parameters */
138     inr              = nlist->iinr[0];
139     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
140     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
141     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
142     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
143
144     jq0              = _mm_set1_pd(charge[inr+0]);
145     jq1              = _mm_set1_pd(charge[inr+1]);
146     jq2              = _mm_set1_pd(charge[inr+2]);
147     vdwjidx0A        = 2*vdwtype[inr+0];
148     qq00             = _mm_mul_pd(iq0,jq0);
149     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
150     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
151     qq01             = _mm_mul_pd(iq0,jq1);
152     qq02             = _mm_mul_pd(iq0,jq2);
153     qq10             = _mm_mul_pd(iq1,jq0);
154     qq11             = _mm_mul_pd(iq1,jq1);
155     qq12             = _mm_mul_pd(iq1,jq2);
156     qq20             = _mm_mul_pd(iq2,jq0);
157     qq21             = _mm_mul_pd(iq2,jq1);
158     qq22             = _mm_mul_pd(iq2,jq2);
159
160     /* Avoid stupid compiler warnings */
161     jnrA = jnrB = 0;
162     j_coord_offsetA = 0;
163     j_coord_offsetB = 0;
164
165     outeriter        = 0;
166     inneriter        = 0;
167
168     /* Start outer loop over neighborlists */
169     for(iidx=0; iidx<nri; iidx++)
170     {
171         /* Load shift vector for this list */
172         i_shift_offset   = DIM*shiftidx[iidx];
173
174         /* Load limits for loop over neighbors */
175         j_index_start    = jindex[iidx];
176         j_index_end      = jindex[iidx+1];
177
178         /* Get outer coordinate index */
179         inr              = iinr[iidx];
180         i_coord_offset   = DIM*inr;
181
182         /* Load i particle coords and add shift vector */
183         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
184                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
185
186         fix0             = _mm_setzero_pd();
187         fiy0             = _mm_setzero_pd();
188         fiz0             = _mm_setzero_pd();
189         fix1             = _mm_setzero_pd();
190         fiy1             = _mm_setzero_pd();
191         fiz1             = _mm_setzero_pd();
192         fix2             = _mm_setzero_pd();
193         fiy2             = _mm_setzero_pd();
194         fiz2             = _mm_setzero_pd();
195
196         /* Reset potential sums */
197         velecsum         = _mm_setzero_pd();
198         vvdwsum          = _mm_setzero_pd();
199
200         /* Start inner kernel loop */
201         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
202         {
203
204             /* Get j neighbor index, and coordinate index */
205             jnrA             = jjnr[jidx];
206             jnrB             = jjnr[jidx+1];
207             j_coord_offsetA  = DIM*jnrA;
208             j_coord_offsetB  = DIM*jnrB;
209
210             /* load j atom coordinates */
211             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
212                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
213
214             /* Calculate displacement vector */
215             dx00             = _mm_sub_pd(ix0,jx0);
216             dy00             = _mm_sub_pd(iy0,jy0);
217             dz00             = _mm_sub_pd(iz0,jz0);
218             dx01             = _mm_sub_pd(ix0,jx1);
219             dy01             = _mm_sub_pd(iy0,jy1);
220             dz01             = _mm_sub_pd(iz0,jz1);
221             dx02             = _mm_sub_pd(ix0,jx2);
222             dy02             = _mm_sub_pd(iy0,jy2);
223             dz02             = _mm_sub_pd(iz0,jz2);
224             dx10             = _mm_sub_pd(ix1,jx0);
225             dy10             = _mm_sub_pd(iy1,jy0);
226             dz10             = _mm_sub_pd(iz1,jz0);
227             dx11             = _mm_sub_pd(ix1,jx1);
228             dy11             = _mm_sub_pd(iy1,jy1);
229             dz11             = _mm_sub_pd(iz1,jz1);
230             dx12             = _mm_sub_pd(ix1,jx2);
231             dy12             = _mm_sub_pd(iy1,jy2);
232             dz12             = _mm_sub_pd(iz1,jz2);
233             dx20             = _mm_sub_pd(ix2,jx0);
234             dy20             = _mm_sub_pd(iy2,jy0);
235             dz20             = _mm_sub_pd(iz2,jz0);
236             dx21             = _mm_sub_pd(ix2,jx1);
237             dy21             = _mm_sub_pd(iy2,jy1);
238             dz21             = _mm_sub_pd(iz2,jz1);
239             dx22             = _mm_sub_pd(ix2,jx2);
240             dy22             = _mm_sub_pd(iy2,jy2);
241             dz22             = _mm_sub_pd(iz2,jz2);
242
243             /* Calculate squared distance and things based on it */
244             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
245             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
246             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
247             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
248             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
249             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
250             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
251             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
252             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
253
254             rinv00           = gmx_mm_invsqrt_pd(rsq00);
255             rinv01           = gmx_mm_invsqrt_pd(rsq01);
256             rinv02           = gmx_mm_invsqrt_pd(rsq02);
257             rinv10           = gmx_mm_invsqrt_pd(rsq10);
258             rinv11           = gmx_mm_invsqrt_pd(rsq11);
259             rinv12           = gmx_mm_invsqrt_pd(rsq12);
260             rinv20           = gmx_mm_invsqrt_pd(rsq20);
261             rinv21           = gmx_mm_invsqrt_pd(rsq21);
262             rinv22           = gmx_mm_invsqrt_pd(rsq22);
263
264             fjx0             = _mm_setzero_pd();
265             fjy0             = _mm_setzero_pd();
266             fjz0             = _mm_setzero_pd();
267             fjx1             = _mm_setzero_pd();
268             fjy1             = _mm_setzero_pd();
269             fjz1             = _mm_setzero_pd();
270             fjx2             = _mm_setzero_pd();
271             fjy2             = _mm_setzero_pd();
272             fjz2             = _mm_setzero_pd();
273
274             /**************************
275              * CALCULATE INTERACTIONS *
276              **************************/
277
278             r00              = _mm_mul_pd(rsq00,rinv00);
279
280             /* Calculate table index by multiplying r with table scale and truncate to integer */
281             rt               = _mm_mul_pd(r00,vftabscale);
282             vfitab           = _mm_cvttpd_epi32(rt);
283 #ifdef __XOP__
284             vfeps            = _mm_frcz_pd(rt);
285 #else
286             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
287 #endif
288             twovfeps         = _mm_add_pd(vfeps,vfeps);
289             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
290
291             /* CUBIC SPLINE TABLE ELECTROSTATICS */
292             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
293             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
294             GMX_MM_TRANSPOSE2_PD(Y,F);
295             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
296             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
297             GMX_MM_TRANSPOSE2_PD(G,H);
298             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
299             VV               = _mm_macc_pd(vfeps,Fp,Y);
300             velec            = _mm_mul_pd(qq00,VV);
301             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
302             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
303
304             /* CUBIC SPLINE TABLE DISPERSION */
305             vfitab           = _mm_add_epi32(vfitab,ifour);
306             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
307             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
308             GMX_MM_TRANSPOSE2_PD(Y,F);
309             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
310             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
311             GMX_MM_TRANSPOSE2_PD(G,H);
312             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
313             VV               = _mm_macc_pd(vfeps,Fp,Y);
314             vvdw6            = _mm_mul_pd(c6_00,VV);
315             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
316             fvdw6            = _mm_mul_pd(c6_00,FF);
317
318             /* CUBIC SPLINE TABLE REPULSION */
319             vfitab           = _mm_add_epi32(vfitab,ifour);
320             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
321             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
322             GMX_MM_TRANSPOSE2_PD(Y,F);
323             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
324             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
325             GMX_MM_TRANSPOSE2_PD(G,H);
326             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
327             VV               = _mm_macc_pd(vfeps,Fp,Y);
328             vvdw12           = _mm_mul_pd(c12_00,VV);
329             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
330             fvdw12           = _mm_mul_pd(c12_00,FF);
331             vvdw             = _mm_add_pd(vvdw12,vvdw6);
332             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
333
334             /* Update potential sum for this i atom from the interaction with this j atom. */
335             velecsum         = _mm_add_pd(velecsum,velec);
336             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
337
338             fscal            = _mm_add_pd(felec,fvdw);
339
340             /* Update vectorial force */
341             fix0             = _mm_macc_pd(dx00,fscal,fix0);
342             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
343             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
344             
345             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
346             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
347             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
348
349             /**************************
350              * CALCULATE INTERACTIONS *
351              **************************/
352
353             r01              = _mm_mul_pd(rsq01,rinv01);
354
355             /* Calculate table index by multiplying r with table scale and truncate to integer */
356             rt               = _mm_mul_pd(r01,vftabscale);
357             vfitab           = _mm_cvttpd_epi32(rt);
358 #ifdef __XOP__
359             vfeps            = _mm_frcz_pd(rt);
360 #else
361             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
362 #endif
363             twovfeps         = _mm_add_pd(vfeps,vfeps);
364             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
365
366             /* CUBIC SPLINE TABLE ELECTROSTATICS */
367             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
368             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
369             GMX_MM_TRANSPOSE2_PD(Y,F);
370             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
371             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
372             GMX_MM_TRANSPOSE2_PD(G,H);
373             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
374             VV               = _mm_macc_pd(vfeps,Fp,Y);
375             velec            = _mm_mul_pd(qq01,VV);
376             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
377             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
378
379             /* Update potential sum for this i atom from the interaction with this j atom. */
380             velecsum         = _mm_add_pd(velecsum,velec);
381
382             fscal            = felec;
383
384             /* Update vectorial force */
385             fix0             = _mm_macc_pd(dx01,fscal,fix0);
386             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
387             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
388             
389             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
390             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
391             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
392
393             /**************************
394              * CALCULATE INTERACTIONS *
395              **************************/
396
397             r02              = _mm_mul_pd(rsq02,rinv02);
398
399             /* Calculate table index by multiplying r with table scale and truncate to integer */
400             rt               = _mm_mul_pd(r02,vftabscale);
401             vfitab           = _mm_cvttpd_epi32(rt);
402 #ifdef __XOP__
403             vfeps            = _mm_frcz_pd(rt);
404 #else
405             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
406 #endif
407             twovfeps         = _mm_add_pd(vfeps,vfeps);
408             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
409
410             /* CUBIC SPLINE TABLE ELECTROSTATICS */
411             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
412             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
413             GMX_MM_TRANSPOSE2_PD(Y,F);
414             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
415             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
416             GMX_MM_TRANSPOSE2_PD(G,H);
417             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
418             VV               = _mm_macc_pd(vfeps,Fp,Y);
419             velec            = _mm_mul_pd(qq02,VV);
420             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
421             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
422
423             /* Update potential sum for this i atom from the interaction with this j atom. */
424             velecsum         = _mm_add_pd(velecsum,velec);
425
426             fscal            = felec;
427
428             /* Update vectorial force */
429             fix0             = _mm_macc_pd(dx02,fscal,fix0);
430             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
431             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
432             
433             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
434             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
435             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
436
437             /**************************
438              * CALCULATE INTERACTIONS *
439              **************************/
440
441             r10              = _mm_mul_pd(rsq10,rinv10);
442
443             /* Calculate table index by multiplying r with table scale and truncate to integer */
444             rt               = _mm_mul_pd(r10,vftabscale);
445             vfitab           = _mm_cvttpd_epi32(rt);
446 #ifdef __XOP__
447             vfeps            = _mm_frcz_pd(rt);
448 #else
449             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
450 #endif
451             twovfeps         = _mm_add_pd(vfeps,vfeps);
452             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
453
454             /* CUBIC SPLINE TABLE ELECTROSTATICS */
455             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
456             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
457             GMX_MM_TRANSPOSE2_PD(Y,F);
458             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
459             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
460             GMX_MM_TRANSPOSE2_PD(G,H);
461             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
462             VV               = _mm_macc_pd(vfeps,Fp,Y);
463             velec            = _mm_mul_pd(qq10,VV);
464             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
465             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
466
467             /* Update potential sum for this i atom from the interaction with this j atom. */
468             velecsum         = _mm_add_pd(velecsum,velec);
469
470             fscal            = felec;
471
472             /* Update vectorial force */
473             fix1             = _mm_macc_pd(dx10,fscal,fix1);
474             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
475             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
476             
477             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
478             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
479             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
480
481             /**************************
482              * CALCULATE INTERACTIONS *
483              **************************/
484
485             r11              = _mm_mul_pd(rsq11,rinv11);
486
487             /* Calculate table index by multiplying r with table scale and truncate to integer */
488             rt               = _mm_mul_pd(r11,vftabscale);
489             vfitab           = _mm_cvttpd_epi32(rt);
490 #ifdef __XOP__
491             vfeps            = _mm_frcz_pd(rt);
492 #else
493             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
494 #endif
495             twovfeps         = _mm_add_pd(vfeps,vfeps);
496             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
497
498             /* CUBIC SPLINE TABLE ELECTROSTATICS */
499             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
500             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
501             GMX_MM_TRANSPOSE2_PD(Y,F);
502             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
503             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
504             GMX_MM_TRANSPOSE2_PD(G,H);
505             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
506             VV               = _mm_macc_pd(vfeps,Fp,Y);
507             velec            = _mm_mul_pd(qq11,VV);
508             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
509             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
510
511             /* Update potential sum for this i atom from the interaction with this j atom. */
512             velecsum         = _mm_add_pd(velecsum,velec);
513
514             fscal            = felec;
515
516             /* Update vectorial force */
517             fix1             = _mm_macc_pd(dx11,fscal,fix1);
518             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
519             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
520             
521             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
522             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
523             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
524
525             /**************************
526              * CALCULATE INTERACTIONS *
527              **************************/
528
529             r12              = _mm_mul_pd(rsq12,rinv12);
530
531             /* Calculate table index by multiplying r with table scale and truncate to integer */
532             rt               = _mm_mul_pd(r12,vftabscale);
533             vfitab           = _mm_cvttpd_epi32(rt);
534 #ifdef __XOP__
535             vfeps            = _mm_frcz_pd(rt);
536 #else
537             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
538 #endif
539             twovfeps         = _mm_add_pd(vfeps,vfeps);
540             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
541
542             /* CUBIC SPLINE TABLE ELECTROSTATICS */
543             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
544             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
545             GMX_MM_TRANSPOSE2_PD(Y,F);
546             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
547             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
548             GMX_MM_TRANSPOSE2_PD(G,H);
549             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
550             VV               = _mm_macc_pd(vfeps,Fp,Y);
551             velec            = _mm_mul_pd(qq12,VV);
552             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
553             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
554
555             /* Update potential sum for this i atom from the interaction with this j atom. */
556             velecsum         = _mm_add_pd(velecsum,velec);
557
558             fscal            = felec;
559
560             /* Update vectorial force */
561             fix1             = _mm_macc_pd(dx12,fscal,fix1);
562             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
563             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
564             
565             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
566             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
567             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
568
569             /**************************
570              * CALCULATE INTERACTIONS *
571              **************************/
572
573             r20              = _mm_mul_pd(rsq20,rinv20);
574
575             /* Calculate table index by multiplying r with table scale and truncate to integer */
576             rt               = _mm_mul_pd(r20,vftabscale);
577             vfitab           = _mm_cvttpd_epi32(rt);
578 #ifdef __XOP__
579             vfeps            = _mm_frcz_pd(rt);
580 #else
581             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
582 #endif
583             twovfeps         = _mm_add_pd(vfeps,vfeps);
584             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
585
586             /* CUBIC SPLINE TABLE ELECTROSTATICS */
587             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
588             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
589             GMX_MM_TRANSPOSE2_PD(Y,F);
590             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
591             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
592             GMX_MM_TRANSPOSE2_PD(G,H);
593             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
594             VV               = _mm_macc_pd(vfeps,Fp,Y);
595             velec            = _mm_mul_pd(qq20,VV);
596             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
597             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
598
599             /* Update potential sum for this i atom from the interaction with this j atom. */
600             velecsum         = _mm_add_pd(velecsum,velec);
601
602             fscal            = felec;
603
604             /* Update vectorial force */
605             fix2             = _mm_macc_pd(dx20,fscal,fix2);
606             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
607             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
608             
609             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
610             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
611             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
612
613             /**************************
614              * CALCULATE INTERACTIONS *
615              **************************/
616
617             r21              = _mm_mul_pd(rsq21,rinv21);
618
619             /* Calculate table index by multiplying r with table scale and truncate to integer */
620             rt               = _mm_mul_pd(r21,vftabscale);
621             vfitab           = _mm_cvttpd_epi32(rt);
622 #ifdef __XOP__
623             vfeps            = _mm_frcz_pd(rt);
624 #else
625             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
626 #endif
627             twovfeps         = _mm_add_pd(vfeps,vfeps);
628             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
629
630             /* CUBIC SPLINE TABLE ELECTROSTATICS */
631             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
632             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
633             GMX_MM_TRANSPOSE2_PD(Y,F);
634             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
635             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
636             GMX_MM_TRANSPOSE2_PD(G,H);
637             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
638             VV               = _mm_macc_pd(vfeps,Fp,Y);
639             velec            = _mm_mul_pd(qq21,VV);
640             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
641             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
642
643             /* Update potential sum for this i atom from the interaction with this j atom. */
644             velecsum         = _mm_add_pd(velecsum,velec);
645
646             fscal            = felec;
647
648             /* Update vectorial force */
649             fix2             = _mm_macc_pd(dx21,fscal,fix2);
650             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
651             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
652             
653             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
654             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
655             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
656
657             /**************************
658              * CALCULATE INTERACTIONS *
659              **************************/
660
661             r22              = _mm_mul_pd(rsq22,rinv22);
662
663             /* Calculate table index by multiplying r with table scale and truncate to integer */
664             rt               = _mm_mul_pd(r22,vftabscale);
665             vfitab           = _mm_cvttpd_epi32(rt);
666 #ifdef __XOP__
667             vfeps            = _mm_frcz_pd(rt);
668 #else
669             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
670 #endif
671             twovfeps         = _mm_add_pd(vfeps,vfeps);
672             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
673
674             /* CUBIC SPLINE TABLE ELECTROSTATICS */
675             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
676             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
677             GMX_MM_TRANSPOSE2_PD(Y,F);
678             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
679             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
680             GMX_MM_TRANSPOSE2_PD(G,H);
681             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
682             VV               = _mm_macc_pd(vfeps,Fp,Y);
683             velec            = _mm_mul_pd(qq22,VV);
684             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
685             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
686
687             /* Update potential sum for this i atom from the interaction with this j atom. */
688             velecsum         = _mm_add_pd(velecsum,velec);
689
690             fscal            = felec;
691
692             /* Update vectorial force */
693             fix2             = _mm_macc_pd(dx22,fscal,fix2);
694             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
695             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
696             
697             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
698             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
699             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
700
701             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
702
703             /* Inner loop uses 444 flops */
704         }
705
706         if(jidx<j_index_end)
707         {
708
709             jnrA             = jjnr[jidx];
710             j_coord_offsetA  = DIM*jnrA;
711
712             /* load j atom coordinates */
713             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
714                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
715
716             /* Calculate displacement vector */
717             dx00             = _mm_sub_pd(ix0,jx0);
718             dy00             = _mm_sub_pd(iy0,jy0);
719             dz00             = _mm_sub_pd(iz0,jz0);
720             dx01             = _mm_sub_pd(ix0,jx1);
721             dy01             = _mm_sub_pd(iy0,jy1);
722             dz01             = _mm_sub_pd(iz0,jz1);
723             dx02             = _mm_sub_pd(ix0,jx2);
724             dy02             = _mm_sub_pd(iy0,jy2);
725             dz02             = _mm_sub_pd(iz0,jz2);
726             dx10             = _mm_sub_pd(ix1,jx0);
727             dy10             = _mm_sub_pd(iy1,jy0);
728             dz10             = _mm_sub_pd(iz1,jz0);
729             dx11             = _mm_sub_pd(ix1,jx1);
730             dy11             = _mm_sub_pd(iy1,jy1);
731             dz11             = _mm_sub_pd(iz1,jz1);
732             dx12             = _mm_sub_pd(ix1,jx2);
733             dy12             = _mm_sub_pd(iy1,jy2);
734             dz12             = _mm_sub_pd(iz1,jz2);
735             dx20             = _mm_sub_pd(ix2,jx0);
736             dy20             = _mm_sub_pd(iy2,jy0);
737             dz20             = _mm_sub_pd(iz2,jz0);
738             dx21             = _mm_sub_pd(ix2,jx1);
739             dy21             = _mm_sub_pd(iy2,jy1);
740             dz21             = _mm_sub_pd(iz2,jz1);
741             dx22             = _mm_sub_pd(ix2,jx2);
742             dy22             = _mm_sub_pd(iy2,jy2);
743             dz22             = _mm_sub_pd(iz2,jz2);
744
745             /* Calculate squared distance and things based on it */
746             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
747             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
748             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
749             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
750             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
751             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
752             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
753             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
754             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
755
756             rinv00           = gmx_mm_invsqrt_pd(rsq00);
757             rinv01           = gmx_mm_invsqrt_pd(rsq01);
758             rinv02           = gmx_mm_invsqrt_pd(rsq02);
759             rinv10           = gmx_mm_invsqrt_pd(rsq10);
760             rinv11           = gmx_mm_invsqrt_pd(rsq11);
761             rinv12           = gmx_mm_invsqrt_pd(rsq12);
762             rinv20           = gmx_mm_invsqrt_pd(rsq20);
763             rinv21           = gmx_mm_invsqrt_pd(rsq21);
764             rinv22           = gmx_mm_invsqrt_pd(rsq22);
765
766             fjx0             = _mm_setzero_pd();
767             fjy0             = _mm_setzero_pd();
768             fjz0             = _mm_setzero_pd();
769             fjx1             = _mm_setzero_pd();
770             fjy1             = _mm_setzero_pd();
771             fjz1             = _mm_setzero_pd();
772             fjx2             = _mm_setzero_pd();
773             fjy2             = _mm_setzero_pd();
774             fjz2             = _mm_setzero_pd();
775
776             /**************************
777              * CALCULATE INTERACTIONS *
778              **************************/
779
780             r00              = _mm_mul_pd(rsq00,rinv00);
781
782             /* Calculate table index by multiplying r with table scale and truncate to integer */
783             rt               = _mm_mul_pd(r00,vftabscale);
784             vfitab           = _mm_cvttpd_epi32(rt);
785 #ifdef __XOP__
786             vfeps            = _mm_frcz_pd(rt);
787 #else
788             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
789 #endif
790             twovfeps         = _mm_add_pd(vfeps,vfeps);
791             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
792
793             /* CUBIC SPLINE TABLE ELECTROSTATICS */
794             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
795             F                = _mm_setzero_pd();
796             GMX_MM_TRANSPOSE2_PD(Y,F);
797             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
798             H                = _mm_setzero_pd();
799             GMX_MM_TRANSPOSE2_PD(G,H);
800             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
801             VV               = _mm_macc_pd(vfeps,Fp,Y);
802             velec            = _mm_mul_pd(qq00,VV);
803             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
804             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
805
806             /* CUBIC SPLINE TABLE DISPERSION */
807             vfitab           = _mm_add_epi32(vfitab,ifour);
808             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
809             F                = _mm_setzero_pd();
810             GMX_MM_TRANSPOSE2_PD(Y,F);
811             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
812             H                = _mm_setzero_pd();
813             GMX_MM_TRANSPOSE2_PD(G,H);
814             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
815             VV               = _mm_macc_pd(vfeps,Fp,Y);
816             vvdw6            = _mm_mul_pd(c6_00,VV);
817             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
818             fvdw6            = _mm_mul_pd(c6_00,FF);
819
820             /* CUBIC SPLINE TABLE REPULSION */
821             vfitab           = _mm_add_epi32(vfitab,ifour);
822             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
823             F                = _mm_setzero_pd();
824             GMX_MM_TRANSPOSE2_PD(Y,F);
825             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
826             H                = _mm_setzero_pd();
827             GMX_MM_TRANSPOSE2_PD(G,H);
828             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
829             VV               = _mm_macc_pd(vfeps,Fp,Y);
830             vvdw12           = _mm_mul_pd(c12_00,VV);
831             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
832             fvdw12           = _mm_mul_pd(c12_00,FF);
833             vvdw             = _mm_add_pd(vvdw12,vvdw6);
834             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
835
836             /* Update potential sum for this i atom from the interaction with this j atom. */
837             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
838             velecsum         = _mm_add_pd(velecsum,velec);
839             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
840             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
841
842             fscal            = _mm_add_pd(felec,fvdw);
843
844             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
845
846             /* Update vectorial force */
847             fix0             = _mm_macc_pd(dx00,fscal,fix0);
848             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
849             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
850             
851             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
852             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
853             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
854
855             /**************************
856              * CALCULATE INTERACTIONS *
857              **************************/
858
859             r01              = _mm_mul_pd(rsq01,rinv01);
860
861             /* Calculate table index by multiplying r with table scale and truncate to integer */
862             rt               = _mm_mul_pd(r01,vftabscale);
863             vfitab           = _mm_cvttpd_epi32(rt);
864 #ifdef __XOP__
865             vfeps            = _mm_frcz_pd(rt);
866 #else
867             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
868 #endif
869             twovfeps         = _mm_add_pd(vfeps,vfeps);
870             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
871
872             /* CUBIC SPLINE TABLE ELECTROSTATICS */
873             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
874             F                = _mm_setzero_pd();
875             GMX_MM_TRANSPOSE2_PD(Y,F);
876             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
877             H                = _mm_setzero_pd();
878             GMX_MM_TRANSPOSE2_PD(G,H);
879             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
880             VV               = _mm_macc_pd(vfeps,Fp,Y);
881             velec            = _mm_mul_pd(qq01,VV);
882             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
883             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
884
885             /* Update potential sum for this i atom from the interaction with this j atom. */
886             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
887             velecsum         = _mm_add_pd(velecsum,velec);
888
889             fscal            = felec;
890
891             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
892
893             /* Update vectorial force */
894             fix0             = _mm_macc_pd(dx01,fscal,fix0);
895             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
896             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
897             
898             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
899             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
900             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
901
902             /**************************
903              * CALCULATE INTERACTIONS *
904              **************************/
905
906             r02              = _mm_mul_pd(rsq02,rinv02);
907
908             /* Calculate table index by multiplying r with table scale and truncate to integer */
909             rt               = _mm_mul_pd(r02,vftabscale);
910             vfitab           = _mm_cvttpd_epi32(rt);
911 #ifdef __XOP__
912             vfeps            = _mm_frcz_pd(rt);
913 #else
914             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
915 #endif
916             twovfeps         = _mm_add_pd(vfeps,vfeps);
917             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
918
919             /* CUBIC SPLINE TABLE ELECTROSTATICS */
920             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
921             F                = _mm_setzero_pd();
922             GMX_MM_TRANSPOSE2_PD(Y,F);
923             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
924             H                = _mm_setzero_pd();
925             GMX_MM_TRANSPOSE2_PD(G,H);
926             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
927             VV               = _mm_macc_pd(vfeps,Fp,Y);
928             velec            = _mm_mul_pd(qq02,VV);
929             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
930             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
931
932             /* Update potential sum for this i atom from the interaction with this j atom. */
933             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
934             velecsum         = _mm_add_pd(velecsum,velec);
935
936             fscal            = felec;
937
938             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
939
940             /* Update vectorial force */
941             fix0             = _mm_macc_pd(dx02,fscal,fix0);
942             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
943             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
944             
945             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
946             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
947             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
948
949             /**************************
950              * CALCULATE INTERACTIONS *
951              **************************/
952
953             r10              = _mm_mul_pd(rsq10,rinv10);
954
955             /* Calculate table index by multiplying r with table scale and truncate to integer */
956             rt               = _mm_mul_pd(r10,vftabscale);
957             vfitab           = _mm_cvttpd_epi32(rt);
958 #ifdef __XOP__
959             vfeps            = _mm_frcz_pd(rt);
960 #else
961             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
962 #endif
963             twovfeps         = _mm_add_pd(vfeps,vfeps);
964             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
965
966             /* CUBIC SPLINE TABLE ELECTROSTATICS */
967             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
968             F                = _mm_setzero_pd();
969             GMX_MM_TRANSPOSE2_PD(Y,F);
970             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
971             H                = _mm_setzero_pd();
972             GMX_MM_TRANSPOSE2_PD(G,H);
973             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
974             VV               = _mm_macc_pd(vfeps,Fp,Y);
975             velec            = _mm_mul_pd(qq10,VV);
976             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
977             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
978
979             /* Update potential sum for this i atom from the interaction with this j atom. */
980             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
981             velecsum         = _mm_add_pd(velecsum,velec);
982
983             fscal            = felec;
984
985             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
986
987             /* Update vectorial force */
988             fix1             = _mm_macc_pd(dx10,fscal,fix1);
989             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
990             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
991             
992             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
993             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
994             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
995
996             /**************************
997              * CALCULATE INTERACTIONS *
998              **************************/
999
1000             r11              = _mm_mul_pd(rsq11,rinv11);
1001
1002             /* Calculate table index by multiplying r with table scale and truncate to integer */
1003             rt               = _mm_mul_pd(r11,vftabscale);
1004             vfitab           = _mm_cvttpd_epi32(rt);
1005 #ifdef __XOP__
1006             vfeps            = _mm_frcz_pd(rt);
1007 #else
1008             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1009 #endif
1010             twovfeps         = _mm_add_pd(vfeps,vfeps);
1011             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1012
1013             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1014             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1015             F                = _mm_setzero_pd();
1016             GMX_MM_TRANSPOSE2_PD(Y,F);
1017             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1018             H                = _mm_setzero_pd();
1019             GMX_MM_TRANSPOSE2_PD(G,H);
1020             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1021             VV               = _mm_macc_pd(vfeps,Fp,Y);
1022             velec            = _mm_mul_pd(qq11,VV);
1023             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1024             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1025
1026             /* Update potential sum for this i atom from the interaction with this j atom. */
1027             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1028             velecsum         = _mm_add_pd(velecsum,velec);
1029
1030             fscal            = felec;
1031
1032             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1033
1034             /* Update vectorial force */
1035             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1036             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1037             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1038             
1039             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1040             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1041             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1042
1043             /**************************
1044              * CALCULATE INTERACTIONS *
1045              **************************/
1046
1047             r12              = _mm_mul_pd(rsq12,rinv12);
1048
1049             /* Calculate table index by multiplying r with table scale and truncate to integer */
1050             rt               = _mm_mul_pd(r12,vftabscale);
1051             vfitab           = _mm_cvttpd_epi32(rt);
1052 #ifdef __XOP__
1053             vfeps            = _mm_frcz_pd(rt);
1054 #else
1055             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1056 #endif
1057             twovfeps         = _mm_add_pd(vfeps,vfeps);
1058             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1059
1060             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1061             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1062             F                = _mm_setzero_pd();
1063             GMX_MM_TRANSPOSE2_PD(Y,F);
1064             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1065             H                = _mm_setzero_pd();
1066             GMX_MM_TRANSPOSE2_PD(G,H);
1067             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1068             VV               = _mm_macc_pd(vfeps,Fp,Y);
1069             velec            = _mm_mul_pd(qq12,VV);
1070             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1071             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1072
1073             /* Update potential sum for this i atom from the interaction with this j atom. */
1074             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1075             velecsum         = _mm_add_pd(velecsum,velec);
1076
1077             fscal            = felec;
1078
1079             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1080
1081             /* Update vectorial force */
1082             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1083             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1084             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1085             
1086             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1087             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1088             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1089
1090             /**************************
1091              * CALCULATE INTERACTIONS *
1092              **************************/
1093
1094             r20              = _mm_mul_pd(rsq20,rinv20);
1095
1096             /* Calculate table index by multiplying r with table scale and truncate to integer */
1097             rt               = _mm_mul_pd(r20,vftabscale);
1098             vfitab           = _mm_cvttpd_epi32(rt);
1099 #ifdef __XOP__
1100             vfeps            = _mm_frcz_pd(rt);
1101 #else
1102             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1103 #endif
1104             twovfeps         = _mm_add_pd(vfeps,vfeps);
1105             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1106
1107             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1108             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1109             F                = _mm_setzero_pd();
1110             GMX_MM_TRANSPOSE2_PD(Y,F);
1111             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1112             H                = _mm_setzero_pd();
1113             GMX_MM_TRANSPOSE2_PD(G,H);
1114             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1115             VV               = _mm_macc_pd(vfeps,Fp,Y);
1116             velec            = _mm_mul_pd(qq20,VV);
1117             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1118             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1119
1120             /* Update potential sum for this i atom from the interaction with this j atom. */
1121             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1122             velecsum         = _mm_add_pd(velecsum,velec);
1123
1124             fscal            = felec;
1125
1126             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1127
1128             /* Update vectorial force */
1129             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1130             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1131             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1132             
1133             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1134             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1135             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1136
1137             /**************************
1138              * CALCULATE INTERACTIONS *
1139              **************************/
1140
1141             r21              = _mm_mul_pd(rsq21,rinv21);
1142
1143             /* Calculate table index by multiplying r with table scale and truncate to integer */
1144             rt               = _mm_mul_pd(r21,vftabscale);
1145             vfitab           = _mm_cvttpd_epi32(rt);
1146 #ifdef __XOP__
1147             vfeps            = _mm_frcz_pd(rt);
1148 #else
1149             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1150 #endif
1151             twovfeps         = _mm_add_pd(vfeps,vfeps);
1152             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1153
1154             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1155             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1156             F                = _mm_setzero_pd();
1157             GMX_MM_TRANSPOSE2_PD(Y,F);
1158             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1159             H                = _mm_setzero_pd();
1160             GMX_MM_TRANSPOSE2_PD(G,H);
1161             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1162             VV               = _mm_macc_pd(vfeps,Fp,Y);
1163             velec            = _mm_mul_pd(qq21,VV);
1164             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1165             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1166
1167             /* Update potential sum for this i atom from the interaction with this j atom. */
1168             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1169             velecsum         = _mm_add_pd(velecsum,velec);
1170
1171             fscal            = felec;
1172
1173             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1174
1175             /* Update vectorial force */
1176             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1177             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1178             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1179             
1180             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1181             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1182             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1183
1184             /**************************
1185              * CALCULATE INTERACTIONS *
1186              **************************/
1187
1188             r22              = _mm_mul_pd(rsq22,rinv22);
1189
1190             /* Calculate table index by multiplying r with table scale and truncate to integer */
1191             rt               = _mm_mul_pd(r22,vftabscale);
1192             vfitab           = _mm_cvttpd_epi32(rt);
1193 #ifdef __XOP__
1194             vfeps            = _mm_frcz_pd(rt);
1195 #else
1196             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1197 #endif
1198             twovfeps         = _mm_add_pd(vfeps,vfeps);
1199             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1200
1201             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1202             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1203             F                = _mm_setzero_pd();
1204             GMX_MM_TRANSPOSE2_PD(Y,F);
1205             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1206             H                = _mm_setzero_pd();
1207             GMX_MM_TRANSPOSE2_PD(G,H);
1208             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1209             VV               = _mm_macc_pd(vfeps,Fp,Y);
1210             velec            = _mm_mul_pd(qq22,VV);
1211             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1212             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1213
1214             /* Update potential sum for this i atom from the interaction with this j atom. */
1215             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1216             velecsum         = _mm_add_pd(velecsum,velec);
1217
1218             fscal            = felec;
1219
1220             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1221
1222             /* Update vectorial force */
1223             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1224             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1225             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1226             
1227             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1228             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1229             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1230
1231             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1232
1233             /* Inner loop uses 444 flops */
1234         }
1235
1236         /* End of innermost loop */
1237
1238         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1239                                               f+i_coord_offset,fshift+i_shift_offset);
1240
1241         ggid                        = gid[iidx];
1242         /* Update potential energies */
1243         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1244         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1245
1246         /* Increment number of inner iterations */
1247         inneriter                  += j_index_end - j_index_start;
1248
1249         /* Outer loop uses 20 flops */
1250     }
1251
1252     /* Increment number of outer iterations */
1253     outeriter        += nri;
1254
1255     /* Update outer/inner flops */
1256
1257     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*444);
1258 }
1259 /*
1260  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1261  * Electrostatics interaction: CubicSplineTable
1262  * VdW interaction:            CubicSplineTable
1263  * Geometry:                   Water3-Water3
1264  * Calculate force/pot:        Force
1265  */
1266 void
1267 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1268                     (t_nblist                    * gmx_restrict       nlist,
1269                      rvec                        * gmx_restrict          xx,
1270                      rvec                        * gmx_restrict          ff,
1271                      t_forcerec                  * gmx_restrict          fr,
1272                      t_mdatoms                   * gmx_restrict     mdatoms,
1273                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1274                      t_nrnb                      * gmx_restrict        nrnb)
1275 {
1276     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1277      * just 0 for non-waters.
1278      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1279      * jnr indices corresponding to data put in the four positions in the SIMD register.
1280      */
1281     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1282     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1283     int              jnrA,jnrB;
1284     int              j_coord_offsetA,j_coord_offsetB;
1285     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1286     real             rcutoff_scalar;
1287     real             *shiftvec,*fshift,*x,*f;
1288     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1289     int              vdwioffset0;
1290     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1291     int              vdwioffset1;
1292     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1293     int              vdwioffset2;
1294     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1295     int              vdwjidx0A,vdwjidx0B;
1296     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1297     int              vdwjidx1A,vdwjidx1B;
1298     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1299     int              vdwjidx2A,vdwjidx2B;
1300     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1301     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1302     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1303     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1304     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1305     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1306     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1307     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1308     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1309     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1310     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1311     real             *charge;
1312     int              nvdwtype;
1313     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1314     int              *vdwtype;
1315     real             *vdwparam;
1316     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1317     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1318     __m128i          vfitab;
1319     __m128i          ifour       = _mm_set1_epi32(4);
1320     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
1321     real             *vftab;
1322     __m128d          dummy_mask,cutoff_mask;
1323     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1324     __m128d          one     = _mm_set1_pd(1.0);
1325     __m128d          two     = _mm_set1_pd(2.0);
1326     x                = xx[0];
1327     f                = ff[0];
1328
1329     nri              = nlist->nri;
1330     iinr             = nlist->iinr;
1331     jindex           = nlist->jindex;
1332     jjnr             = nlist->jjnr;
1333     shiftidx         = nlist->shift;
1334     gid              = nlist->gid;
1335     shiftvec         = fr->shift_vec[0];
1336     fshift           = fr->fshift[0];
1337     facel            = _mm_set1_pd(fr->epsfac);
1338     charge           = mdatoms->chargeA;
1339     nvdwtype         = fr->ntype;
1340     vdwparam         = fr->nbfp;
1341     vdwtype          = mdatoms->typeA;
1342
1343     vftab            = kernel_data->table_elec_vdw->data;
1344     vftabscale       = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
1345
1346     /* Setup water-specific parameters */
1347     inr              = nlist->iinr[0];
1348     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1349     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1350     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1351     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1352
1353     jq0              = _mm_set1_pd(charge[inr+0]);
1354     jq1              = _mm_set1_pd(charge[inr+1]);
1355     jq2              = _mm_set1_pd(charge[inr+2]);
1356     vdwjidx0A        = 2*vdwtype[inr+0];
1357     qq00             = _mm_mul_pd(iq0,jq0);
1358     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1359     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1360     qq01             = _mm_mul_pd(iq0,jq1);
1361     qq02             = _mm_mul_pd(iq0,jq2);
1362     qq10             = _mm_mul_pd(iq1,jq0);
1363     qq11             = _mm_mul_pd(iq1,jq1);
1364     qq12             = _mm_mul_pd(iq1,jq2);
1365     qq20             = _mm_mul_pd(iq2,jq0);
1366     qq21             = _mm_mul_pd(iq2,jq1);
1367     qq22             = _mm_mul_pd(iq2,jq2);
1368
1369     /* Avoid stupid compiler warnings */
1370     jnrA = jnrB = 0;
1371     j_coord_offsetA = 0;
1372     j_coord_offsetB = 0;
1373
1374     outeriter        = 0;
1375     inneriter        = 0;
1376
1377     /* Start outer loop over neighborlists */
1378     for(iidx=0; iidx<nri; iidx++)
1379     {
1380         /* Load shift vector for this list */
1381         i_shift_offset   = DIM*shiftidx[iidx];
1382
1383         /* Load limits for loop over neighbors */
1384         j_index_start    = jindex[iidx];
1385         j_index_end      = jindex[iidx+1];
1386
1387         /* Get outer coordinate index */
1388         inr              = iinr[iidx];
1389         i_coord_offset   = DIM*inr;
1390
1391         /* Load i particle coords and add shift vector */
1392         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1393                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1394
1395         fix0             = _mm_setzero_pd();
1396         fiy0             = _mm_setzero_pd();
1397         fiz0             = _mm_setzero_pd();
1398         fix1             = _mm_setzero_pd();
1399         fiy1             = _mm_setzero_pd();
1400         fiz1             = _mm_setzero_pd();
1401         fix2             = _mm_setzero_pd();
1402         fiy2             = _mm_setzero_pd();
1403         fiz2             = _mm_setzero_pd();
1404
1405         /* Start inner kernel loop */
1406         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1407         {
1408
1409             /* Get j neighbor index, and coordinate index */
1410             jnrA             = jjnr[jidx];
1411             jnrB             = jjnr[jidx+1];
1412             j_coord_offsetA  = DIM*jnrA;
1413             j_coord_offsetB  = DIM*jnrB;
1414
1415             /* load j atom coordinates */
1416             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1417                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1418
1419             /* Calculate displacement vector */
1420             dx00             = _mm_sub_pd(ix0,jx0);
1421             dy00             = _mm_sub_pd(iy0,jy0);
1422             dz00             = _mm_sub_pd(iz0,jz0);
1423             dx01             = _mm_sub_pd(ix0,jx1);
1424             dy01             = _mm_sub_pd(iy0,jy1);
1425             dz01             = _mm_sub_pd(iz0,jz1);
1426             dx02             = _mm_sub_pd(ix0,jx2);
1427             dy02             = _mm_sub_pd(iy0,jy2);
1428             dz02             = _mm_sub_pd(iz0,jz2);
1429             dx10             = _mm_sub_pd(ix1,jx0);
1430             dy10             = _mm_sub_pd(iy1,jy0);
1431             dz10             = _mm_sub_pd(iz1,jz0);
1432             dx11             = _mm_sub_pd(ix1,jx1);
1433             dy11             = _mm_sub_pd(iy1,jy1);
1434             dz11             = _mm_sub_pd(iz1,jz1);
1435             dx12             = _mm_sub_pd(ix1,jx2);
1436             dy12             = _mm_sub_pd(iy1,jy2);
1437             dz12             = _mm_sub_pd(iz1,jz2);
1438             dx20             = _mm_sub_pd(ix2,jx0);
1439             dy20             = _mm_sub_pd(iy2,jy0);
1440             dz20             = _mm_sub_pd(iz2,jz0);
1441             dx21             = _mm_sub_pd(ix2,jx1);
1442             dy21             = _mm_sub_pd(iy2,jy1);
1443             dz21             = _mm_sub_pd(iz2,jz1);
1444             dx22             = _mm_sub_pd(ix2,jx2);
1445             dy22             = _mm_sub_pd(iy2,jy2);
1446             dz22             = _mm_sub_pd(iz2,jz2);
1447
1448             /* Calculate squared distance and things based on it */
1449             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1450             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1451             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1452             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1453             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1454             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1455             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1456             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1457             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1458
1459             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1460             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1461             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1462             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1463             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1464             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1465             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1466             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1467             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1468
1469             fjx0             = _mm_setzero_pd();
1470             fjy0             = _mm_setzero_pd();
1471             fjz0             = _mm_setzero_pd();
1472             fjx1             = _mm_setzero_pd();
1473             fjy1             = _mm_setzero_pd();
1474             fjz1             = _mm_setzero_pd();
1475             fjx2             = _mm_setzero_pd();
1476             fjy2             = _mm_setzero_pd();
1477             fjz2             = _mm_setzero_pd();
1478
1479             /**************************
1480              * CALCULATE INTERACTIONS *
1481              **************************/
1482
1483             r00              = _mm_mul_pd(rsq00,rinv00);
1484
1485             /* Calculate table index by multiplying r with table scale and truncate to integer */
1486             rt               = _mm_mul_pd(r00,vftabscale);
1487             vfitab           = _mm_cvttpd_epi32(rt);
1488 #ifdef __XOP__
1489             vfeps            = _mm_frcz_pd(rt);
1490 #else
1491             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1492 #endif
1493             twovfeps         = _mm_add_pd(vfeps,vfeps);
1494             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1495
1496             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1497             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1498             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1499             GMX_MM_TRANSPOSE2_PD(Y,F);
1500             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1501             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1502             GMX_MM_TRANSPOSE2_PD(G,H);
1503             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1504             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1505             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1506
1507             /* CUBIC SPLINE TABLE DISPERSION */
1508             vfitab           = _mm_add_epi32(vfitab,ifour);
1509             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1510             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1511             GMX_MM_TRANSPOSE2_PD(Y,F);
1512             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1513             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1514             GMX_MM_TRANSPOSE2_PD(G,H);
1515             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1516             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1517             fvdw6            = _mm_mul_pd(c6_00,FF);
1518
1519             /* CUBIC SPLINE TABLE REPULSION */
1520             vfitab           = _mm_add_epi32(vfitab,ifour);
1521             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1522             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1523             GMX_MM_TRANSPOSE2_PD(Y,F);
1524             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1525             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1526             GMX_MM_TRANSPOSE2_PD(G,H);
1527             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1528             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1529             fvdw12           = _mm_mul_pd(c12_00,FF);
1530             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1531
1532             fscal            = _mm_add_pd(felec,fvdw);
1533
1534             /* Update vectorial force */
1535             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1536             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1537             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1538             
1539             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1540             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1541             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1542
1543             /**************************
1544              * CALCULATE INTERACTIONS *
1545              **************************/
1546
1547             r01              = _mm_mul_pd(rsq01,rinv01);
1548
1549             /* Calculate table index by multiplying r with table scale and truncate to integer */
1550             rt               = _mm_mul_pd(r01,vftabscale);
1551             vfitab           = _mm_cvttpd_epi32(rt);
1552 #ifdef __XOP__
1553             vfeps            = _mm_frcz_pd(rt);
1554 #else
1555             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1556 #endif
1557             twovfeps         = _mm_add_pd(vfeps,vfeps);
1558             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1559
1560             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1561             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1562             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1563             GMX_MM_TRANSPOSE2_PD(Y,F);
1564             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1565             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1566             GMX_MM_TRANSPOSE2_PD(G,H);
1567             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1568             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1569             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1570
1571             fscal            = felec;
1572
1573             /* Update vectorial force */
1574             fix0             = _mm_macc_pd(dx01,fscal,fix0);
1575             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
1576             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
1577             
1578             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
1579             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
1580             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
1581
1582             /**************************
1583              * CALCULATE INTERACTIONS *
1584              **************************/
1585
1586             r02              = _mm_mul_pd(rsq02,rinv02);
1587
1588             /* Calculate table index by multiplying r with table scale and truncate to integer */
1589             rt               = _mm_mul_pd(r02,vftabscale);
1590             vfitab           = _mm_cvttpd_epi32(rt);
1591 #ifdef __XOP__
1592             vfeps            = _mm_frcz_pd(rt);
1593 #else
1594             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1595 #endif
1596             twovfeps         = _mm_add_pd(vfeps,vfeps);
1597             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1598
1599             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1600             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1601             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1602             GMX_MM_TRANSPOSE2_PD(Y,F);
1603             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1604             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1605             GMX_MM_TRANSPOSE2_PD(G,H);
1606             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1607             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1608             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1609
1610             fscal            = felec;
1611
1612             /* Update vectorial force */
1613             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1614             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1615             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1616             
1617             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1618             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1619             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1620
1621             /**************************
1622              * CALCULATE INTERACTIONS *
1623              **************************/
1624
1625             r10              = _mm_mul_pd(rsq10,rinv10);
1626
1627             /* Calculate table index by multiplying r with table scale and truncate to integer */
1628             rt               = _mm_mul_pd(r10,vftabscale);
1629             vfitab           = _mm_cvttpd_epi32(rt);
1630 #ifdef __XOP__
1631             vfeps            = _mm_frcz_pd(rt);
1632 #else
1633             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1634 #endif
1635             twovfeps         = _mm_add_pd(vfeps,vfeps);
1636             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1637
1638             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1639             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1640             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1641             GMX_MM_TRANSPOSE2_PD(Y,F);
1642             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1643             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1644             GMX_MM_TRANSPOSE2_PD(G,H);
1645             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1646             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1647             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1648
1649             fscal            = felec;
1650
1651             /* Update vectorial force */
1652             fix1             = _mm_macc_pd(dx10,fscal,fix1);
1653             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
1654             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
1655             
1656             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
1657             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
1658             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
1659
1660             /**************************
1661              * CALCULATE INTERACTIONS *
1662              **************************/
1663
1664             r11              = _mm_mul_pd(rsq11,rinv11);
1665
1666             /* Calculate table index by multiplying r with table scale and truncate to integer */
1667             rt               = _mm_mul_pd(r11,vftabscale);
1668             vfitab           = _mm_cvttpd_epi32(rt);
1669 #ifdef __XOP__
1670             vfeps            = _mm_frcz_pd(rt);
1671 #else
1672             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1673 #endif
1674             twovfeps         = _mm_add_pd(vfeps,vfeps);
1675             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1676
1677             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1678             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1679             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1680             GMX_MM_TRANSPOSE2_PD(Y,F);
1681             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1682             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1683             GMX_MM_TRANSPOSE2_PD(G,H);
1684             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1685             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1686             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1687
1688             fscal            = felec;
1689
1690             /* Update vectorial force */
1691             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1692             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1693             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1694             
1695             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1696             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1697             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1698
1699             /**************************
1700              * CALCULATE INTERACTIONS *
1701              **************************/
1702
1703             r12              = _mm_mul_pd(rsq12,rinv12);
1704
1705             /* Calculate table index by multiplying r with table scale and truncate to integer */
1706             rt               = _mm_mul_pd(r12,vftabscale);
1707             vfitab           = _mm_cvttpd_epi32(rt);
1708 #ifdef __XOP__
1709             vfeps            = _mm_frcz_pd(rt);
1710 #else
1711             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1712 #endif
1713             twovfeps         = _mm_add_pd(vfeps,vfeps);
1714             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1715
1716             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1717             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1718             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1719             GMX_MM_TRANSPOSE2_PD(Y,F);
1720             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1721             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1722             GMX_MM_TRANSPOSE2_PD(G,H);
1723             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1724             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1725             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1726
1727             fscal            = felec;
1728
1729             /* Update vectorial force */
1730             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1731             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1732             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1733             
1734             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1735             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1736             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1737
1738             /**************************
1739              * CALCULATE INTERACTIONS *
1740              **************************/
1741
1742             r20              = _mm_mul_pd(rsq20,rinv20);
1743
1744             /* Calculate table index by multiplying r with table scale and truncate to integer */
1745             rt               = _mm_mul_pd(r20,vftabscale);
1746             vfitab           = _mm_cvttpd_epi32(rt);
1747 #ifdef __XOP__
1748             vfeps            = _mm_frcz_pd(rt);
1749 #else
1750             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1751 #endif
1752             twovfeps         = _mm_add_pd(vfeps,vfeps);
1753             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1754
1755             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1756             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1757             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1758             GMX_MM_TRANSPOSE2_PD(Y,F);
1759             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1760             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1761             GMX_MM_TRANSPOSE2_PD(G,H);
1762             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1763             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1764             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1765
1766             fscal            = felec;
1767
1768             /* Update vectorial force */
1769             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1770             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1771             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1772             
1773             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1774             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1775             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1776
1777             /**************************
1778              * CALCULATE INTERACTIONS *
1779              **************************/
1780
1781             r21              = _mm_mul_pd(rsq21,rinv21);
1782
1783             /* Calculate table index by multiplying r with table scale and truncate to integer */
1784             rt               = _mm_mul_pd(r21,vftabscale);
1785             vfitab           = _mm_cvttpd_epi32(rt);
1786 #ifdef __XOP__
1787             vfeps            = _mm_frcz_pd(rt);
1788 #else
1789             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1790 #endif
1791             twovfeps         = _mm_add_pd(vfeps,vfeps);
1792             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1793
1794             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1795             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1796             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1797             GMX_MM_TRANSPOSE2_PD(Y,F);
1798             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1799             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1800             GMX_MM_TRANSPOSE2_PD(G,H);
1801             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1802             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1803             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1804
1805             fscal            = felec;
1806
1807             /* Update vectorial force */
1808             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1809             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1810             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1811             
1812             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1813             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1814             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1815
1816             /**************************
1817              * CALCULATE INTERACTIONS *
1818              **************************/
1819
1820             r22              = _mm_mul_pd(rsq22,rinv22);
1821
1822             /* Calculate table index by multiplying r with table scale and truncate to integer */
1823             rt               = _mm_mul_pd(r22,vftabscale);
1824             vfitab           = _mm_cvttpd_epi32(rt);
1825 #ifdef __XOP__
1826             vfeps            = _mm_frcz_pd(rt);
1827 #else
1828             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1829 #endif
1830             twovfeps         = _mm_add_pd(vfeps,vfeps);
1831             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1832
1833             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1834             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1835             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1836             GMX_MM_TRANSPOSE2_PD(Y,F);
1837             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1838             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1839             GMX_MM_TRANSPOSE2_PD(G,H);
1840             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1841             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1842             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1843
1844             fscal            = felec;
1845
1846             /* Update vectorial force */
1847             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1848             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1849             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1850             
1851             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1852             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1853             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1854
1855             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1856
1857             /* Inner loop uses 400 flops */
1858         }
1859
1860         if(jidx<j_index_end)
1861         {
1862
1863             jnrA             = jjnr[jidx];
1864             j_coord_offsetA  = DIM*jnrA;
1865
1866             /* load j atom coordinates */
1867             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1868                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1869
1870             /* Calculate displacement vector */
1871             dx00             = _mm_sub_pd(ix0,jx0);
1872             dy00             = _mm_sub_pd(iy0,jy0);
1873             dz00             = _mm_sub_pd(iz0,jz0);
1874             dx01             = _mm_sub_pd(ix0,jx1);
1875             dy01             = _mm_sub_pd(iy0,jy1);
1876             dz01             = _mm_sub_pd(iz0,jz1);
1877             dx02             = _mm_sub_pd(ix0,jx2);
1878             dy02             = _mm_sub_pd(iy0,jy2);
1879             dz02             = _mm_sub_pd(iz0,jz2);
1880             dx10             = _mm_sub_pd(ix1,jx0);
1881             dy10             = _mm_sub_pd(iy1,jy0);
1882             dz10             = _mm_sub_pd(iz1,jz0);
1883             dx11             = _mm_sub_pd(ix1,jx1);
1884             dy11             = _mm_sub_pd(iy1,jy1);
1885             dz11             = _mm_sub_pd(iz1,jz1);
1886             dx12             = _mm_sub_pd(ix1,jx2);
1887             dy12             = _mm_sub_pd(iy1,jy2);
1888             dz12             = _mm_sub_pd(iz1,jz2);
1889             dx20             = _mm_sub_pd(ix2,jx0);
1890             dy20             = _mm_sub_pd(iy2,jy0);
1891             dz20             = _mm_sub_pd(iz2,jz0);
1892             dx21             = _mm_sub_pd(ix2,jx1);
1893             dy21             = _mm_sub_pd(iy2,jy1);
1894             dz21             = _mm_sub_pd(iz2,jz1);
1895             dx22             = _mm_sub_pd(ix2,jx2);
1896             dy22             = _mm_sub_pd(iy2,jy2);
1897             dz22             = _mm_sub_pd(iz2,jz2);
1898
1899             /* Calculate squared distance and things based on it */
1900             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1901             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1902             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1903             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1904             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1905             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1906             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1907             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1908             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1909
1910             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1911             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1912             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1913             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1914             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1915             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1916             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1917             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1918             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1919
1920             fjx0             = _mm_setzero_pd();
1921             fjy0             = _mm_setzero_pd();
1922             fjz0             = _mm_setzero_pd();
1923             fjx1             = _mm_setzero_pd();
1924             fjy1             = _mm_setzero_pd();
1925             fjz1             = _mm_setzero_pd();
1926             fjx2             = _mm_setzero_pd();
1927             fjy2             = _mm_setzero_pd();
1928             fjz2             = _mm_setzero_pd();
1929
1930             /**************************
1931              * CALCULATE INTERACTIONS *
1932              **************************/
1933
1934             r00              = _mm_mul_pd(rsq00,rinv00);
1935
1936             /* Calculate table index by multiplying r with table scale and truncate to integer */
1937             rt               = _mm_mul_pd(r00,vftabscale);
1938             vfitab           = _mm_cvttpd_epi32(rt);
1939 #ifdef __XOP__
1940             vfeps            = _mm_frcz_pd(rt);
1941 #else
1942             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1943 #endif
1944             twovfeps         = _mm_add_pd(vfeps,vfeps);
1945             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1946
1947             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1948             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1949             F                = _mm_setzero_pd();
1950             GMX_MM_TRANSPOSE2_PD(Y,F);
1951             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1952             H                = _mm_setzero_pd();
1953             GMX_MM_TRANSPOSE2_PD(G,H);
1954             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1955             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1956             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1957
1958             /* CUBIC SPLINE TABLE DISPERSION */
1959             vfitab           = _mm_add_epi32(vfitab,ifour);
1960             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1961             F                = _mm_setzero_pd();
1962             GMX_MM_TRANSPOSE2_PD(Y,F);
1963             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1964             H                = _mm_setzero_pd();
1965             GMX_MM_TRANSPOSE2_PD(G,H);
1966             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1967             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1968             fvdw6            = _mm_mul_pd(c6_00,FF);
1969
1970             /* CUBIC SPLINE TABLE REPULSION */
1971             vfitab           = _mm_add_epi32(vfitab,ifour);
1972             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1973             F                = _mm_setzero_pd();
1974             GMX_MM_TRANSPOSE2_PD(Y,F);
1975             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1976             H                = _mm_setzero_pd();
1977             GMX_MM_TRANSPOSE2_PD(G,H);
1978             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1979             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1980             fvdw12           = _mm_mul_pd(c12_00,FF);
1981             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1982
1983             fscal            = _mm_add_pd(felec,fvdw);
1984
1985             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1986
1987             /* Update vectorial force */
1988             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1989             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1990             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1991             
1992             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1993             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1994             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1995
1996             /**************************
1997              * CALCULATE INTERACTIONS *
1998              **************************/
1999
2000             r01              = _mm_mul_pd(rsq01,rinv01);
2001
2002             /* Calculate table index by multiplying r with table scale and truncate to integer */
2003             rt               = _mm_mul_pd(r01,vftabscale);
2004             vfitab           = _mm_cvttpd_epi32(rt);
2005 #ifdef __XOP__
2006             vfeps            = _mm_frcz_pd(rt);
2007 #else
2008             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2009 #endif
2010             twovfeps         = _mm_add_pd(vfeps,vfeps);
2011             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2012
2013             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2014             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2015             F                = _mm_setzero_pd();
2016             GMX_MM_TRANSPOSE2_PD(Y,F);
2017             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2018             H                = _mm_setzero_pd();
2019             GMX_MM_TRANSPOSE2_PD(G,H);
2020             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2021             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2022             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
2023
2024             fscal            = felec;
2025
2026             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2027
2028             /* Update vectorial force */
2029             fix0             = _mm_macc_pd(dx01,fscal,fix0);
2030             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
2031             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
2032             
2033             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
2034             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
2035             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
2036
2037             /**************************
2038              * CALCULATE INTERACTIONS *
2039              **************************/
2040
2041             r02              = _mm_mul_pd(rsq02,rinv02);
2042
2043             /* Calculate table index by multiplying r with table scale and truncate to integer */
2044             rt               = _mm_mul_pd(r02,vftabscale);
2045             vfitab           = _mm_cvttpd_epi32(rt);
2046 #ifdef __XOP__
2047             vfeps            = _mm_frcz_pd(rt);
2048 #else
2049             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2050 #endif
2051             twovfeps         = _mm_add_pd(vfeps,vfeps);
2052             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2053
2054             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2055             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2056             F                = _mm_setzero_pd();
2057             GMX_MM_TRANSPOSE2_PD(Y,F);
2058             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2059             H                = _mm_setzero_pd();
2060             GMX_MM_TRANSPOSE2_PD(G,H);
2061             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2062             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2063             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
2064
2065             fscal            = felec;
2066
2067             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2068
2069             /* Update vectorial force */
2070             fix0             = _mm_macc_pd(dx02,fscal,fix0);
2071             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
2072             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
2073             
2074             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
2075             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
2076             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
2077
2078             /**************************
2079              * CALCULATE INTERACTIONS *
2080              **************************/
2081
2082             r10              = _mm_mul_pd(rsq10,rinv10);
2083
2084             /* Calculate table index by multiplying r with table scale and truncate to integer */
2085             rt               = _mm_mul_pd(r10,vftabscale);
2086             vfitab           = _mm_cvttpd_epi32(rt);
2087 #ifdef __XOP__
2088             vfeps            = _mm_frcz_pd(rt);
2089 #else
2090             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2091 #endif
2092             twovfeps         = _mm_add_pd(vfeps,vfeps);
2093             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2094
2095             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2096             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2097             F                = _mm_setzero_pd();
2098             GMX_MM_TRANSPOSE2_PD(Y,F);
2099             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2100             H                = _mm_setzero_pd();
2101             GMX_MM_TRANSPOSE2_PD(G,H);
2102             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2103             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2104             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2105
2106             fscal            = felec;
2107
2108             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2109
2110             /* Update vectorial force */
2111             fix1             = _mm_macc_pd(dx10,fscal,fix1);
2112             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
2113             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
2114             
2115             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
2116             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
2117             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
2118
2119             /**************************
2120              * CALCULATE INTERACTIONS *
2121              **************************/
2122
2123             r11              = _mm_mul_pd(rsq11,rinv11);
2124
2125             /* Calculate table index by multiplying r with table scale and truncate to integer */
2126             rt               = _mm_mul_pd(r11,vftabscale);
2127             vfitab           = _mm_cvttpd_epi32(rt);
2128 #ifdef __XOP__
2129             vfeps            = _mm_frcz_pd(rt);
2130 #else
2131             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2132 #endif
2133             twovfeps         = _mm_add_pd(vfeps,vfeps);
2134             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2135
2136             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2137             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2138             F                = _mm_setzero_pd();
2139             GMX_MM_TRANSPOSE2_PD(Y,F);
2140             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2141             H                = _mm_setzero_pd();
2142             GMX_MM_TRANSPOSE2_PD(G,H);
2143             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2144             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2145             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2146
2147             fscal            = felec;
2148
2149             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2150
2151             /* Update vectorial force */
2152             fix1             = _mm_macc_pd(dx11,fscal,fix1);
2153             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
2154             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
2155             
2156             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
2157             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
2158             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
2159
2160             /**************************
2161              * CALCULATE INTERACTIONS *
2162              **************************/
2163
2164             r12              = _mm_mul_pd(rsq12,rinv12);
2165
2166             /* Calculate table index by multiplying r with table scale and truncate to integer */
2167             rt               = _mm_mul_pd(r12,vftabscale);
2168             vfitab           = _mm_cvttpd_epi32(rt);
2169 #ifdef __XOP__
2170             vfeps            = _mm_frcz_pd(rt);
2171 #else
2172             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2173 #endif
2174             twovfeps         = _mm_add_pd(vfeps,vfeps);
2175             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2176
2177             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2178             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2179             F                = _mm_setzero_pd();
2180             GMX_MM_TRANSPOSE2_PD(Y,F);
2181             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2182             H                = _mm_setzero_pd();
2183             GMX_MM_TRANSPOSE2_PD(G,H);
2184             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2185             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2186             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2187
2188             fscal            = felec;
2189
2190             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2191
2192             /* Update vectorial force */
2193             fix1             = _mm_macc_pd(dx12,fscal,fix1);
2194             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
2195             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
2196             
2197             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
2198             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
2199             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
2200
2201             /**************************
2202              * CALCULATE INTERACTIONS *
2203              **************************/
2204
2205             r20              = _mm_mul_pd(rsq20,rinv20);
2206
2207             /* Calculate table index by multiplying r with table scale and truncate to integer */
2208             rt               = _mm_mul_pd(r20,vftabscale);
2209             vfitab           = _mm_cvttpd_epi32(rt);
2210 #ifdef __XOP__
2211             vfeps            = _mm_frcz_pd(rt);
2212 #else
2213             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2214 #endif
2215             twovfeps         = _mm_add_pd(vfeps,vfeps);
2216             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2217
2218             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2219             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2220             F                = _mm_setzero_pd();
2221             GMX_MM_TRANSPOSE2_PD(Y,F);
2222             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2223             H                = _mm_setzero_pd();
2224             GMX_MM_TRANSPOSE2_PD(G,H);
2225             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2226             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2227             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2228
2229             fscal            = felec;
2230
2231             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2232
2233             /* Update vectorial force */
2234             fix2             = _mm_macc_pd(dx20,fscal,fix2);
2235             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
2236             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
2237             
2238             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
2239             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
2240             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
2241
2242             /**************************
2243              * CALCULATE INTERACTIONS *
2244              **************************/
2245
2246             r21              = _mm_mul_pd(rsq21,rinv21);
2247
2248             /* Calculate table index by multiplying r with table scale and truncate to integer */
2249             rt               = _mm_mul_pd(r21,vftabscale);
2250             vfitab           = _mm_cvttpd_epi32(rt);
2251 #ifdef __XOP__
2252             vfeps            = _mm_frcz_pd(rt);
2253 #else
2254             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2255 #endif
2256             twovfeps         = _mm_add_pd(vfeps,vfeps);
2257             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2258
2259             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2260             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2261             F                = _mm_setzero_pd();
2262             GMX_MM_TRANSPOSE2_PD(Y,F);
2263             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2264             H                = _mm_setzero_pd();
2265             GMX_MM_TRANSPOSE2_PD(G,H);
2266             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2267             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2268             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2269
2270             fscal            = felec;
2271
2272             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2273
2274             /* Update vectorial force */
2275             fix2             = _mm_macc_pd(dx21,fscal,fix2);
2276             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
2277             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
2278             
2279             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
2280             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
2281             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
2282
2283             /**************************
2284              * CALCULATE INTERACTIONS *
2285              **************************/
2286
2287             r22              = _mm_mul_pd(rsq22,rinv22);
2288
2289             /* Calculate table index by multiplying r with table scale and truncate to integer */
2290             rt               = _mm_mul_pd(r22,vftabscale);
2291             vfitab           = _mm_cvttpd_epi32(rt);
2292 #ifdef __XOP__
2293             vfeps            = _mm_frcz_pd(rt);
2294 #else
2295             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2296 #endif
2297             twovfeps         = _mm_add_pd(vfeps,vfeps);
2298             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2299
2300             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2301             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2302             F                = _mm_setzero_pd();
2303             GMX_MM_TRANSPOSE2_PD(Y,F);
2304             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2305             H                = _mm_setzero_pd();
2306             GMX_MM_TRANSPOSE2_PD(G,H);
2307             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2308             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2309             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2310
2311             fscal            = felec;
2312
2313             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2314
2315             /* Update vectorial force */
2316             fix2             = _mm_macc_pd(dx22,fscal,fix2);
2317             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
2318             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
2319             
2320             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
2321             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
2322             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
2323
2324             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2325
2326             /* Inner loop uses 400 flops */
2327         }
2328
2329         /* End of innermost loop */
2330
2331         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2332                                               f+i_coord_offset,fshift+i_shift_offset);
2333
2334         /* Increment number of inner iterations */
2335         inneriter                  += j_index_end - j_index_start;
2336
2337         /* Outer loop uses 18 flops */
2338     }
2339
2340     /* Increment number of outer iterations */
2341     outeriter        += nri;
2342
2343     /* Update outer/inner flops */
2344
2345     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*400);
2346 }