Merge branch release-4-6 into release-5-0
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_avx_128_fma_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
50 #include "kernelutil_x86_avx_128_fma_double.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
54  * Electrostatics interaction: Coulomb
55  * VdW interaction:            CubicSplineTable
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB;
77     int              j_coord_offsetA,j_coord_offsetB;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwjidx0A,vdwjidx0B;
89     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B;
91     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B;
93     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
104     real             *charge;
105     int              nvdwtype;
106     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107     int              *vdwtype;
108     real             *vdwparam;
109     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
110     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
111     __m128i          vfitab;
112     __m128i          ifour       = _mm_set1_epi32(4);
113     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
114     real             *vftab;
115     __m128d          dummy_mask,cutoff_mask;
116     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
117     __m128d          one     = _mm_set1_pd(1.0);
118     __m128d          two     = _mm_set1_pd(2.0);
119     x                = xx[0];
120     f                = ff[0];
121
122     nri              = nlist->nri;
123     iinr             = nlist->iinr;
124     jindex           = nlist->jindex;
125     jjnr             = nlist->jjnr;
126     shiftidx         = nlist->shift;
127     gid              = nlist->gid;
128     shiftvec         = fr->shift_vec[0];
129     fshift           = fr->fshift[0];
130     facel            = _mm_set1_pd(fr->epsfac);
131     charge           = mdatoms->chargeA;
132     nvdwtype         = fr->ntype;
133     vdwparam         = fr->nbfp;
134     vdwtype          = mdatoms->typeA;
135
136     vftab            = kernel_data->table_vdw->data;
137     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
138
139     /* Setup water-specific parameters */
140     inr              = nlist->iinr[0];
141     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
142     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
143     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
144     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
145
146     jq0              = _mm_set1_pd(charge[inr+0]);
147     jq1              = _mm_set1_pd(charge[inr+1]);
148     jq2              = _mm_set1_pd(charge[inr+2]);
149     vdwjidx0A        = 2*vdwtype[inr+0];
150     qq00             = _mm_mul_pd(iq0,jq0);
151     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
152     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
153     qq01             = _mm_mul_pd(iq0,jq1);
154     qq02             = _mm_mul_pd(iq0,jq2);
155     qq10             = _mm_mul_pd(iq1,jq0);
156     qq11             = _mm_mul_pd(iq1,jq1);
157     qq12             = _mm_mul_pd(iq1,jq2);
158     qq20             = _mm_mul_pd(iq2,jq0);
159     qq21             = _mm_mul_pd(iq2,jq1);
160     qq22             = _mm_mul_pd(iq2,jq2);
161
162     /* Avoid stupid compiler warnings */
163     jnrA = jnrB = 0;
164     j_coord_offsetA = 0;
165     j_coord_offsetB = 0;
166
167     outeriter        = 0;
168     inneriter        = 0;
169
170     /* Start outer loop over neighborlists */
171     for(iidx=0; iidx<nri; iidx++)
172     {
173         /* Load shift vector for this list */
174         i_shift_offset   = DIM*shiftidx[iidx];
175
176         /* Load limits for loop over neighbors */
177         j_index_start    = jindex[iidx];
178         j_index_end      = jindex[iidx+1];
179
180         /* Get outer coordinate index */
181         inr              = iinr[iidx];
182         i_coord_offset   = DIM*inr;
183
184         /* Load i particle coords and add shift vector */
185         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
186                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
187
188         fix0             = _mm_setzero_pd();
189         fiy0             = _mm_setzero_pd();
190         fiz0             = _mm_setzero_pd();
191         fix1             = _mm_setzero_pd();
192         fiy1             = _mm_setzero_pd();
193         fiz1             = _mm_setzero_pd();
194         fix2             = _mm_setzero_pd();
195         fiy2             = _mm_setzero_pd();
196         fiz2             = _mm_setzero_pd();
197
198         /* Reset potential sums */
199         velecsum         = _mm_setzero_pd();
200         vvdwsum          = _mm_setzero_pd();
201
202         /* Start inner kernel loop */
203         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
204         {
205
206             /* Get j neighbor index, and coordinate index */
207             jnrA             = jjnr[jidx];
208             jnrB             = jjnr[jidx+1];
209             j_coord_offsetA  = DIM*jnrA;
210             j_coord_offsetB  = DIM*jnrB;
211
212             /* load j atom coordinates */
213             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
214                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
215
216             /* Calculate displacement vector */
217             dx00             = _mm_sub_pd(ix0,jx0);
218             dy00             = _mm_sub_pd(iy0,jy0);
219             dz00             = _mm_sub_pd(iz0,jz0);
220             dx01             = _mm_sub_pd(ix0,jx1);
221             dy01             = _mm_sub_pd(iy0,jy1);
222             dz01             = _mm_sub_pd(iz0,jz1);
223             dx02             = _mm_sub_pd(ix0,jx2);
224             dy02             = _mm_sub_pd(iy0,jy2);
225             dz02             = _mm_sub_pd(iz0,jz2);
226             dx10             = _mm_sub_pd(ix1,jx0);
227             dy10             = _mm_sub_pd(iy1,jy0);
228             dz10             = _mm_sub_pd(iz1,jz0);
229             dx11             = _mm_sub_pd(ix1,jx1);
230             dy11             = _mm_sub_pd(iy1,jy1);
231             dz11             = _mm_sub_pd(iz1,jz1);
232             dx12             = _mm_sub_pd(ix1,jx2);
233             dy12             = _mm_sub_pd(iy1,jy2);
234             dz12             = _mm_sub_pd(iz1,jz2);
235             dx20             = _mm_sub_pd(ix2,jx0);
236             dy20             = _mm_sub_pd(iy2,jy0);
237             dz20             = _mm_sub_pd(iz2,jz0);
238             dx21             = _mm_sub_pd(ix2,jx1);
239             dy21             = _mm_sub_pd(iy2,jy1);
240             dz21             = _mm_sub_pd(iz2,jz1);
241             dx22             = _mm_sub_pd(ix2,jx2);
242             dy22             = _mm_sub_pd(iy2,jy2);
243             dz22             = _mm_sub_pd(iz2,jz2);
244
245             /* Calculate squared distance and things based on it */
246             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
247             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
248             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
249             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
250             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
251             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
252             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
253             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
254             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
255
256             rinv00           = gmx_mm_invsqrt_pd(rsq00);
257             rinv01           = gmx_mm_invsqrt_pd(rsq01);
258             rinv02           = gmx_mm_invsqrt_pd(rsq02);
259             rinv10           = gmx_mm_invsqrt_pd(rsq10);
260             rinv11           = gmx_mm_invsqrt_pd(rsq11);
261             rinv12           = gmx_mm_invsqrt_pd(rsq12);
262             rinv20           = gmx_mm_invsqrt_pd(rsq20);
263             rinv21           = gmx_mm_invsqrt_pd(rsq21);
264             rinv22           = gmx_mm_invsqrt_pd(rsq22);
265
266             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
267             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
268             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
269             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
270             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
271             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
272             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
273             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
274             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
275
276             fjx0             = _mm_setzero_pd();
277             fjy0             = _mm_setzero_pd();
278             fjz0             = _mm_setzero_pd();
279             fjx1             = _mm_setzero_pd();
280             fjy1             = _mm_setzero_pd();
281             fjz1             = _mm_setzero_pd();
282             fjx2             = _mm_setzero_pd();
283             fjy2             = _mm_setzero_pd();
284             fjz2             = _mm_setzero_pd();
285
286             /**************************
287              * CALCULATE INTERACTIONS *
288              **************************/
289
290             r00              = _mm_mul_pd(rsq00,rinv00);
291
292             /* Calculate table index by multiplying r with table scale and truncate to integer */
293             rt               = _mm_mul_pd(r00,vftabscale);
294             vfitab           = _mm_cvttpd_epi32(rt);
295 #ifdef __XOP__
296             vfeps            = _mm_frcz_pd(rt);
297 #else
298             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
299 #endif
300             twovfeps         = _mm_add_pd(vfeps,vfeps);
301             vfitab           = _mm_slli_epi32(vfitab,3);
302
303             /* COULOMB ELECTROSTATICS */
304             velec            = _mm_mul_pd(qq00,rinv00);
305             felec            = _mm_mul_pd(velec,rinvsq00);
306
307             /* CUBIC SPLINE TABLE DISPERSION */
308             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
309             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
310             GMX_MM_TRANSPOSE2_PD(Y,F);
311             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
312             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
313             GMX_MM_TRANSPOSE2_PD(G,H);
314             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
315             VV               = _mm_macc_pd(vfeps,Fp,Y);
316             vvdw6            = _mm_mul_pd(c6_00,VV);
317             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
318             fvdw6            = _mm_mul_pd(c6_00,FF);
319
320             /* CUBIC SPLINE TABLE REPULSION */
321             vfitab           = _mm_add_epi32(vfitab,ifour);
322             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
323             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
324             GMX_MM_TRANSPOSE2_PD(Y,F);
325             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
326             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
327             GMX_MM_TRANSPOSE2_PD(G,H);
328             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
329             VV               = _mm_macc_pd(vfeps,Fp,Y);
330             vvdw12           = _mm_mul_pd(c12_00,VV);
331             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
332             fvdw12           = _mm_mul_pd(c12_00,FF);
333             vvdw             = _mm_add_pd(vvdw12,vvdw6);
334             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
335
336             /* Update potential sum for this i atom from the interaction with this j atom. */
337             velecsum         = _mm_add_pd(velecsum,velec);
338             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
339
340             fscal            = _mm_add_pd(felec,fvdw);
341
342             /* Update vectorial force */
343             fix0             = _mm_macc_pd(dx00,fscal,fix0);
344             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
345             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
346             
347             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
348             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
349             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
350
351             /**************************
352              * CALCULATE INTERACTIONS *
353              **************************/
354
355             /* COULOMB ELECTROSTATICS */
356             velec            = _mm_mul_pd(qq01,rinv01);
357             felec            = _mm_mul_pd(velec,rinvsq01);
358
359             /* Update potential sum for this i atom from the interaction with this j atom. */
360             velecsum         = _mm_add_pd(velecsum,velec);
361
362             fscal            = felec;
363
364             /* Update vectorial force */
365             fix0             = _mm_macc_pd(dx01,fscal,fix0);
366             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
367             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
368             
369             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
370             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
371             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
372
373             /**************************
374              * CALCULATE INTERACTIONS *
375              **************************/
376
377             /* COULOMB ELECTROSTATICS */
378             velec            = _mm_mul_pd(qq02,rinv02);
379             felec            = _mm_mul_pd(velec,rinvsq02);
380
381             /* Update potential sum for this i atom from the interaction with this j atom. */
382             velecsum         = _mm_add_pd(velecsum,velec);
383
384             fscal            = felec;
385
386             /* Update vectorial force */
387             fix0             = _mm_macc_pd(dx02,fscal,fix0);
388             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
389             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
390             
391             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
392             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
393             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
394
395             /**************************
396              * CALCULATE INTERACTIONS *
397              **************************/
398
399             /* COULOMB ELECTROSTATICS */
400             velec            = _mm_mul_pd(qq10,rinv10);
401             felec            = _mm_mul_pd(velec,rinvsq10);
402
403             /* Update potential sum for this i atom from the interaction with this j atom. */
404             velecsum         = _mm_add_pd(velecsum,velec);
405
406             fscal            = felec;
407
408             /* Update vectorial force */
409             fix1             = _mm_macc_pd(dx10,fscal,fix1);
410             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
411             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
412             
413             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
414             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
415             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
416
417             /**************************
418              * CALCULATE INTERACTIONS *
419              **************************/
420
421             /* COULOMB ELECTROSTATICS */
422             velec            = _mm_mul_pd(qq11,rinv11);
423             felec            = _mm_mul_pd(velec,rinvsq11);
424
425             /* Update potential sum for this i atom from the interaction with this j atom. */
426             velecsum         = _mm_add_pd(velecsum,velec);
427
428             fscal            = felec;
429
430             /* Update vectorial force */
431             fix1             = _mm_macc_pd(dx11,fscal,fix1);
432             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
433             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
434             
435             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
436             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
437             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
438
439             /**************************
440              * CALCULATE INTERACTIONS *
441              **************************/
442
443             /* COULOMB ELECTROSTATICS */
444             velec            = _mm_mul_pd(qq12,rinv12);
445             felec            = _mm_mul_pd(velec,rinvsq12);
446
447             /* Update potential sum for this i atom from the interaction with this j atom. */
448             velecsum         = _mm_add_pd(velecsum,velec);
449
450             fscal            = felec;
451
452             /* Update vectorial force */
453             fix1             = _mm_macc_pd(dx12,fscal,fix1);
454             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
455             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
456             
457             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
458             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
459             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
460
461             /**************************
462              * CALCULATE INTERACTIONS *
463              **************************/
464
465             /* COULOMB ELECTROSTATICS */
466             velec            = _mm_mul_pd(qq20,rinv20);
467             felec            = _mm_mul_pd(velec,rinvsq20);
468
469             /* Update potential sum for this i atom from the interaction with this j atom. */
470             velecsum         = _mm_add_pd(velecsum,velec);
471
472             fscal            = felec;
473
474             /* Update vectorial force */
475             fix2             = _mm_macc_pd(dx20,fscal,fix2);
476             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
477             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
478             
479             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
480             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
481             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
482
483             /**************************
484              * CALCULATE INTERACTIONS *
485              **************************/
486
487             /* COULOMB ELECTROSTATICS */
488             velec            = _mm_mul_pd(qq21,rinv21);
489             felec            = _mm_mul_pd(velec,rinvsq21);
490
491             /* Update potential sum for this i atom from the interaction with this j atom. */
492             velecsum         = _mm_add_pd(velecsum,velec);
493
494             fscal            = felec;
495
496             /* Update vectorial force */
497             fix2             = _mm_macc_pd(dx21,fscal,fix2);
498             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
499             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
500             
501             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
502             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
503             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
504
505             /**************************
506              * CALCULATE INTERACTIONS *
507              **************************/
508
509             /* COULOMB ELECTROSTATICS */
510             velec            = _mm_mul_pd(qq22,rinv22);
511             felec            = _mm_mul_pd(velec,rinvsq22);
512
513             /* Update potential sum for this i atom from the interaction with this j atom. */
514             velecsum         = _mm_add_pd(velecsum,velec);
515
516             fscal            = felec;
517
518             /* Update vectorial force */
519             fix2             = _mm_macc_pd(dx22,fscal,fix2);
520             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
521             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
522             
523             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
524             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
525             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
526
527             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
528
529             /* Inner loop uses 314 flops */
530         }
531
532         if(jidx<j_index_end)
533         {
534
535             jnrA             = jjnr[jidx];
536             j_coord_offsetA  = DIM*jnrA;
537
538             /* load j atom coordinates */
539             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
540                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
541
542             /* Calculate displacement vector */
543             dx00             = _mm_sub_pd(ix0,jx0);
544             dy00             = _mm_sub_pd(iy0,jy0);
545             dz00             = _mm_sub_pd(iz0,jz0);
546             dx01             = _mm_sub_pd(ix0,jx1);
547             dy01             = _mm_sub_pd(iy0,jy1);
548             dz01             = _mm_sub_pd(iz0,jz1);
549             dx02             = _mm_sub_pd(ix0,jx2);
550             dy02             = _mm_sub_pd(iy0,jy2);
551             dz02             = _mm_sub_pd(iz0,jz2);
552             dx10             = _mm_sub_pd(ix1,jx0);
553             dy10             = _mm_sub_pd(iy1,jy0);
554             dz10             = _mm_sub_pd(iz1,jz0);
555             dx11             = _mm_sub_pd(ix1,jx1);
556             dy11             = _mm_sub_pd(iy1,jy1);
557             dz11             = _mm_sub_pd(iz1,jz1);
558             dx12             = _mm_sub_pd(ix1,jx2);
559             dy12             = _mm_sub_pd(iy1,jy2);
560             dz12             = _mm_sub_pd(iz1,jz2);
561             dx20             = _mm_sub_pd(ix2,jx0);
562             dy20             = _mm_sub_pd(iy2,jy0);
563             dz20             = _mm_sub_pd(iz2,jz0);
564             dx21             = _mm_sub_pd(ix2,jx1);
565             dy21             = _mm_sub_pd(iy2,jy1);
566             dz21             = _mm_sub_pd(iz2,jz1);
567             dx22             = _mm_sub_pd(ix2,jx2);
568             dy22             = _mm_sub_pd(iy2,jy2);
569             dz22             = _mm_sub_pd(iz2,jz2);
570
571             /* Calculate squared distance and things based on it */
572             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
573             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
574             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
575             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
576             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
577             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
578             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
579             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
580             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
581
582             rinv00           = gmx_mm_invsqrt_pd(rsq00);
583             rinv01           = gmx_mm_invsqrt_pd(rsq01);
584             rinv02           = gmx_mm_invsqrt_pd(rsq02);
585             rinv10           = gmx_mm_invsqrt_pd(rsq10);
586             rinv11           = gmx_mm_invsqrt_pd(rsq11);
587             rinv12           = gmx_mm_invsqrt_pd(rsq12);
588             rinv20           = gmx_mm_invsqrt_pd(rsq20);
589             rinv21           = gmx_mm_invsqrt_pd(rsq21);
590             rinv22           = gmx_mm_invsqrt_pd(rsq22);
591
592             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
593             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
594             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
595             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
596             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
597             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
598             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
599             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
600             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
601
602             fjx0             = _mm_setzero_pd();
603             fjy0             = _mm_setzero_pd();
604             fjz0             = _mm_setzero_pd();
605             fjx1             = _mm_setzero_pd();
606             fjy1             = _mm_setzero_pd();
607             fjz1             = _mm_setzero_pd();
608             fjx2             = _mm_setzero_pd();
609             fjy2             = _mm_setzero_pd();
610             fjz2             = _mm_setzero_pd();
611
612             /**************************
613              * CALCULATE INTERACTIONS *
614              **************************/
615
616             r00              = _mm_mul_pd(rsq00,rinv00);
617
618             /* Calculate table index by multiplying r with table scale and truncate to integer */
619             rt               = _mm_mul_pd(r00,vftabscale);
620             vfitab           = _mm_cvttpd_epi32(rt);
621 #ifdef __XOP__
622             vfeps            = _mm_frcz_pd(rt);
623 #else
624             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
625 #endif
626             twovfeps         = _mm_add_pd(vfeps,vfeps);
627             vfitab           = _mm_slli_epi32(vfitab,3);
628
629             /* COULOMB ELECTROSTATICS */
630             velec            = _mm_mul_pd(qq00,rinv00);
631             felec            = _mm_mul_pd(velec,rinvsq00);
632
633             /* CUBIC SPLINE TABLE DISPERSION */
634             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
635             F                = _mm_setzero_pd();
636             GMX_MM_TRANSPOSE2_PD(Y,F);
637             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
638             H                = _mm_setzero_pd();
639             GMX_MM_TRANSPOSE2_PD(G,H);
640             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
641             VV               = _mm_macc_pd(vfeps,Fp,Y);
642             vvdw6            = _mm_mul_pd(c6_00,VV);
643             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
644             fvdw6            = _mm_mul_pd(c6_00,FF);
645
646             /* CUBIC SPLINE TABLE REPULSION */
647             vfitab           = _mm_add_epi32(vfitab,ifour);
648             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
649             F                = _mm_setzero_pd();
650             GMX_MM_TRANSPOSE2_PD(Y,F);
651             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
652             H                = _mm_setzero_pd();
653             GMX_MM_TRANSPOSE2_PD(G,H);
654             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
655             VV               = _mm_macc_pd(vfeps,Fp,Y);
656             vvdw12           = _mm_mul_pd(c12_00,VV);
657             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
658             fvdw12           = _mm_mul_pd(c12_00,FF);
659             vvdw             = _mm_add_pd(vvdw12,vvdw6);
660             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
661
662             /* Update potential sum for this i atom from the interaction with this j atom. */
663             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
664             velecsum         = _mm_add_pd(velecsum,velec);
665             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
666             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
667
668             fscal            = _mm_add_pd(felec,fvdw);
669
670             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
671
672             /* Update vectorial force */
673             fix0             = _mm_macc_pd(dx00,fscal,fix0);
674             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
675             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
676             
677             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
678             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
679             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
680
681             /**************************
682              * CALCULATE INTERACTIONS *
683              **************************/
684
685             /* COULOMB ELECTROSTATICS */
686             velec            = _mm_mul_pd(qq01,rinv01);
687             felec            = _mm_mul_pd(velec,rinvsq01);
688
689             /* Update potential sum for this i atom from the interaction with this j atom. */
690             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
691             velecsum         = _mm_add_pd(velecsum,velec);
692
693             fscal            = felec;
694
695             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
696
697             /* Update vectorial force */
698             fix0             = _mm_macc_pd(dx01,fscal,fix0);
699             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
700             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
701             
702             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
703             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
704             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
705
706             /**************************
707              * CALCULATE INTERACTIONS *
708              **************************/
709
710             /* COULOMB ELECTROSTATICS */
711             velec            = _mm_mul_pd(qq02,rinv02);
712             felec            = _mm_mul_pd(velec,rinvsq02);
713
714             /* Update potential sum for this i atom from the interaction with this j atom. */
715             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
716             velecsum         = _mm_add_pd(velecsum,velec);
717
718             fscal            = felec;
719
720             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
721
722             /* Update vectorial force */
723             fix0             = _mm_macc_pd(dx02,fscal,fix0);
724             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
725             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
726             
727             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
728             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
729             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
730
731             /**************************
732              * CALCULATE INTERACTIONS *
733              **************************/
734
735             /* COULOMB ELECTROSTATICS */
736             velec            = _mm_mul_pd(qq10,rinv10);
737             felec            = _mm_mul_pd(velec,rinvsq10);
738
739             /* Update potential sum for this i atom from the interaction with this j atom. */
740             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
741             velecsum         = _mm_add_pd(velecsum,velec);
742
743             fscal            = felec;
744
745             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
746
747             /* Update vectorial force */
748             fix1             = _mm_macc_pd(dx10,fscal,fix1);
749             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
750             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
751             
752             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
753             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
754             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
755
756             /**************************
757              * CALCULATE INTERACTIONS *
758              **************************/
759
760             /* COULOMB ELECTROSTATICS */
761             velec            = _mm_mul_pd(qq11,rinv11);
762             felec            = _mm_mul_pd(velec,rinvsq11);
763
764             /* Update potential sum for this i atom from the interaction with this j atom. */
765             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
766             velecsum         = _mm_add_pd(velecsum,velec);
767
768             fscal            = felec;
769
770             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
771
772             /* Update vectorial force */
773             fix1             = _mm_macc_pd(dx11,fscal,fix1);
774             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
775             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
776             
777             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
778             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
779             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
780
781             /**************************
782              * CALCULATE INTERACTIONS *
783              **************************/
784
785             /* COULOMB ELECTROSTATICS */
786             velec            = _mm_mul_pd(qq12,rinv12);
787             felec            = _mm_mul_pd(velec,rinvsq12);
788
789             /* Update potential sum for this i atom from the interaction with this j atom. */
790             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
791             velecsum         = _mm_add_pd(velecsum,velec);
792
793             fscal            = felec;
794
795             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
796
797             /* Update vectorial force */
798             fix1             = _mm_macc_pd(dx12,fscal,fix1);
799             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
800             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
801             
802             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
803             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
804             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
805
806             /**************************
807              * CALCULATE INTERACTIONS *
808              **************************/
809
810             /* COULOMB ELECTROSTATICS */
811             velec            = _mm_mul_pd(qq20,rinv20);
812             felec            = _mm_mul_pd(velec,rinvsq20);
813
814             /* Update potential sum for this i atom from the interaction with this j atom. */
815             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
816             velecsum         = _mm_add_pd(velecsum,velec);
817
818             fscal            = felec;
819
820             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
821
822             /* Update vectorial force */
823             fix2             = _mm_macc_pd(dx20,fscal,fix2);
824             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
825             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
826             
827             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
828             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
829             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
830
831             /**************************
832              * CALCULATE INTERACTIONS *
833              **************************/
834
835             /* COULOMB ELECTROSTATICS */
836             velec            = _mm_mul_pd(qq21,rinv21);
837             felec            = _mm_mul_pd(velec,rinvsq21);
838
839             /* Update potential sum for this i atom from the interaction with this j atom. */
840             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
841             velecsum         = _mm_add_pd(velecsum,velec);
842
843             fscal            = felec;
844
845             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
846
847             /* Update vectorial force */
848             fix2             = _mm_macc_pd(dx21,fscal,fix2);
849             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
850             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
851             
852             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
853             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
854             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
855
856             /**************************
857              * CALCULATE INTERACTIONS *
858              **************************/
859
860             /* COULOMB ELECTROSTATICS */
861             velec            = _mm_mul_pd(qq22,rinv22);
862             felec            = _mm_mul_pd(velec,rinvsq22);
863
864             /* Update potential sum for this i atom from the interaction with this j atom. */
865             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
866             velecsum         = _mm_add_pd(velecsum,velec);
867
868             fscal            = felec;
869
870             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
871
872             /* Update vectorial force */
873             fix2             = _mm_macc_pd(dx22,fscal,fix2);
874             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
875             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
876             
877             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
878             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
879             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
880
881             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
882
883             /* Inner loop uses 314 flops */
884         }
885
886         /* End of innermost loop */
887
888         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
889                                               f+i_coord_offset,fshift+i_shift_offset);
890
891         ggid                        = gid[iidx];
892         /* Update potential energies */
893         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
894         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
895
896         /* Increment number of inner iterations */
897         inneriter                  += j_index_end - j_index_start;
898
899         /* Outer loop uses 20 flops */
900     }
901
902     /* Increment number of outer iterations */
903     outeriter        += nri;
904
905     /* Update outer/inner flops */
906
907     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*314);
908 }
909 /*
910  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_double
911  * Electrostatics interaction: Coulomb
912  * VdW interaction:            CubicSplineTable
913  * Geometry:                   Water3-Water3
914  * Calculate force/pot:        Force
915  */
916 void
917 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_double
918                     (t_nblist                    * gmx_restrict       nlist,
919                      rvec                        * gmx_restrict          xx,
920                      rvec                        * gmx_restrict          ff,
921                      t_forcerec                  * gmx_restrict          fr,
922                      t_mdatoms                   * gmx_restrict     mdatoms,
923                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
924                      t_nrnb                      * gmx_restrict        nrnb)
925 {
926     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
927      * just 0 for non-waters.
928      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
929      * jnr indices corresponding to data put in the four positions in the SIMD register.
930      */
931     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
932     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
933     int              jnrA,jnrB;
934     int              j_coord_offsetA,j_coord_offsetB;
935     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
936     real             rcutoff_scalar;
937     real             *shiftvec,*fshift,*x,*f;
938     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
939     int              vdwioffset0;
940     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
941     int              vdwioffset1;
942     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
943     int              vdwioffset2;
944     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
945     int              vdwjidx0A,vdwjidx0B;
946     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
947     int              vdwjidx1A,vdwjidx1B;
948     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
949     int              vdwjidx2A,vdwjidx2B;
950     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
951     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
952     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
953     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
954     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
955     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
956     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
957     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
958     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
959     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
960     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
961     real             *charge;
962     int              nvdwtype;
963     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
964     int              *vdwtype;
965     real             *vdwparam;
966     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
967     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
968     __m128i          vfitab;
969     __m128i          ifour       = _mm_set1_epi32(4);
970     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
971     real             *vftab;
972     __m128d          dummy_mask,cutoff_mask;
973     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
974     __m128d          one     = _mm_set1_pd(1.0);
975     __m128d          two     = _mm_set1_pd(2.0);
976     x                = xx[0];
977     f                = ff[0];
978
979     nri              = nlist->nri;
980     iinr             = nlist->iinr;
981     jindex           = nlist->jindex;
982     jjnr             = nlist->jjnr;
983     shiftidx         = nlist->shift;
984     gid              = nlist->gid;
985     shiftvec         = fr->shift_vec[0];
986     fshift           = fr->fshift[0];
987     facel            = _mm_set1_pd(fr->epsfac);
988     charge           = mdatoms->chargeA;
989     nvdwtype         = fr->ntype;
990     vdwparam         = fr->nbfp;
991     vdwtype          = mdatoms->typeA;
992
993     vftab            = kernel_data->table_vdw->data;
994     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
995
996     /* Setup water-specific parameters */
997     inr              = nlist->iinr[0];
998     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
999     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1000     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1001     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1002
1003     jq0              = _mm_set1_pd(charge[inr+0]);
1004     jq1              = _mm_set1_pd(charge[inr+1]);
1005     jq2              = _mm_set1_pd(charge[inr+2]);
1006     vdwjidx0A        = 2*vdwtype[inr+0];
1007     qq00             = _mm_mul_pd(iq0,jq0);
1008     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1009     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1010     qq01             = _mm_mul_pd(iq0,jq1);
1011     qq02             = _mm_mul_pd(iq0,jq2);
1012     qq10             = _mm_mul_pd(iq1,jq0);
1013     qq11             = _mm_mul_pd(iq1,jq1);
1014     qq12             = _mm_mul_pd(iq1,jq2);
1015     qq20             = _mm_mul_pd(iq2,jq0);
1016     qq21             = _mm_mul_pd(iq2,jq1);
1017     qq22             = _mm_mul_pd(iq2,jq2);
1018
1019     /* Avoid stupid compiler warnings */
1020     jnrA = jnrB = 0;
1021     j_coord_offsetA = 0;
1022     j_coord_offsetB = 0;
1023
1024     outeriter        = 0;
1025     inneriter        = 0;
1026
1027     /* Start outer loop over neighborlists */
1028     for(iidx=0; iidx<nri; iidx++)
1029     {
1030         /* Load shift vector for this list */
1031         i_shift_offset   = DIM*shiftidx[iidx];
1032
1033         /* Load limits for loop over neighbors */
1034         j_index_start    = jindex[iidx];
1035         j_index_end      = jindex[iidx+1];
1036
1037         /* Get outer coordinate index */
1038         inr              = iinr[iidx];
1039         i_coord_offset   = DIM*inr;
1040
1041         /* Load i particle coords and add shift vector */
1042         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1043                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1044
1045         fix0             = _mm_setzero_pd();
1046         fiy0             = _mm_setzero_pd();
1047         fiz0             = _mm_setzero_pd();
1048         fix1             = _mm_setzero_pd();
1049         fiy1             = _mm_setzero_pd();
1050         fiz1             = _mm_setzero_pd();
1051         fix2             = _mm_setzero_pd();
1052         fiy2             = _mm_setzero_pd();
1053         fiz2             = _mm_setzero_pd();
1054
1055         /* Start inner kernel loop */
1056         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1057         {
1058
1059             /* Get j neighbor index, and coordinate index */
1060             jnrA             = jjnr[jidx];
1061             jnrB             = jjnr[jidx+1];
1062             j_coord_offsetA  = DIM*jnrA;
1063             j_coord_offsetB  = DIM*jnrB;
1064
1065             /* load j atom coordinates */
1066             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1067                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1068
1069             /* Calculate displacement vector */
1070             dx00             = _mm_sub_pd(ix0,jx0);
1071             dy00             = _mm_sub_pd(iy0,jy0);
1072             dz00             = _mm_sub_pd(iz0,jz0);
1073             dx01             = _mm_sub_pd(ix0,jx1);
1074             dy01             = _mm_sub_pd(iy0,jy1);
1075             dz01             = _mm_sub_pd(iz0,jz1);
1076             dx02             = _mm_sub_pd(ix0,jx2);
1077             dy02             = _mm_sub_pd(iy0,jy2);
1078             dz02             = _mm_sub_pd(iz0,jz2);
1079             dx10             = _mm_sub_pd(ix1,jx0);
1080             dy10             = _mm_sub_pd(iy1,jy0);
1081             dz10             = _mm_sub_pd(iz1,jz0);
1082             dx11             = _mm_sub_pd(ix1,jx1);
1083             dy11             = _mm_sub_pd(iy1,jy1);
1084             dz11             = _mm_sub_pd(iz1,jz1);
1085             dx12             = _mm_sub_pd(ix1,jx2);
1086             dy12             = _mm_sub_pd(iy1,jy2);
1087             dz12             = _mm_sub_pd(iz1,jz2);
1088             dx20             = _mm_sub_pd(ix2,jx0);
1089             dy20             = _mm_sub_pd(iy2,jy0);
1090             dz20             = _mm_sub_pd(iz2,jz0);
1091             dx21             = _mm_sub_pd(ix2,jx1);
1092             dy21             = _mm_sub_pd(iy2,jy1);
1093             dz21             = _mm_sub_pd(iz2,jz1);
1094             dx22             = _mm_sub_pd(ix2,jx2);
1095             dy22             = _mm_sub_pd(iy2,jy2);
1096             dz22             = _mm_sub_pd(iz2,jz2);
1097
1098             /* Calculate squared distance and things based on it */
1099             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1100             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1101             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1102             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1103             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1104             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1105             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1106             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1107             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1108
1109             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1110             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1111             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1112             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1113             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1114             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1115             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1116             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1117             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1118
1119             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1120             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1121             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1122             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1123             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1124             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1125             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1126             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1127             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1128
1129             fjx0             = _mm_setzero_pd();
1130             fjy0             = _mm_setzero_pd();
1131             fjz0             = _mm_setzero_pd();
1132             fjx1             = _mm_setzero_pd();
1133             fjy1             = _mm_setzero_pd();
1134             fjz1             = _mm_setzero_pd();
1135             fjx2             = _mm_setzero_pd();
1136             fjy2             = _mm_setzero_pd();
1137             fjz2             = _mm_setzero_pd();
1138
1139             /**************************
1140              * CALCULATE INTERACTIONS *
1141              **************************/
1142
1143             r00              = _mm_mul_pd(rsq00,rinv00);
1144
1145             /* Calculate table index by multiplying r with table scale and truncate to integer */
1146             rt               = _mm_mul_pd(r00,vftabscale);
1147             vfitab           = _mm_cvttpd_epi32(rt);
1148 #ifdef __XOP__
1149             vfeps            = _mm_frcz_pd(rt);
1150 #else
1151             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1152 #endif
1153             twovfeps         = _mm_add_pd(vfeps,vfeps);
1154             vfitab           = _mm_slli_epi32(vfitab,3);
1155
1156             /* COULOMB ELECTROSTATICS */
1157             velec            = _mm_mul_pd(qq00,rinv00);
1158             felec            = _mm_mul_pd(velec,rinvsq00);
1159
1160             /* CUBIC SPLINE TABLE DISPERSION */
1161             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1162             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1163             GMX_MM_TRANSPOSE2_PD(Y,F);
1164             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1165             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1166             GMX_MM_TRANSPOSE2_PD(G,H);
1167             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1168             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1169             fvdw6            = _mm_mul_pd(c6_00,FF);
1170
1171             /* CUBIC SPLINE TABLE REPULSION */
1172             vfitab           = _mm_add_epi32(vfitab,ifour);
1173             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1174             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1175             GMX_MM_TRANSPOSE2_PD(Y,F);
1176             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1177             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1178             GMX_MM_TRANSPOSE2_PD(G,H);
1179             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1180             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1181             fvdw12           = _mm_mul_pd(c12_00,FF);
1182             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1183
1184             fscal            = _mm_add_pd(felec,fvdw);
1185
1186             /* Update vectorial force */
1187             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1188             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1189             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1190             
1191             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1192             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1193             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1194
1195             /**************************
1196              * CALCULATE INTERACTIONS *
1197              **************************/
1198
1199             /* COULOMB ELECTROSTATICS */
1200             velec            = _mm_mul_pd(qq01,rinv01);
1201             felec            = _mm_mul_pd(velec,rinvsq01);
1202
1203             fscal            = felec;
1204
1205             /* Update vectorial force */
1206             fix0             = _mm_macc_pd(dx01,fscal,fix0);
1207             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
1208             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
1209             
1210             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
1211             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
1212             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
1213
1214             /**************************
1215              * CALCULATE INTERACTIONS *
1216              **************************/
1217
1218             /* COULOMB ELECTROSTATICS */
1219             velec            = _mm_mul_pd(qq02,rinv02);
1220             felec            = _mm_mul_pd(velec,rinvsq02);
1221
1222             fscal            = felec;
1223
1224             /* Update vectorial force */
1225             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1226             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1227             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1228             
1229             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1230             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1231             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1232
1233             /**************************
1234              * CALCULATE INTERACTIONS *
1235              **************************/
1236
1237             /* COULOMB ELECTROSTATICS */
1238             velec            = _mm_mul_pd(qq10,rinv10);
1239             felec            = _mm_mul_pd(velec,rinvsq10);
1240
1241             fscal            = felec;
1242
1243             /* Update vectorial force */
1244             fix1             = _mm_macc_pd(dx10,fscal,fix1);
1245             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
1246             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
1247             
1248             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
1249             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
1250             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
1251
1252             /**************************
1253              * CALCULATE INTERACTIONS *
1254              **************************/
1255
1256             /* COULOMB ELECTROSTATICS */
1257             velec            = _mm_mul_pd(qq11,rinv11);
1258             felec            = _mm_mul_pd(velec,rinvsq11);
1259
1260             fscal            = felec;
1261
1262             /* Update vectorial force */
1263             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1264             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1265             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1266             
1267             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1268             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1269             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1270
1271             /**************************
1272              * CALCULATE INTERACTIONS *
1273              **************************/
1274
1275             /* COULOMB ELECTROSTATICS */
1276             velec            = _mm_mul_pd(qq12,rinv12);
1277             felec            = _mm_mul_pd(velec,rinvsq12);
1278
1279             fscal            = felec;
1280
1281             /* Update vectorial force */
1282             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1283             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1284             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1285             
1286             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1287             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1288             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1289
1290             /**************************
1291              * CALCULATE INTERACTIONS *
1292              **************************/
1293
1294             /* COULOMB ELECTROSTATICS */
1295             velec            = _mm_mul_pd(qq20,rinv20);
1296             felec            = _mm_mul_pd(velec,rinvsq20);
1297
1298             fscal            = felec;
1299
1300             /* Update vectorial force */
1301             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1302             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1303             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1304             
1305             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1306             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1307             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1308
1309             /**************************
1310              * CALCULATE INTERACTIONS *
1311              **************************/
1312
1313             /* COULOMB ELECTROSTATICS */
1314             velec            = _mm_mul_pd(qq21,rinv21);
1315             felec            = _mm_mul_pd(velec,rinvsq21);
1316
1317             fscal            = felec;
1318
1319             /* Update vectorial force */
1320             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1321             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1322             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1323             
1324             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1325             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1326             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1327
1328             /**************************
1329              * CALCULATE INTERACTIONS *
1330              **************************/
1331
1332             /* COULOMB ELECTROSTATICS */
1333             velec            = _mm_mul_pd(qq22,rinv22);
1334             felec            = _mm_mul_pd(velec,rinvsq22);
1335
1336             fscal            = felec;
1337
1338             /* Update vectorial force */
1339             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1340             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1341             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1342             
1343             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1344             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1345             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1346
1347             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1348
1349             /* Inner loop uses 297 flops */
1350         }
1351
1352         if(jidx<j_index_end)
1353         {
1354
1355             jnrA             = jjnr[jidx];
1356             j_coord_offsetA  = DIM*jnrA;
1357
1358             /* load j atom coordinates */
1359             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1360                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1361
1362             /* Calculate displacement vector */
1363             dx00             = _mm_sub_pd(ix0,jx0);
1364             dy00             = _mm_sub_pd(iy0,jy0);
1365             dz00             = _mm_sub_pd(iz0,jz0);
1366             dx01             = _mm_sub_pd(ix0,jx1);
1367             dy01             = _mm_sub_pd(iy0,jy1);
1368             dz01             = _mm_sub_pd(iz0,jz1);
1369             dx02             = _mm_sub_pd(ix0,jx2);
1370             dy02             = _mm_sub_pd(iy0,jy2);
1371             dz02             = _mm_sub_pd(iz0,jz2);
1372             dx10             = _mm_sub_pd(ix1,jx0);
1373             dy10             = _mm_sub_pd(iy1,jy0);
1374             dz10             = _mm_sub_pd(iz1,jz0);
1375             dx11             = _mm_sub_pd(ix1,jx1);
1376             dy11             = _mm_sub_pd(iy1,jy1);
1377             dz11             = _mm_sub_pd(iz1,jz1);
1378             dx12             = _mm_sub_pd(ix1,jx2);
1379             dy12             = _mm_sub_pd(iy1,jy2);
1380             dz12             = _mm_sub_pd(iz1,jz2);
1381             dx20             = _mm_sub_pd(ix2,jx0);
1382             dy20             = _mm_sub_pd(iy2,jy0);
1383             dz20             = _mm_sub_pd(iz2,jz0);
1384             dx21             = _mm_sub_pd(ix2,jx1);
1385             dy21             = _mm_sub_pd(iy2,jy1);
1386             dz21             = _mm_sub_pd(iz2,jz1);
1387             dx22             = _mm_sub_pd(ix2,jx2);
1388             dy22             = _mm_sub_pd(iy2,jy2);
1389             dz22             = _mm_sub_pd(iz2,jz2);
1390
1391             /* Calculate squared distance and things based on it */
1392             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1393             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1394             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1395             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1396             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1397             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1398             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1399             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1400             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1401
1402             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1403             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1404             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1405             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1406             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1407             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1408             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1409             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1410             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1411
1412             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1413             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1414             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1415             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1416             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1417             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1418             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1419             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1420             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1421
1422             fjx0             = _mm_setzero_pd();
1423             fjy0             = _mm_setzero_pd();
1424             fjz0             = _mm_setzero_pd();
1425             fjx1             = _mm_setzero_pd();
1426             fjy1             = _mm_setzero_pd();
1427             fjz1             = _mm_setzero_pd();
1428             fjx2             = _mm_setzero_pd();
1429             fjy2             = _mm_setzero_pd();
1430             fjz2             = _mm_setzero_pd();
1431
1432             /**************************
1433              * CALCULATE INTERACTIONS *
1434              **************************/
1435
1436             r00              = _mm_mul_pd(rsq00,rinv00);
1437
1438             /* Calculate table index by multiplying r with table scale and truncate to integer */
1439             rt               = _mm_mul_pd(r00,vftabscale);
1440             vfitab           = _mm_cvttpd_epi32(rt);
1441 #ifdef __XOP__
1442             vfeps            = _mm_frcz_pd(rt);
1443 #else
1444             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1445 #endif
1446             twovfeps         = _mm_add_pd(vfeps,vfeps);
1447             vfitab           = _mm_slli_epi32(vfitab,3);
1448
1449             /* COULOMB ELECTROSTATICS */
1450             velec            = _mm_mul_pd(qq00,rinv00);
1451             felec            = _mm_mul_pd(velec,rinvsq00);
1452
1453             /* CUBIC SPLINE TABLE DISPERSION */
1454             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1455             F                = _mm_setzero_pd();
1456             GMX_MM_TRANSPOSE2_PD(Y,F);
1457             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1458             H                = _mm_setzero_pd();
1459             GMX_MM_TRANSPOSE2_PD(G,H);
1460             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1461             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1462             fvdw6            = _mm_mul_pd(c6_00,FF);
1463
1464             /* CUBIC SPLINE TABLE REPULSION */
1465             vfitab           = _mm_add_epi32(vfitab,ifour);
1466             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1467             F                = _mm_setzero_pd();
1468             GMX_MM_TRANSPOSE2_PD(Y,F);
1469             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1470             H                = _mm_setzero_pd();
1471             GMX_MM_TRANSPOSE2_PD(G,H);
1472             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1473             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1474             fvdw12           = _mm_mul_pd(c12_00,FF);
1475             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1476
1477             fscal            = _mm_add_pd(felec,fvdw);
1478
1479             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1480
1481             /* Update vectorial force */
1482             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1483             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1484             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1485             
1486             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1487             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1488             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1489
1490             /**************************
1491              * CALCULATE INTERACTIONS *
1492              **************************/
1493
1494             /* COULOMB ELECTROSTATICS */
1495             velec            = _mm_mul_pd(qq01,rinv01);
1496             felec            = _mm_mul_pd(velec,rinvsq01);
1497
1498             fscal            = felec;
1499
1500             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1501
1502             /* Update vectorial force */
1503             fix0             = _mm_macc_pd(dx01,fscal,fix0);
1504             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
1505             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
1506             
1507             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
1508             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
1509             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
1510
1511             /**************************
1512              * CALCULATE INTERACTIONS *
1513              **************************/
1514
1515             /* COULOMB ELECTROSTATICS */
1516             velec            = _mm_mul_pd(qq02,rinv02);
1517             felec            = _mm_mul_pd(velec,rinvsq02);
1518
1519             fscal            = felec;
1520
1521             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1522
1523             /* Update vectorial force */
1524             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1525             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1526             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1527             
1528             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1529             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1530             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1531
1532             /**************************
1533              * CALCULATE INTERACTIONS *
1534              **************************/
1535
1536             /* COULOMB ELECTROSTATICS */
1537             velec            = _mm_mul_pd(qq10,rinv10);
1538             felec            = _mm_mul_pd(velec,rinvsq10);
1539
1540             fscal            = felec;
1541
1542             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1543
1544             /* Update vectorial force */
1545             fix1             = _mm_macc_pd(dx10,fscal,fix1);
1546             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
1547             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
1548             
1549             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
1550             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
1551             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
1552
1553             /**************************
1554              * CALCULATE INTERACTIONS *
1555              **************************/
1556
1557             /* COULOMB ELECTROSTATICS */
1558             velec            = _mm_mul_pd(qq11,rinv11);
1559             felec            = _mm_mul_pd(velec,rinvsq11);
1560
1561             fscal            = felec;
1562
1563             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1564
1565             /* Update vectorial force */
1566             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1567             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1568             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1569             
1570             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1571             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1572             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1573
1574             /**************************
1575              * CALCULATE INTERACTIONS *
1576              **************************/
1577
1578             /* COULOMB ELECTROSTATICS */
1579             velec            = _mm_mul_pd(qq12,rinv12);
1580             felec            = _mm_mul_pd(velec,rinvsq12);
1581
1582             fscal            = felec;
1583
1584             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1585
1586             /* Update vectorial force */
1587             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1588             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1589             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1590             
1591             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1592             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1593             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1594
1595             /**************************
1596              * CALCULATE INTERACTIONS *
1597              **************************/
1598
1599             /* COULOMB ELECTROSTATICS */
1600             velec            = _mm_mul_pd(qq20,rinv20);
1601             felec            = _mm_mul_pd(velec,rinvsq20);
1602
1603             fscal            = felec;
1604
1605             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1606
1607             /* Update vectorial force */
1608             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1609             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1610             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1611             
1612             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1613             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1614             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1615
1616             /**************************
1617              * CALCULATE INTERACTIONS *
1618              **************************/
1619
1620             /* COULOMB ELECTROSTATICS */
1621             velec            = _mm_mul_pd(qq21,rinv21);
1622             felec            = _mm_mul_pd(velec,rinvsq21);
1623
1624             fscal            = felec;
1625
1626             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1627
1628             /* Update vectorial force */
1629             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1630             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1631             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1632             
1633             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1634             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1635             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1636
1637             /**************************
1638              * CALCULATE INTERACTIONS *
1639              **************************/
1640
1641             /* COULOMB ELECTROSTATICS */
1642             velec            = _mm_mul_pd(qq22,rinv22);
1643             felec            = _mm_mul_pd(velec,rinvsq22);
1644
1645             fscal            = felec;
1646
1647             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1648
1649             /* Update vectorial force */
1650             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1651             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1652             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1653             
1654             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1655             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1656             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1657
1658             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1659
1660             /* Inner loop uses 297 flops */
1661         }
1662
1663         /* End of innermost loop */
1664
1665         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1666                                               f+i_coord_offset,fshift+i_shift_offset);
1667
1668         /* Increment number of inner iterations */
1669         inneriter                  += j_index_end - j_index_start;
1670
1671         /* Outer loop uses 18 flops */
1672     }
1673
1674     /* Increment number of outer iterations */
1675     outeriter        += nri;
1676
1677     /* Update outer/inner flops */
1678
1679     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*297);
1680 }