41e27d444c7113e7a7abbb2016fcc6018dfe8a2b
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_avx_128_fma_double.c
1 /*
2  * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_double
38  * Electrostatics interaction: CubicSplineTable
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water4-Particle
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB;
61     int              j_coord_offsetA,j_coord_offsetB;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwioffset3;
73     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74     int              vdwjidx0A,vdwjidx0B;
75     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
77     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
78     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
79     __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
80     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
81     real             *charge;
82     int              nvdwtype;
83     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
84     int              *vdwtype;
85     real             *vdwparam;
86     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
87     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
88     __m128i          vfitab;
89     __m128i          ifour       = _mm_set1_epi32(4);
90     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
91     real             *vftab;
92     __m128d          dummy_mask,cutoff_mask;
93     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
94     __m128d          one     = _mm_set1_pd(1.0);
95     __m128d          two     = _mm_set1_pd(2.0);
96     x                = xx[0];
97     f                = ff[0];
98
99     nri              = nlist->nri;
100     iinr             = nlist->iinr;
101     jindex           = nlist->jindex;
102     jjnr             = nlist->jjnr;
103     shiftidx         = nlist->shift;
104     gid              = nlist->gid;
105     shiftvec         = fr->shift_vec[0];
106     fshift           = fr->fshift[0];
107     facel            = _mm_set1_pd(fr->epsfac);
108     charge           = mdatoms->chargeA;
109     nvdwtype         = fr->ntype;
110     vdwparam         = fr->nbfp;
111     vdwtype          = mdatoms->typeA;
112
113     vftab            = kernel_data->table_elec->data;
114     vftabscale       = _mm_set1_pd(kernel_data->table_elec->scale);
115
116     /* Setup water-specific parameters */
117     inr              = nlist->iinr[0];
118     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
119     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
120     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
121     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
122
123     /* Avoid stupid compiler warnings */
124     jnrA = jnrB = 0;
125     j_coord_offsetA = 0;
126     j_coord_offsetB = 0;
127
128     outeriter        = 0;
129     inneriter        = 0;
130
131     /* Start outer loop over neighborlists */
132     for(iidx=0; iidx<nri; iidx++)
133     {
134         /* Load shift vector for this list */
135         i_shift_offset   = DIM*shiftidx[iidx];
136
137         /* Load limits for loop over neighbors */
138         j_index_start    = jindex[iidx];
139         j_index_end      = jindex[iidx+1];
140
141         /* Get outer coordinate index */
142         inr              = iinr[iidx];
143         i_coord_offset   = DIM*inr;
144
145         /* Load i particle coords and add shift vector */
146         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
147                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
148
149         fix0             = _mm_setzero_pd();
150         fiy0             = _mm_setzero_pd();
151         fiz0             = _mm_setzero_pd();
152         fix1             = _mm_setzero_pd();
153         fiy1             = _mm_setzero_pd();
154         fiz1             = _mm_setzero_pd();
155         fix2             = _mm_setzero_pd();
156         fiy2             = _mm_setzero_pd();
157         fiz2             = _mm_setzero_pd();
158         fix3             = _mm_setzero_pd();
159         fiy3             = _mm_setzero_pd();
160         fiz3             = _mm_setzero_pd();
161
162         /* Reset potential sums */
163         velecsum         = _mm_setzero_pd();
164         vvdwsum          = _mm_setzero_pd();
165
166         /* Start inner kernel loop */
167         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
168         {
169
170             /* Get j neighbor index, and coordinate index */
171             jnrA             = jjnr[jidx];
172             jnrB             = jjnr[jidx+1];
173             j_coord_offsetA  = DIM*jnrA;
174             j_coord_offsetB  = DIM*jnrB;
175
176             /* load j atom coordinates */
177             gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
178                                               &jx0,&jy0,&jz0);
179
180             /* Calculate displacement vector */
181             dx00             = _mm_sub_pd(ix0,jx0);
182             dy00             = _mm_sub_pd(iy0,jy0);
183             dz00             = _mm_sub_pd(iz0,jz0);
184             dx10             = _mm_sub_pd(ix1,jx0);
185             dy10             = _mm_sub_pd(iy1,jy0);
186             dz10             = _mm_sub_pd(iz1,jz0);
187             dx20             = _mm_sub_pd(ix2,jx0);
188             dy20             = _mm_sub_pd(iy2,jy0);
189             dz20             = _mm_sub_pd(iz2,jz0);
190             dx30             = _mm_sub_pd(ix3,jx0);
191             dy30             = _mm_sub_pd(iy3,jy0);
192             dz30             = _mm_sub_pd(iz3,jz0);
193
194             /* Calculate squared distance and things based on it */
195             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
196             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
197             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
198             rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
199
200             rinv10           = gmx_mm_invsqrt_pd(rsq10);
201             rinv20           = gmx_mm_invsqrt_pd(rsq20);
202             rinv30           = gmx_mm_invsqrt_pd(rsq30);
203
204             rinvsq00         = gmx_mm_inv_pd(rsq00);
205
206             /* Load parameters for j particles */
207             jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
208             vdwjidx0A        = 2*vdwtype[jnrA+0];
209             vdwjidx0B        = 2*vdwtype[jnrB+0];
210
211             fjx0             = _mm_setzero_pd();
212             fjy0             = _mm_setzero_pd();
213             fjz0             = _mm_setzero_pd();
214
215             /**************************
216              * CALCULATE INTERACTIONS *
217              **************************/
218
219             /* Compute parameters for interactions between i and j atoms */
220             gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
221                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
222
223             /* LENNARD-JONES DISPERSION/REPULSION */
224
225             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
226             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
227             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
228             vvdw             = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
229             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
230
231             /* Update potential sum for this i atom from the interaction with this j atom. */
232             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
233
234             fscal            = fvdw;
235
236             /* Update vectorial force */
237             fix0             = _mm_macc_pd(dx00,fscal,fix0);
238             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
239             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
240             
241             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
242             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
243             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
244
245             /**************************
246              * CALCULATE INTERACTIONS *
247              **************************/
248
249             r10              = _mm_mul_pd(rsq10,rinv10);
250
251             /* Compute parameters for interactions between i and j atoms */
252             qq10             = _mm_mul_pd(iq1,jq0);
253
254             /* Calculate table index by multiplying r with table scale and truncate to integer */
255             rt               = _mm_mul_pd(r10,vftabscale);
256             vfitab           = _mm_cvttpd_epi32(rt);
257 #ifdef __XOP__
258             vfeps            = _mm_frcz_pd(rt);
259 #else
260             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
261 #endif
262             twovfeps         = _mm_add_pd(vfeps,vfeps);
263             vfitab           = _mm_slli_epi32(vfitab,2);
264
265             /* CUBIC SPLINE TABLE ELECTROSTATICS */
266             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
267             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
268             GMX_MM_TRANSPOSE2_PD(Y,F);
269             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
270             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
271             GMX_MM_TRANSPOSE2_PD(G,H);
272             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
273             VV               = _mm_macc_pd(vfeps,Fp,Y);
274             velec            = _mm_mul_pd(qq10,VV);
275             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
276             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
277
278             /* Update potential sum for this i atom from the interaction with this j atom. */
279             velecsum         = _mm_add_pd(velecsum,velec);
280
281             fscal            = felec;
282
283             /* Update vectorial force */
284             fix1             = _mm_macc_pd(dx10,fscal,fix1);
285             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
286             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
287             
288             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
289             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
290             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
291
292             /**************************
293              * CALCULATE INTERACTIONS *
294              **************************/
295
296             r20              = _mm_mul_pd(rsq20,rinv20);
297
298             /* Compute parameters for interactions between i and j atoms */
299             qq20             = _mm_mul_pd(iq2,jq0);
300
301             /* Calculate table index by multiplying r with table scale and truncate to integer */
302             rt               = _mm_mul_pd(r20,vftabscale);
303             vfitab           = _mm_cvttpd_epi32(rt);
304 #ifdef __XOP__
305             vfeps            = _mm_frcz_pd(rt);
306 #else
307             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
308 #endif
309             twovfeps         = _mm_add_pd(vfeps,vfeps);
310             vfitab           = _mm_slli_epi32(vfitab,2);
311
312             /* CUBIC SPLINE TABLE ELECTROSTATICS */
313             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
314             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
315             GMX_MM_TRANSPOSE2_PD(Y,F);
316             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
317             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
318             GMX_MM_TRANSPOSE2_PD(G,H);
319             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
320             VV               = _mm_macc_pd(vfeps,Fp,Y);
321             velec            = _mm_mul_pd(qq20,VV);
322             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
323             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
324
325             /* Update potential sum for this i atom from the interaction with this j atom. */
326             velecsum         = _mm_add_pd(velecsum,velec);
327
328             fscal            = felec;
329
330             /* Update vectorial force */
331             fix2             = _mm_macc_pd(dx20,fscal,fix2);
332             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
333             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
334             
335             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
336             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
337             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
338
339             /**************************
340              * CALCULATE INTERACTIONS *
341              **************************/
342
343             r30              = _mm_mul_pd(rsq30,rinv30);
344
345             /* Compute parameters for interactions between i and j atoms */
346             qq30             = _mm_mul_pd(iq3,jq0);
347
348             /* Calculate table index by multiplying r with table scale and truncate to integer */
349             rt               = _mm_mul_pd(r30,vftabscale);
350             vfitab           = _mm_cvttpd_epi32(rt);
351 #ifdef __XOP__
352             vfeps            = _mm_frcz_pd(rt);
353 #else
354             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
355 #endif
356             twovfeps         = _mm_add_pd(vfeps,vfeps);
357             vfitab           = _mm_slli_epi32(vfitab,2);
358
359             /* CUBIC SPLINE TABLE ELECTROSTATICS */
360             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
361             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
362             GMX_MM_TRANSPOSE2_PD(Y,F);
363             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
364             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
365             GMX_MM_TRANSPOSE2_PD(G,H);
366             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
367             VV               = _mm_macc_pd(vfeps,Fp,Y);
368             velec            = _mm_mul_pd(qq30,VV);
369             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
370             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
371
372             /* Update potential sum for this i atom from the interaction with this j atom. */
373             velecsum         = _mm_add_pd(velecsum,velec);
374
375             fscal            = felec;
376
377             /* Update vectorial force */
378             fix3             = _mm_macc_pd(dx30,fscal,fix3);
379             fiy3             = _mm_macc_pd(dy30,fscal,fiy3);
380             fiz3             = _mm_macc_pd(dz30,fscal,fiz3);
381             
382             fjx0             = _mm_macc_pd(dx30,fscal,fjx0);
383             fjy0             = _mm_macc_pd(dy30,fscal,fjy0);
384             fjz0             = _mm_macc_pd(dz30,fscal,fjz0);
385
386             gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
387
388             /* Inner loop uses 176 flops */
389         }
390
391         if(jidx<j_index_end)
392         {
393
394             jnrA             = jjnr[jidx];
395             j_coord_offsetA  = DIM*jnrA;
396
397             /* load j atom coordinates */
398             gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
399                                               &jx0,&jy0,&jz0);
400
401             /* Calculate displacement vector */
402             dx00             = _mm_sub_pd(ix0,jx0);
403             dy00             = _mm_sub_pd(iy0,jy0);
404             dz00             = _mm_sub_pd(iz0,jz0);
405             dx10             = _mm_sub_pd(ix1,jx0);
406             dy10             = _mm_sub_pd(iy1,jy0);
407             dz10             = _mm_sub_pd(iz1,jz0);
408             dx20             = _mm_sub_pd(ix2,jx0);
409             dy20             = _mm_sub_pd(iy2,jy0);
410             dz20             = _mm_sub_pd(iz2,jz0);
411             dx30             = _mm_sub_pd(ix3,jx0);
412             dy30             = _mm_sub_pd(iy3,jy0);
413             dz30             = _mm_sub_pd(iz3,jz0);
414
415             /* Calculate squared distance and things based on it */
416             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
417             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
418             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
419             rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
420
421             rinv10           = gmx_mm_invsqrt_pd(rsq10);
422             rinv20           = gmx_mm_invsqrt_pd(rsq20);
423             rinv30           = gmx_mm_invsqrt_pd(rsq30);
424
425             rinvsq00         = gmx_mm_inv_pd(rsq00);
426
427             /* Load parameters for j particles */
428             jq0              = _mm_load_sd(charge+jnrA+0);
429             vdwjidx0A        = 2*vdwtype[jnrA+0];
430
431             fjx0             = _mm_setzero_pd();
432             fjy0             = _mm_setzero_pd();
433             fjz0             = _mm_setzero_pd();
434
435             /**************************
436              * CALCULATE INTERACTIONS *
437              **************************/
438
439             /* Compute parameters for interactions between i and j atoms */
440             gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
441
442             /* LENNARD-JONES DISPERSION/REPULSION */
443
444             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
445             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
446             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
447             vvdw             = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
448             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
449
450             /* Update potential sum for this i atom from the interaction with this j atom. */
451             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
452             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
453
454             fscal            = fvdw;
455
456             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
457
458             /* Update vectorial force */
459             fix0             = _mm_macc_pd(dx00,fscal,fix0);
460             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
461             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
462             
463             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
464             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
465             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
466
467             /**************************
468              * CALCULATE INTERACTIONS *
469              **************************/
470
471             r10              = _mm_mul_pd(rsq10,rinv10);
472
473             /* Compute parameters for interactions between i and j atoms */
474             qq10             = _mm_mul_pd(iq1,jq0);
475
476             /* Calculate table index by multiplying r with table scale and truncate to integer */
477             rt               = _mm_mul_pd(r10,vftabscale);
478             vfitab           = _mm_cvttpd_epi32(rt);
479 #ifdef __XOP__
480             vfeps            = _mm_frcz_pd(rt);
481 #else
482             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
483 #endif
484             twovfeps         = _mm_add_pd(vfeps,vfeps);
485             vfitab           = _mm_slli_epi32(vfitab,2);
486
487             /* CUBIC SPLINE TABLE ELECTROSTATICS */
488             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
489             F                = _mm_setzero_pd();
490             GMX_MM_TRANSPOSE2_PD(Y,F);
491             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
492             H                = _mm_setzero_pd();
493             GMX_MM_TRANSPOSE2_PD(G,H);
494             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
495             VV               = _mm_macc_pd(vfeps,Fp,Y);
496             velec            = _mm_mul_pd(qq10,VV);
497             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
498             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
499
500             /* Update potential sum for this i atom from the interaction with this j atom. */
501             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
502             velecsum         = _mm_add_pd(velecsum,velec);
503
504             fscal            = felec;
505
506             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
507
508             /* Update vectorial force */
509             fix1             = _mm_macc_pd(dx10,fscal,fix1);
510             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
511             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
512             
513             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
514             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
515             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
516
517             /**************************
518              * CALCULATE INTERACTIONS *
519              **************************/
520
521             r20              = _mm_mul_pd(rsq20,rinv20);
522
523             /* Compute parameters for interactions between i and j atoms */
524             qq20             = _mm_mul_pd(iq2,jq0);
525
526             /* Calculate table index by multiplying r with table scale and truncate to integer */
527             rt               = _mm_mul_pd(r20,vftabscale);
528             vfitab           = _mm_cvttpd_epi32(rt);
529 #ifdef __XOP__
530             vfeps            = _mm_frcz_pd(rt);
531 #else
532             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
533 #endif
534             twovfeps         = _mm_add_pd(vfeps,vfeps);
535             vfitab           = _mm_slli_epi32(vfitab,2);
536
537             /* CUBIC SPLINE TABLE ELECTROSTATICS */
538             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
539             F                = _mm_setzero_pd();
540             GMX_MM_TRANSPOSE2_PD(Y,F);
541             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
542             H                = _mm_setzero_pd();
543             GMX_MM_TRANSPOSE2_PD(G,H);
544             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
545             VV               = _mm_macc_pd(vfeps,Fp,Y);
546             velec            = _mm_mul_pd(qq20,VV);
547             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
548             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
549
550             /* Update potential sum for this i atom from the interaction with this j atom. */
551             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
552             velecsum         = _mm_add_pd(velecsum,velec);
553
554             fscal            = felec;
555
556             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
557
558             /* Update vectorial force */
559             fix2             = _mm_macc_pd(dx20,fscal,fix2);
560             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
561             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
562             
563             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
564             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
565             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
566
567             /**************************
568              * CALCULATE INTERACTIONS *
569              **************************/
570
571             r30              = _mm_mul_pd(rsq30,rinv30);
572
573             /* Compute parameters for interactions between i and j atoms */
574             qq30             = _mm_mul_pd(iq3,jq0);
575
576             /* Calculate table index by multiplying r with table scale and truncate to integer */
577             rt               = _mm_mul_pd(r30,vftabscale);
578             vfitab           = _mm_cvttpd_epi32(rt);
579 #ifdef __XOP__
580             vfeps            = _mm_frcz_pd(rt);
581 #else
582             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
583 #endif
584             twovfeps         = _mm_add_pd(vfeps,vfeps);
585             vfitab           = _mm_slli_epi32(vfitab,2);
586
587             /* CUBIC SPLINE TABLE ELECTROSTATICS */
588             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
589             F                = _mm_setzero_pd();
590             GMX_MM_TRANSPOSE2_PD(Y,F);
591             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
592             H                = _mm_setzero_pd();
593             GMX_MM_TRANSPOSE2_PD(G,H);
594             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
595             VV               = _mm_macc_pd(vfeps,Fp,Y);
596             velec            = _mm_mul_pd(qq30,VV);
597             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
598             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
599
600             /* Update potential sum for this i atom from the interaction with this j atom. */
601             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
602             velecsum         = _mm_add_pd(velecsum,velec);
603
604             fscal            = felec;
605
606             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
607
608             /* Update vectorial force */
609             fix3             = _mm_macc_pd(dx30,fscal,fix3);
610             fiy3             = _mm_macc_pd(dy30,fscal,fiy3);
611             fiz3             = _mm_macc_pd(dz30,fscal,fiz3);
612             
613             fjx0             = _mm_macc_pd(dx30,fscal,fjx0);
614             fjy0             = _mm_macc_pd(dy30,fscal,fjy0);
615             fjz0             = _mm_macc_pd(dz30,fscal,fjz0);
616
617             gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
618
619             /* Inner loop uses 176 flops */
620         }
621
622         /* End of innermost loop */
623
624         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
625                                               f+i_coord_offset,fshift+i_shift_offset);
626
627         ggid                        = gid[iidx];
628         /* Update potential energies */
629         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
630         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
631
632         /* Increment number of inner iterations */
633         inneriter                  += j_index_end - j_index_start;
634
635         /* Outer loop uses 26 flops */
636     }
637
638     /* Increment number of outer iterations */
639     outeriter        += nri;
640
641     /* Update outer/inner flops */
642
643     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*176);
644 }
645 /*
646  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_double
647  * Electrostatics interaction: CubicSplineTable
648  * VdW interaction:            LennardJones
649  * Geometry:                   Water4-Particle
650  * Calculate force/pot:        Force
651  */
652 void
653 nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_double
654                     (t_nblist * gmx_restrict                nlist,
655                      rvec * gmx_restrict                    xx,
656                      rvec * gmx_restrict                    ff,
657                      t_forcerec * gmx_restrict              fr,
658                      t_mdatoms * gmx_restrict               mdatoms,
659                      nb_kernel_data_t * gmx_restrict        kernel_data,
660                      t_nrnb * gmx_restrict                  nrnb)
661 {
662     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
663      * just 0 for non-waters.
664      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
665      * jnr indices corresponding to data put in the four positions in the SIMD register.
666      */
667     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
668     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
669     int              jnrA,jnrB;
670     int              j_coord_offsetA,j_coord_offsetB;
671     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
672     real             rcutoff_scalar;
673     real             *shiftvec,*fshift,*x,*f;
674     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
675     int              vdwioffset0;
676     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
677     int              vdwioffset1;
678     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
679     int              vdwioffset2;
680     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
681     int              vdwioffset3;
682     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
683     int              vdwjidx0A,vdwjidx0B;
684     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
685     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
686     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
687     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
688     __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
689     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
690     real             *charge;
691     int              nvdwtype;
692     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
693     int              *vdwtype;
694     real             *vdwparam;
695     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
696     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
697     __m128i          vfitab;
698     __m128i          ifour       = _mm_set1_epi32(4);
699     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
700     real             *vftab;
701     __m128d          dummy_mask,cutoff_mask;
702     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
703     __m128d          one     = _mm_set1_pd(1.0);
704     __m128d          two     = _mm_set1_pd(2.0);
705     x                = xx[0];
706     f                = ff[0];
707
708     nri              = nlist->nri;
709     iinr             = nlist->iinr;
710     jindex           = nlist->jindex;
711     jjnr             = nlist->jjnr;
712     shiftidx         = nlist->shift;
713     gid              = nlist->gid;
714     shiftvec         = fr->shift_vec[0];
715     fshift           = fr->fshift[0];
716     facel            = _mm_set1_pd(fr->epsfac);
717     charge           = mdatoms->chargeA;
718     nvdwtype         = fr->ntype;
719     vdwparam         = fr->nbfp;
720     vdwtype          = mdatoms->typeA;
721
722     vftab            = kernel_data->table_elec->data;
723     vftabscale       = _mm_set1_pd(kernel_data->table_elec->scale);
724
725     /* Setup water-specific parameters */
726     inr              = nlist->iinr[0];
727     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
728     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
729     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
730     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
731
732     /* Avoid stupid compiler warnings */
733     jnrA = jnrB = 0;
734     j_coord_offsetA = 0;
735     j_coord_offsetB = 0;
736
737     outeriter        = 0;
738     inneriter        = 0;
739
740     /* Start outer loop over neighborlists */
741     for(iidx=0; iidx<nri; iidx++)
742     {
743         /* Load shift vector for this list */
744         i_shift_offset   = DIM*shiftidx[iidx];
745
746         /* Load limits for loop over neighbors */
747         j_index_start    = jindex[iidx];
748         j_index_end      = jindex[iidx+1];
749
750         /* Get outer coordinate index */
751         inr              = iinr[iidx];
752         i_coord_offset   = DIM*inr;
753
754         /* Load i particle coords and add shift vector */
755         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
756                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
757
758         fix0             = _mm_setzero_pd();
759         fiy0             = _mm_setzero_pd();
760         fiz0             = _mm_setzero_pd();
761         fix1             = _mm_setzero_pd();
762         fiy1             = _mm_setzero_pd();
763         fiz1             = _mm_setzero_pd();
764         fix2             = _mm_setzero_pd();
765         fiy2             = _mm_setzero_pd();
766         fiz2             = _mm_setzero_pd();
767         fix3             = _mm_setzero_pd();
768         fiy3             = _mm_setzero_pd();
769         fiz3             = _mm_setzero_pd();
770
771         /* Start inner kernel loop */
772         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
773         {
774
775             /* Get j neighbor index, and coordinate index */
776             jnrA             = jjnr[jidx];
777             jnrB             = jjnr[jidx+1];
778             j_coord_offsetA  = DIM*jnrA;
779             j_coord_offsetB  = DIM*jnrB;
780
781             /* load j atom coordinates */
782             gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
783                                               &jx0,&jy0,&jz0);
784
785             /* Calculate displacement vector */
786             dx00             = _mm_sub_pd(ix0,jx0);
787             dy00             = _mm_sub_pd(iy0,jy0);
788             dz00             = _mm_sub_pd(iz0,jz0);
789             dx10             = _mm_sub_pd(ix1,jx0);
790             dy10             = _mm_sub_pd(iy1,jy0);
791             dz10             = _mm_sub_pd(iz1,jz0);
792             dx20             = _mm_sub_pd(ix2,jx0);
793             dy20             = _mm_sub_pd(iy2,jy0);
794             dz20             = _mm_sub_pd(iz2,jz0);
795             dx30             = _mm_sub_pd(ix3,jx0);
796             dy30             = _mm_sub_pd(iy3,jy0);
797             dz30             = _mm_sub_pd(iz3,jz0);
798
799             /* Calculate squared distance and things based on it */
800             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
801             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
802             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
803             rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
804
805             rinv10           = gmx_mm_invsqrt_pd(rsq10);
806             rinv20           = gmx_mm_invsqrt_pd(rsq20);
807             rinv30           = gmx_mm_invsqrt_pd(rsq30);
808
809             rinvsq00         = gmx_mm_inv_pd(rsq00);
810
811             /* Load parameters for j particles */
812             jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
813             vdwjidx0A        = 2*vdwtype[jnrA+0];
814             vdwjidx0B        = 2*vdwtype[jnrB+0];
815
816             fjx0             = _mm_setzero_pd();
817             fjy0             = _mm_setzero_pd();
818             fjz0             = _mm_setzero_pd();
819
820             /**************************
821              * CALCULATE INTERACTIONS *
822              **************************/
823
824             /* Compute parameters for interactions between i and j atoms */
825             gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
826                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
827
828             /* LENNARD-JONES DISPERSION/REPULSION */
829
830             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
831             fvdw             = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
832
833             fscal            = fvdw;
834
835             /* Update vectorial force */
836             fix0             = _mm_macc_pd(dx00,fscal,fix0);
837             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
838             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
839             
840             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
841             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
842             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
843
844             /**************************
845              * CALCULATE INTERACTIONS *
846              **************************/
847
848             r10              = _mm_mul_pd(rsq10,rinv10);
849
850             /* Compute parameters for interactions between i and j atoms */
851             qq10             = _mm_mul_pd(iq1,jq0);
852
853             /* Calculate table index by multiplying r with table scale and truncate to integer */
854             rt               = _mm_mul_pd(r10,vftabscale);
855             vfitab           = _mm_cvttpd_epi32(rt);
856 #ifdef __XOP__
857             vfeps            = _mm_frcz_pd(rt);
858 #else
859             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
860 #endif
861             twovfeps         = _mm_add_pd(vfeps,vfeps);
862             vfitab           = _mm_slli_epi32(vfitab,2);
863
864             /* CUBIC SPLINE TABLE ELECTROSTATICS */
865             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
866             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
867             GMX_MM_TRANSPOSE2_PD(Y,F);
868             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
869             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
870             GMX_MM_TRANSPOSE2_PD(G,H);
871             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
872             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
873             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
874
875             fscal            = felec;
876
877             /* Update vectorial force */
878             fix1             = _mm_macc_pd(dx10,fscal,fix1);
879             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
880             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
881             
882             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
883             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
884             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
885
886             /**************************
887              * CALCULATE INTERACTIONS *
888              **************************/
889
890             r20              = _mm_mul_pd(rsq20,rinv20);
891
892             /* Compute parameters for interactions between i and j atoms */
893             qq20             = _mm_mul_pd(iq2,jq0);
894
895             /* Calculate table index by multiplying r with table scale and truncate to integer */
896             rt               = _mm_mul_pd(r20,vftabscale);
897             vfitab           = _mm_cvttpd_epi32(rt);
898 #ifdef __XOP__
899             vfeps            = _mm_frcz_pd(rt);
900 #else
901             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
902 #endif
903             twovfeps         = _mm_add_pd(vfeps,vfeps);
904             vfitab           = _mm_slli_epi32(vfitab,2);
905
906             /* CUBIC SPLINE TABLE ELECTROSTATICS */
907             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
908             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
909             GMX_MM_TRANSPOSE2_PD(Y,F);
910             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
911             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
912             GMX_MM_TRANSPOSE2_PD(G,H);
913             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
914             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
915             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
916
917             fscal            = felec;
918
919             /* Update vectorial force */
920             fix2             = _mm_macc_pd(dx20,fscal,fix2);
921             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
922             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
923             
924             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
925             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
926             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
927
928             /**************************
929              * CALCULATE INTERACTIONS *
930              **************************/
931
932             r30              = _mm_mul_pd(rsq30,rinv30);
933
934             /* Compute parameters for interactions between i and j atoms */
935             qq30             = _mm_mul_pd(iq3,jq0);
936
937             /* Calculate table index by multiplying r with table scale and truncate to integer */
938             rt               = _mm_mul_pd(r30,vftabscale);
939             vfitab           = _mm_cvttpd_epi32(rt);
940 #ifdef __XOP__
941             vfeps            = _mm_frcz_pd(rt);
942 #else
943             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
944 #endif
945             twovfeps         = _mm_add_pd(vfeps,vfeps);
946             vfitab           = _mm_slli_epi32(vfitab,2);
947
948             /* CUBIC SPLINE TABLE ELECTROSTATICS */
949             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
950             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
951             GMX_MM_TRANSPOSE2_PD(Y,F);
952             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
953             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
954             GMX_MM_TRANSPOSE2_PD(G,H);
955             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
956             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
957             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
958
959             fscal            = felec;
960
961             /* Update vectorial force */
962             fix3             = _mm_macc_pd(dx30,fscal,fix3);
963             fiy3             = _mm_macc_pd(dy30,fscal,fiy3);
964             fiz3             = _mm_macc_pd(dz30,fscal,fiz3);
965             
966             fjx0             = _mm_macc_pd(dx30,fscal,fjx0);
967             fjy0             = _mm_macc_pd(dy30,fscal,fjy0);
968             fjz0             = _mm_macc_pd(dz30,fscal,fjz0);
969
970             gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
971
972             /* Inner loop uses 159 flops */
973         }
974
975         if(jidx<j_index_end)
976         {
977
978             jnrA             = jjnr[jidx];
979             j_coord_offsetA  = DIM*jnrA;
980
981             /* load j atom coordinates */
982             gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
983                                               &jx0,&jy0,&jz0);
984
985             /* Calculate displacement vector */
986             dx00             = _mm_sub_pd(ix0,jx0);
987             dy00             = _mm_sub_pd(iy0,jy0);
988             dz00             = _mm_sub_pd(iz0,jz0);
989             dx10             = _mm_sub_pd(ix1,jx0);
990             dy10             = _mm_sub_pd(iy1,jy0);
991             dz10             = _mm_sub_pd(iz1,jz0);
992             dx20             = _mm_sub_pd(ix2,jx0);
993             dy20             = _mm_sub_pd(iy2,jy0);
994             dz20             = _mm_sub_pd(iz2,jz0);
995             dx30             = _mm_sub_pd(ix3,jx0);
996             dy30             = _mm_sub_pd(iy3,jy0);
997             dz30             = _mm_sub_pd(iz3,jz0);
998
999             /* Calculate squared distance and things based on it */
1000             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1001             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1002             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1003             rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
1004
1005             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1006             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1007             rinv30           = gmx_mm_invsqrt_pd(rsq30);
1008
1009             rinvsq00         = gmx_mm_inv_pd(rsq00);
1010
1011             /* Load parameters for j particles */
1012             jq0              = _mm_load_sd(charge+jnrA+0);
1013             vdwjidx0A        = 2*vdwtype[jnrA+0];
1014
1015             fjx0             = _mm_setzero_pd();
1016             fjy0             = _mm_setzero_pd();
1017             fjz0             = _mm_setzero_pd();
1018
1019             /**************************
1020              * CALCULATE INTERACTIONS *
1021              **************************/
1022
1023             /* Compute parameters for interactions between i and j atoms */
1024             gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
1025
1026             /* LENNARD-JONES DISPERSION/REPULSION */
1027
1028             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1029             fvdw             = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1030
1031             fscal            = fvdw;
1032
1033             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1034
1035             /* Update vectorial force */
1036             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1037             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1038             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1039             
1040             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1041             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1042             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1043
1044             /**************************
1045              * CALCULATE INTERACTIONS *
1046              **************************/
1047
1048             r10              = _mm_mul_pd(rsq10,rinv10);
1049
1050             /* Compute parameters for interactions between i and j atoms */
1051             qq10             = _mm_mul_pd(iq1,jq0);
1052
1053             /* Calculate table index by multiplying r with table scale and truncate to integer */
1054             rt               = _mm_mul_pd(r10,vftabscale);
1055             vfitab           = _mm_cvttpd_epi32(rt);
1056 #ifdef __XOP__
1057             vfeps            = _mm_frcz_pd(rt);
1058 #else
1059             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1060 #endif
1061             twovfeps         = _mm_add_pd(vfeps,vfeps);
1062             vfitab           = _mm_slli_epi32(vfitab,2);
1063
1064             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1065             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1066             F                = _mm_setzero_pd();
1067             GMX_MM_TRANSPOSE2_PD(Y,F);
1068             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1069             H                = _mm_setzero_pd();
1070             GMX_MM_TRANSPOSE2_PD(G,H);
1071             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1072             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1073             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1074
1075             fscal            = felec;
1076
1077             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1078
1079             /* Update vectorial force */
1080             fix1             = _mm_macc_pd(dx10,fscal,fix1);
1081             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
1082             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
1083             
1084             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
1085             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
1086             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
1087
1088             /**************************
1089              * CALCULATE INTERACTIONS *
1090              **************************/
1091
1092             r20              = _mm_mul_pd(rsq20,rinv20);
1093
1094             /* Compute parameters for interactions between i and j atoms */
1095             qq20             = _mm_mul_pd(iq2,jq0);
1096
1097             /* Calculate table index by multiplying r with table scale and truncate to integer */
1098             rt               = _mm_mul_pd(r20,vftabscale);
1099             vfitab           = _mm_cvttpd_epi32(rt);
1100 #ifdef __XOP__
1101             vfeps            = _mm_frcz_pd(rt);
1102 #else
1103             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1104 #endif
1105             twovfeps         = _mm_add_pd(vfeps,vfeps);
1106             vfitab           = _mm_slli_epi32(vfitab,2);
1107
1108             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1109             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1110             F                = _mm_setzero_pd();
1111             GMX_MM_TRANSPOSE2_PD(Y,F);
1112             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1113             H                = _mm_setzero_pd();
1114             GMX_MM_TRANSPOSE2_PD(G,H);
1115             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1116             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1117             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1118
1119             fscal            = felec;
1120
1121             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1122
1123             /* Update vectorial force */
1124             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1125             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1126             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1127             
1128             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1129             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1130             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1131
1132             /**************************
1133              * CALCULATE INTERACTIONS *
1134              **************************/
1135
1136             r30              = _mm_mul_pd(rsq30,rinv30);
1137
1138             /* Compute parameters for interactions between i and j atoms */
1139             qq30             = _mm_mul_pd(iq3,jq0);
1140
1141             /* Calculate table index by multiplying r with table scale and truncate to integer */
1142             rt               = _mm_mul_pd(r30,vftabscale);
1143             vfitab           = _mm_cvttpd_epi32(rt);
1144 #ifdef __XOP__
1145             vfeps            = _mm_frcz_pd(rt);
1146 #else
1147             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1148 #endif
1149             twovfeps         = _mm_add_pd(vfeps,vfeps);
1150             vfitab           = _mm_slli_epi32(vfitab,2);
1151
1152             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1153             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1154             F                = _mm_setzero_pd();
1155             GMX_MM_TRANSPOSE2_PD(Y,F);
1156             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1157             H                = _mm_setzero_pd();
1158             GMX_MM_TRANSPOSE2_PD(G,H);
1159             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1160             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1161             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
1162
1163             fscal            = felec;
1164
1165             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1166
1167             /* Update vectorial force */
1168             fix3             = _mm_macc_pd(dx30,fscal,fix3);
1169             fiy3             = _mm_macc_pd(dy30,fscal,fiy3);
1170             fiz3             = _mm_macc_pd(dz30,fscal,fiz3);
1171             
1172             fjx0             = _mm_macc_pd(dx30,fscal,fjx0);
1173             fjy0             = _mm_macc_pd(dy30,fscal,fjy0);
1174             fjz0             = _mm_macc_pd(dz30,fscal,fjz0);
1175
1176             gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1177
1178             /* Inner loop uses 159 flops */
1179         }
1180
1181         /* End of innermost loop */
1182
1183         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1184                                               f+i_coord_offset,fshift+i_shift_offset);
1185
1186         /* Increment number of inner iterations */
1187         inneriter                  += j_index_end - j_index_start;
1188
1189         /* Outer loop uses 24 flops */
1190     }
1191
1192     /* Increment number of outer iterations */
1193     outeriter        += nri;
1194
1195     /* Update outer/inner flops */
1196
1197     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*159);
1198 }