576abdd9eeed4ee6eb033fb30c8b0c61a58e440d
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_avx_256_double.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_double
38  * Electrostatics interaction: CubicSplineTable
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to the j-loop unrolling done with AVX, i.e. the four different
56      * jnr indices whose data is placed in the four positions of the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
65     real             rcutoff_scalar;
66     real             *shiftvec,*fshift,*x,*f;
67     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68     real             scratch[4*DIM];
69     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70     real *           vdwioffsetptr0;
71     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72     real *           vdwioffsetptr1;
73     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74     real *           vdwioffsetptr2;
75     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
77     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
78     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
79     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
81     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
84     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
85     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
86     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
89     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
92     real             *charge;
93     int              nvdwtype;
94     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
95     int              *vdwtype;
96     real             *vdwparam;
97     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
98     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
99     __m128i          vfitab;
100     __m128i          ifour       = _mm_set1_epi32(4);
101     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
102     real             *vftab;
103     __m256d          dummy_mask,cutoff_mask;
104     __m128           tmpmask0,tmpmask1;
105     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
106     __m256d          one     = _mm256_set1_pd(1.0);
107     __m256d          two     = _mm256_set1_pd(2.0);
108     x                = xx[0];
109     f                = ff[0];
110
111     nri              = nlist->nri;
112     iinr             = nlist->iinr;
113     jindex           = nlist->jindex;
114     jjnr             = nlist->jjnr;
115     shiftidx         = nlist->shift;
116     gid              = nlist->gid;
117     shiftvec         = fr->shift_vec[0];
118     fshift           = fr->fshift[0];
119     facel            = _mm256_set1_pd(fr->epsfac);
120     charge           = mdatoms->chargeA;
121     nvdwtype         = fr->ntype;
122     vdwparam         = fr->nbfp;
123     vdwtype          = mdatoms->typeA;
124
125     vftab            = kernel_data->table_elec->data;
126     vftabscale       = _mm256_set1_pd(kernel_data->table_elec->scale);
127
128     /* Setup water-specific parameters */
129     inr              = nlist->iinr[0];
130     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
131     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
132     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
133     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
134
135     jq0              = _mm256_set1_pd(charge[inr+0]);
136     jq1              = _mm256_set1_pd(charge[inr+1]);
137     jq2              = _mm256_set1_pd(charge[inr+2]);
138     vdwjidx0A        = 2*vdwtype[inr+0];
139     qq00             = _mm256_mul_pd(iq0,jq0);
140     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
141     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
142     qq01             = _mm256_mul_pd(iq0,jq1);
143     qq02             = _mm256_mul_pd(iq0,jq2);
144     qq10             = _mm256_mul_pd(iq1,jq0);
145     qq11             = _mm256_mul_pd(iq1,jq1);
146     qq12             = _mm256_mul_pd(iq1,jq2);
147     qq20             = _mm256_mul_pd(iq2,jq0);
148     qq21             = _mm256_mul_pd(iq2,jq1);
149     qq22             = _mm256_mul_pd(iq2,jq2);
150
151     /* Avoid stupid compiler warnings */
152     jnrA = jnrB = jnrC = jnrD = 0;
153     j_coord_offsetA = 0;
154     j_coord_offsetB = 0;
155     j_coord_offsetC = 0;
156     j_coord_offsetD = 0;
157
158     outeriter        = 0;
159     inneriter        = 0;
160
161     for(iidx=0;iidx<4*DIM;iidx++)
162     {
163         scratch[iidx] = 0.0;
164     }
165
166     /* Start outer loop over neighborlists */
167     for(iidx=0; iidx<nri; iidx++)
168     {
169         /* Load shift vector for this list */
170         i_shift_offset   = DIM*shiftidx[iidx];
171
172         /* Load limits for loop over neighbors */
173         j_index_start    = jindex[iidx];
174         j_index_end      = jindex[iidx+1];
175
176         /* Get outer coordinate index */
177         inr              = iinr[iidx];
178         i_coord_offset   = DIM*inr;
179
180         /* Load i particle coords and add shift vector */
181         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
182                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
183
184         fix0             = _mm256_setzero_pd();
185         fiy0             = _mm256_setzero_pd();
186         fiz0             = _mm256_setzero_pd();
187         fix1             = _mm256_setzero_pd();
188         fiy1             = _mm256_setzero_pd();
189         fiz1             = _mm256_setzero_pd();
190         fix2             = _mm256_setzero_pd();
191         fiy2             = _mm256_setzero_pd();
192         fiz2             = _mm256_setzero_pd();
193
194         /* Reset potential sums */
195         velecsum         = _mm256_setzero_pd();
196         vvdwsum          = _mm256_setzero_pd();
197
198         /* Start inner kernel loop */
199         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
200         {
201
202             /* Get j neighbor index, and coordinate index */
203             jnrA             = jjnr[jidx];
204             jnrB             = jjnr[jidx+1];
205             jnrC             = jjnr[jidx+2];
206             jnrD             = jjnr[jidx+3];
207             j_coord_offsetA  = DIM*jnrA;
208             j_coord_offsetB  = DIM*jnrB;
209             j_coord_offsetC  = DIM*jnrC;
210             j_coord_offsetD  = DIM*jnrD;
211
212             /* load j atom coordinates */
213             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
214                                                  x+j_coord_offsetC,x+j_coord_offsetD,
215                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
216
217             /* Calculate displacement vector */
218             dx00             = _mm256_sub_pd(ix0,jx0);
219             dy00             = _mm256_sub_pd(iy0,jy0);
220             dz00             = _mm256_sub_pd(iz0,jz0);
221             dx01             = _mm256_sub_pd(ix0,jx1);
222             dy01             = _mm256_sub_pd(iy0,jy1);
223             dz01             = _mm256_sub_pd(iz0,jz1);
224             dx02             = _mm256_sub_pd(ix0,jx2);
225             dy02             = _mm256_sub_pd(iy0,jy2);
226             dz02             = _mm256_sub_pd(iz0,jz2);
227             dx10             = _mm256_sub_pd(ix1,jx0);
228             dy10             = _mm256_sub_pd(iy1,jy0);
229             dz10             = _mm256_sub_pd(iz1,jz0);
230             dx11             = _mm256_sub_pd(ix1,jx1);
231             dy11             = _mm256_sub_pd(iy1,jy1);
232             dz11             = _mm256_sub_pd(iz1,jz1);
233             dx12             = _mm256_sub_pd(ix1,jx2);
234             dy12             = _mm256_sub_pd(iy1,jy2);
235             dz12             = _mm256_sub_pd(iz1,jz2);
236             dx20             = _mm256_sub_pd(ix2,jx0);
237             dy20             = _mm256_sub_pd(iy2,jy0);
238             dz20             = _mm256_sub_pd(iz2,jz0);
239             dx21             = _mm256_sub_pd(ix2,jx1);
240             dy21             = _mm256_sub_pd(iy2,jy1);
241             dz21             = _mm256_sub_pd(iz2,jz1);
242             dx22             = _mm256_sub_pd(ix2,jx2);
243             dy22             = _mm256_sub_pd(iy2,jy2);
244             dz22             = _mm256_sub_pd(iz2,jz2);
245
246             /* Calculate squared distance and things based on it */
247             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
248             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
249             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
250             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
251             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
252             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
253             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
254             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
255             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
256
257             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
258             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
259             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
260             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
261             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
262             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
263             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
264             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
265             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
266
267             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
268
269             fjx0             = _mm256_setzero_pd();
270             fjy0             = _mm256_setzero_pd();
271             fjz0             = _mm256_setzero_pd();
272             fjx1             = _mm256_setzero_pd();
273             fjy1             = _mm256_setzero_pd();
274             fjz1             = _mm256_setzero_pd();
275             fjx2             = _mm256_setzero_pd();
276             fjy2             = _mm256_setzero_pd();
277             fjz2             = _mm256_setzero_pd();
278
279             /**************************
280              * CALCULATE INTERACTIONS *
281              **************************/
282
283             r00              = _mm256_mul_pd(rsq00,rinv00);
284
285             /* Calculate table index by multiplying r with table scale and truncate to integer */
286             rt               = _mm256_mul_pd(r00,vftabscale);
287             vfitab           = _mm256_cvttpd_epi32(rt);
288             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
289             vfitab           = _mm_slli_epi32(vfitab,2);
290
291             /* CUBIC SPLINE TABLE ELECTROSTATICS */
292             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
293             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
294             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
295             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
296             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
297             Heps             = _mm256_mul_pd(vfeps,H);
298             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
299             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
300             velec            = _mm256_mul_pd(qq00,VV);
301             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
302             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
303
304             /* LENNARD-JONES DISPERSION/REPULSION */
305
306             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
307             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
308             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
309             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
310             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
311
312             /* Update potential sum for this i atom from the interaction with this j atom. */
313             velecsum         = _mm256_add_pd(velecsum,velec);
314             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
315
316             fscal            = _mm256_add_pd(felec,fvdw);
317
318             /* Calculate temporary vectorial force */
319             tx               = _mm256_mul_pd(fscal,dx00);
320             ty               = _mm256_mul_pd(fscal,dy00);
321             tz               = _mm256_mul_pd(fscal,dz00);
322
323             /* Update vectorial force */
324             fix0             = _mm256_add_pd(fix0,tx);
325             fiy0             = _mm256_add_pd(fiy0,ty);
326             fiz0             = _mm256_add_pd(fiz0,tz);
327
328             fjx0             = _mm256_add_pd(fjx0,tx);
329             fjy0             = _mm256_add_pd(fjy0,ty);
330             fjz0             = _mm256_add_pd(fjz0,tz);
331
332             /**************************
333              * CALCULATE INTERACTIONS *
334              **************************/
335
336             r01              = _mm256_mul_pd(rsq01,rinv01);
337
338             /* Calculate table index by multiplying r with table scale and truncate to integer */
339             rt               = _mm256_mul_pd(r01,vftabscale);
340             vfitab           = _mm256_cvttpd_epi32(rt);
341             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
342             vfitab           = _mm_slli_epi32(vfitab,2);
343
344             /* CUBIC SPLINE TABLE ELECTROSTATICS */
345             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
346             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
347             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
348             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
349             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
350             Heps             = _mm256_mul_pd(vfeps,H);
351             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
352             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
353             velec            = _mm256_mul_pd(qq01,VV);
354             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
355             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
356
357             /* Update potential sum for this i atom from the interaction with this j atom. */
358             velecsum         = _mm256_add_pd(velecsum,velec);
359
360             fscal            = felec;
361
362             /* Calculate temporary vectorial force */
363             tx               = _mm256_mul_pd(fscal,dx01);
364             ty               = _mm256_mul_pd(fscal,dy01);
365             tz               = _mm256_mul_pd(fscal,dz01);
366
367             /* Update vectorial force */
368             fix0             = _mm256_add_pd(fix0,tx);
369             fiy0             = _mm256_add_pd(fiy0,ty);
370             fiz0             = _mm256_add_pd(fiz0,tz);
371
372             fjx1             = _mm256_add_pd(fjx1,tx);
373             fjy1             = _mm256_add_pd(fjy1,ty);
374             fjz1             = _mm256_add_pd(fjz1,tz);
375
376             /**************************
377              * CALCULATE INTERACTIONS *
378              **************************/
379
380             r02              = _mm256_mul_pd(rsq02,rinv02);
381
382             /* Calculate table index by multiplying r with table scale and truncate to integer */
383             rt               = _mm256_mul_pd(r02,vftabscale);
384             vfitab           = _mm256_cvttpd_epi32(rt);
385             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
386             vfitab           = _mm_slli_epi32(vfitab,2);
387
388             /* CUBIC SPLINE TABLE ELECTROSTATICS */
389             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
390             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
391             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
392             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
393             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
394             Heps             = _mm256_mul_pd(vfeps,H);
395             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
396             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
397             velec            = _mm256_mul_pd(qq02,VV);
398             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
399             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
400
401             /* Update potential sum for this i atom from the interaction with this j atom. */
402             velecsum         = _mm256_add_pd(velecsum,velec);
403
404             fscal            = felec;
405
406             /* Calculate temporary vectorial force */
407             tx               = _mm256_mul_pd(fscal,dx02);
408             ty               = _mm256_mul_pd(fscal,dy02);
409             tz               = _mm256_mul_pd(fscal,dz02);
410
411             /* Update vectorial force */
412             fix0             = _mm256_add_pd(fix0,tx);
413             fiy0             = _mm256_add_pd(fiy0,ty);
414             fiz0             = _mm256_add_pd(fiz0,tz);
415
416             fjx2             = _mm256_add_pd(fjx2,tx);
417             fjy2             = _mm256_add_pd(fjy2,ty);
418             fjz2             = _mm256_add_pd(fjz2,tz);
419
420             /**************************
421              * CALCULATE INTERACTIONS *
422              **************************/
423
424             r10              = _mm256_mul_pd(rsq10,rinv10);
425
426             /* Calculate table index by multiplying r with table scale and truncate to integer */
427             rt               = _mm256_mul_pd(r10,vftabscale);
428             vfitab           = _mm256_cvttpd_epi32(rt);
429             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
430             vfitab           = _mm_slli_epi32(vfitab,2);
431
432             /* CUBIC SPLINE TABLE ELECTROSTATICS */
433             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
434             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
435             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
436             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
437             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
438             Heps             = _mm256_mul_pd(vfeps,H);
439             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
440             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
441             velec            = _mm256_mul_pd(qq10,VV);
442             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
443             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
444
445             /* Update potential sum for this i atom from the interaction with this j atom. */
446             velecsum         = _mm256_add_pd(velecsum,velec);
447
448             fscal            = felec;
449
450             /* Calculate temporary vectorial force */
451             tx               = _mm256_mul_pd(fscal,dx10);
452             ty               = _mm256_mul_pd(fscal,dy10);
453             tz               = _mm256_mul_pd(fscal,dz10);
454
455             /* Update vectorial force */
456             fix1             = _mm256_add_pd(fix1,tx);
457             fiy1             = _mm256_add_pd(fiy1,ty);
458             fiz1             = _mm256_add_pd(fiz1,tz);
459
460             fjx0             = _mm256_add_pd(fjx0,tx);
461             fjy0             = _mm256_add_pd(fjy0,ty);
462             fjz0             = _mm256_add_pd(fjz0,tz);
463
464             /**************************
465              * CALCULATE INTERACTIONS *
466              **************************/
467
468             r11              = _mm256_mul_pd(rsq11,rinv11);
469
470             /* Calculate table index by multiplying r with table scale and truncate to integer */
471             rt               = _mm256_mul_pd(r11,vftabscale);
472             vfitab           = _mm256_cvttpd_epi32(rt);
473             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
474             vfitab           = _mm_slli_epi32(vfitab,2);
475
476             /* CUBIC SPLINE TABLE ELECTROSTATICS */
477             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
478             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
479             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
480             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
481             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
482             Heps             = _mm256_mul_pd(vfeps,H);
483             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
484             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
485             velec            = _mm256_mul_pd(qq11,VV);
486             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
487             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
488
489             /* Update potential sum for this i atom from the interaction with this j atom. */
490             velecsum         = _mm256_add_pd(velecsum,velec);
491
492             fscal            = felec;
493
494             /* Calculate temporary vectorial force */
495             tx               = _mm256_mul_pd(fscal,dx11);
496             ty               = _mm256_mul_pd(fscal,dy11);
497             tz               = _mm256_mul_pd(fscal,dz11);
498
499             /* Update vectorial force */
500             fix1             = _mm256_add_pd(fix1,tx);
501             fiy1             = _mm256_add_pd(fiy1,ty);
502             fiz1             = _mm256_add_pd(fiz1,tz);
503
504             fjx1             = _mm256_add_pd(fjx1,tx);
505             fjy1             = _mm256_add_pd(fjy1,ty);
506             fjz1             = _mm256_add_pd(fjz1,tz);
507
508             /**************************
509              * CALCULATE INTERACTIONS *
510              **************************/
511
512             r12              = _mm256_mul_pd(rsq12,rinv12);
513
514             /* Calculate table index by multiplying r with table scale and truncate to integer */
515             rt               = _mm256_mul_pd(r12,vftabscale);
516             vfitab           = _mm256_cvttpd_epi32(rt);
517             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
518             vfitab           = _mm_slli_epi32(vfitab,2);
519
520             /* CUBIC SPLINE TABLE ELECTROSTATICS */
521             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
522             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
523             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
524             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
525             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
526             Heps             = _mm256_mul_pd(vfeps,H);
527             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
528             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
529             velec            = _mm256_mul_pd(qq12,VV);
530             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
531             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
532
533             /* Update potential sum for this i atom from the interaction with this j atom. */
534             velecsum         = _mm256_add_pd(velecsum,velec);
535
536             fscal            = felec;
537
538             /* Calculate temporary vectorial force */
539             tx               = _mm256_mul_pd(fscal,dx12);
540             ty               = _mm256_mul_pd(fscal,dy12);
541             tz               = _mm256_mul_pd(fscal,dz12);
542
543             /* Update vectorial force */
544             fix1             = _mm256_add_pd(fix1,tx);
545             fiy1             = _mm256_add_pd(fiy1,ty);
546             fiz1             = _mm256_add_pd(fiz1,tz);
547
548             fjx2             = _mm256_add_pd(fjx2,tx);
549             fjy2             = _mm256_add_pd(fjy2,ty);
550             fjz2             = _mm256_add_pd(fjz2,tz);
551
552             /**************************
553              * CALCULATE INTERACTIONS *
554              **************************/
555
556             r20              = _mm256_mul_pd(rsq20,rinv20);
557
558             /* Calculate table index by multiplying r with table scale and truncate to integer */
559             rt               = _mm256_mul_pd(r20,vftabscale);
560             vfitab           = _mm256_cvttpd_epi32(rt);
561             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
562             vfitab           = _mm_slli_epi32(vfitab,2);
563
564             /* CUBIC SPLINE TABLE ELECTROSTATICS */
565             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
566             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
567             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
568             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
569             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
570             Heps             = _mm256_mul_pd(vfeps,H);
571             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
572             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
573             velec            = _mm256_mul_pd(qq20,VV);
574             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
575             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
576
577             /* Update potential sum for this i atom from the interaction with this j atom. */
578             velecsum         = _mm256_add_pd(velecsum,velec);
579
580             fscal            = felec;
581
582             /* Calculate temporary vectorial force */
583             tx               = _mm256_mul_pd(fscal,dx20);
584             ty               = _mm256_mul_pd(fscal,dy20);
585             tz               = _mm256_mul_pd(fscal,dz20);
586
587             /* Update vectorial force */
588             fix2             = _mm256_add_pd(fix2,tx);
589             fiy2             = _mm256_add_pd(fiy2,ty);
590             fiz2             = _mm256_add_pd(fiz2,tz);
591
592             fjx0             = _mm256_add_pd(fjx0,tx);
593             fjy0             = _mm256_add_pd(fjy0,ty);
594             fjz0             = _mm256_add_pd(fjz0,tz);
595
596             /**************************
597              * CALCULATE INTERACTIONS *
598              **************************/
599
600             r21              = _mm256_mul_pd(rsq21,rinv21);
601
602             /* Calculate table index by multiplying r with table scale and truncate to integer */
603             rt               = _mm256_mul_pd(r21,vftabscale);
604             vfitab           = _mm256_cvttpd_epi32(rt);
605             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
606             vfitab           = _mm_slli_epi32(vfitab,2);
607
608             /* CUBIC SPLINE TABLE ELECTROSTATICS */
609             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
610             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
611             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
612             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
613             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
614             Heps             = _mm256_mul_pd(vfeps,H);
615             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
616             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
617             velec            = _mm256_mul_pd(qq21,VV);
618             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
619             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
620
621             /* Update potential sum for this i atom from the interaction with this j atom. */
622             velecsum         = _mm256_add_pd(velecsum,velec);
623
624             fscal            = felec;
625
626             /* Calculate temporary vectorial force */
627             tx               = _mm256_mul_pd(fscal,dx21);
628             ty               = _mm256_mul_pd(fscal,dy21);
629             tz               = _mm256_mul_pd(fscal,dz21);
630
631             /* Update vectorial force */
632             fix2             = _mm256_add_pd(fix2,tx);
633             fiy2             = _mm256_add_pd(fiy2,ty);
634             fiz2             = _mm256_add_pd(fiz2,tz);
635
636             fjx1             = _mm256_add_pd(fjx1,tx);
637             fjy1             = _mm256_add_pd(fjy1,ty);
638             fjz1             = _mm256_add_pd(fjz1,tz);
639
640             /**************************
641              * CALCULATE INTERACTIONS *
642              **************************/
643
644             r22              = _mm256_mul_pd(rsq22,rinv22);
645
646             /* Calculate table index by multiplying r with table scale and truncate to integer */
647             rt               = _mm256_mul_pd(r22,vftabscale);
648             vfitab           = _mm256_cvttpd_epi32(rt);
649             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
650             vfitab           = _mm_slli_epi32(vfitab,2);
651
652             /* CUBIC SPLINE TABLE ELECTROSTATICS */
653             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
654             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
655             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
656             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
657             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
658             Heps             = _mm256_mul_pd(vfeps,H);
659             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
660             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
661             velec            = _mm256_mul_pd(qq22,VV);
662             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
663             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
664
665             /* Update potential sum for this i atom from the interaction with this j atom. */
666             velecsum         = _mm256_add_pd(velecsum,velec);
667
668             fscal            = felec;
669
670             /* Calculate temporary vectorial force */
671             tx               = _mm256_mul_pd(fscal,dx22);
672             ty               = _mm256_mul_pd(fscal,dy22);
673             tz               = _mm256_mul_pd(fscal,dz22);
674
675             /* Update vectorial force */
676             fix2             = _mm256_add_pd(fix2,tx);
677             fiy2             = _mm256_add_pd(fiy2,ty);
678             fiz2             = _mm256_add_pd(fiz2,tz);
679
680             fjx2             = _mm256_add_pd(fjx2,tx);
681             fjy2             = _mm256_add_pd(fjy2,ty);
682             fjz2             = _mm256_add_pd(fjz2,tz);
683
684             fjptrA             = f+j_coord_offsetA;
685             fjptrB             = f+j_coord_offsetB;
686             fjptrC             = f+j_coord_offsetC;
687             fjptrD             = f+j_coord_offsetD;
688
689             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
690                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
691
692             /* Inner loop uses 400 flops */
693         }
694
695         if(jidx<j_index_end)
696         {
697
698             /* Get j neighbor index, and coordinate index */
699             jnrlistA         = jjnr[jidx];
700             jnrlistB         = jjnr[jidx+1];
701             jnrlistC         = jjnr[jidx+2];
702             jnrlistD         = jjnr[jidx+3];
703             /* Sign of each element will be negative for non-real atoms.
704              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
705              * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
706              */
707             tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
708
709             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
710             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
711             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
712
713             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
714             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
715             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
716             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
717             j_coord_offsetA  = DIM*jnrA;
718             j_coord_offsetB  = DIM*jnrB;
719             j_coord_offsetC  = DIM*jnrC;
720             j_coord_offsetD  = DIM*jnrD;
721
722             /* load j atom coordinates */
723             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
724                                                  x+j_coord_offsetC,x+j_coord_offsetD,
725                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
726
727             /* Calculate displacement vector */
728             dx00             = _mm256_sub_pd(ix0,jx0);
729             dy00             = _mm256_sub_pd(iy0,jy0);
730             dz00             = _mm256_sub_pd(iz0,jz0);
731             dx01             = _mm256_sub_pd(ix0,jx1);
732             dy01             = _mm256_sub_pd(iy0,jy1);
733             dz01             = _mm256_sub_pd(iz0,jz1);
734             dx02             = _mm256_sub_pd(ix0,jx2);
735             dy02             = _mm256_sub_pd(iy0,jy2);
736             dz02             = _mm256_sub_pd(iz0,jz2);
737             dx10             = _mm256_sub_pd(ix1,jx0);
738             dy10             = _mm256_sub_pd(iy1,jy0);
739             dz10             = _mm256_sub_pd(iz1,jz0);
740             dx11             = _mm256_sub_pd(ix1,jx1);
741             dy11             = _mm256_sub_pd(iy1,jy1);
742             dz11             = _mm256_sub_pd(iz1,jz1);
743             dx12             = _mm256_sub_pd(ix1,jx2);
744             dy12             = _mm256_sub_pd(iy1,jy2);
745             dz12             = _mm256_sub_pd(iz1,jz2);
746             dx20             = _mm256_sub_pd(ix2,jx0);
747             dy20             = _mm256_sub_pd(iy2,jy0);
748             dz20             = _mm256_sub_pd(iz2,jz0);
749             dx21             = _mm256_sub_pd(ix2,jx1);
750             dy21             = _mm256_sub_pd(iy2,jy1);
751             dz21             = _mm256_sub_pd(iz2,jz1);
752             dx22             = _mm256_sub_pd(ix2,jx2);
753             dy22             = _mm256_sub_pd(iy2,jy2);
754             dz22             = _mm256_sub_pd(iz2,jz2);
755
756             /* Calculate squared distance and things based on it */
757             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
758             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
759             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
760             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
761             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
762             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
763             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
764             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
765             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
766
767             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
768             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
769             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
770             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
771             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
772             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
773             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
774             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
775             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
776
777             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
778
779             fjx0             = _mm256_setzero_pd();
780             fjy0             = _mm256_setzero_pd();
781             fjz0             = _mm256_setzero_pd();
782             fjx1             = _mm256_setzero_pd();
783             fjy1             = _mm256_setzero_pd();
784             fjz1             = _mm256_setzero_pd();
785             fjx2             = _mm256_setzero_pd();
786             fjy2             = _mm256_setzero_pd();
787             fjz2             = _mm256_setzero_pd();
788
789             /**************************
790              * CALCULATE INTERACTIONS *
791              **************************/
792
793             r00              = _mm256_mul_pd(rsq00,rinv00);
794             r00              = _mm256_andnot_pd(dummy_mask,r00);
795
796             /* Calculate table index by multiplying r with table scale and truncate to integer */
797             rt               = _mm256_mul_pd(r00,vftabscale);
798             vfitab           = _mm256_cvttpd_epi32(rt);
799             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
800             vfitab           = _mm_slli_epi32(vfitab,2);
801
802             /* CUBIC SPLINE TABLE ELECTROSTATICS */
803             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
804             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
805             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
806             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
807             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
808             Heps             = _mm256_mul_pd(vfeps,H);
809             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
810             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
811             velec            = _mm256_mul_pd(qq00,VV);
812             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
813             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
814
815             /* LENNARD-JONES DISPERSION/REPULSION */
816
817             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
818             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
819             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
820             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
821             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
822
823             /* Update potential sum for this i atom from the interaction with this j atom. */
824             velec            = _mm256_andnot_pd(dummy_mask,velec);
825             velecsum         = _mm256_add_pd(velecsum,velec);
826             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
827             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
828
829             fscal            = _mm256_add_pd(felec,fvdw);
830
831             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
832
833             /* Calculate temporary vectorial force */
834             tx               = _mm256_mul_pd(fscal,dx00);
835             ty               = _mm256_mul_pd(fscal,dy00);
836             tz               = _mm256_mul_pd(fscal,dz00);
837
838             /* Update vectorial force */
839             fix0             = _mm256_add_pd(fix0,tx);
840             fiy0             = _mm256_add_pd(fiy0,ty);
841             fiz0             = _mm256_add_pd(fiz0,tz);
842
843             fjx0             = _mm256_add_pd(fjx0,tx);
844             fjy0             = _mm256_add_pd(fjy0,ty);
845             fjz0             = _mm256_add_pd(fjz0,tz);
846
847             /**************************
848              * CALCULATE INTERACTIONS *
849              **************************/
850
851             r01              = _mm256_mul_pd(rsq01,rinv01);
852             r01              = _mm256_andnot_pd(dummy_mask,r01);
853
854             /* Calculate table index by multiplying r with table scale and truncate to integer */
855             rt               = _mm256_mul_pd(r01,vftabscale);
856             vfitab           = _mm256_cvttpd_epi32(rt);
857             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
858             vfitab           = _mm_slli_epi32(vfitab,2);
859
860             /* CUBIC SPLINE TABLE ELECTROSTATICS */
861             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
862             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
863             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
864             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
865             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
866             Heps             = _mm256_mul_pd(vfeps,H);
867             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
868             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
869             velec            = _mm256_mul_pd(qq01,VV);
870             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
871             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
872
873             /* Update potential sum for this i atom from the interaction with this j atom. */
874             velec            = _mm256_andnot_pd(dummy_mask,velec);
875             velecsum         = _mm256_add_pd(velecsum,velec);
876
877             fscal            = felec;
878
879             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
880
881             /* Calculate temporary vectorial force */
882             tx               = _mm256_mul_pd(fscal,dx01);
883             ty               = _mm256_mul_pd(fscal,dy01);
884             tz               = _mm256_mul_pd(fscal,dz01);
885
886             /* Update vectorial force */
887             fix0             = _mm256_add_pd(fix0,tx);
888             fiy0             = _mm256_add_pd(fiy0,ty);
889             fiz0             = _mm256_add_pd(fiz0,tz);
890
891             fjx1             = _mm256_add_pd(fjx1,tx);
892             fjy1             = _mm256_add_pd(fjy1,ty);
893             fjz1             = _mm256_add_pd(fjz1,tz);
894
895             /**************************
896              * CALCULATE INTERACTIONS *
897              **************************/
898
899             r02              = _mm256_mul_pd(rsq02,rinv02);
900             r02              = _mm256_andnot_pd(dummy_mask,r02);
901
902             /* Calculate table index by multiplying r with table scale and truncate to integer */
903             rt               = _mm256_mul_pd(r02,vftabscale);
904             vfitab           = _mm256_cvttpd_epi32(rt);
905             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
906             vfitab           = _mm_slli_epi32(vfitab,2);
907
908             /* CUBIC SPLINE TABLE ELECTROSTATICS */
909             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
910             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
911             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
912             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
913             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
914             Heps             = _mm256_mul_pd(vfeps,H);
915             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
916             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
917             velec            = _mm256_mul_pd(qq02,VV);
918             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
919             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
920
921             /* Update potential sum for this i atom from the interaction with this j atom. */
922             velec            = _mm256_andnot_pd(dummy_mask,velec);
923             velecsum         = _mm256_add_pd(velecsum,velec);
924
925             fscal            = felec;
926
927             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
928
929             /* Calculate temporary vectorial force */
930             tx               = _mm256_mul_pd(fscal,dx02);
931             ty               = _mm256_mul_pd(fscal,dy02);
932             tz               = _mm256_mul_pd(fscal,dz02);
933
934             /* Update vectorial force */
935             fix0             = _mm256_add_pd(fix0,tx);
936             fiy0             = _mm256_add_pd(fiy0,ty);
937             fiz0             = _mm256_add_pd(fiz0,tz);
938
939             fjx2             = _mm256_add_pd(fjx2,tx);
940             fjy2             = _mm256_add_pd(fjy2,ty);
941             fjz2             = _mm256_add_pd(fjz2,tz);
942
943             /**************************
944              * CALCULATE INTERACTIONS *
945              **************************/
946
947             r10              = _mm256_mul_pd(rsq10,rinv10);
948             r10              = _mm256_andnot_pd(dummy_mask,r10);
949
950             /* Calculate table index by multiplying r with table scale and truncate to integer */
951             rt               = _mm256_mul_pd(r10,vftabscale);
952             vfitab           = _mm256_cvttpd_epi32(rt);
953             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
954             vfitab           = _mm_slli_epi32(vfitab,2);
955
956             /* CUBIC SPLINE TABLE ELECTROSTATICS */
957             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
958             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
959             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
960             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
961             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
962             Heps             = _mm256_mul_pd(vfeps,H);
963             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
964             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
965             velec            = _mm256_mul_pd(qq10,VV);
966             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
967             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
968
969             /* Update potential sum for this i atom from the interaction with this j atom. */
970             velec            = _mm256_andnot_pd(dummy_mask,velec);
971             velecsum         = _mm256_add_pd(velecsum,velec);
972
973             fscal            = felec;
974
975             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
976
977             /* Calculate temporary vectorial force */
978             tx               = _mm256_mul_pd(fscal,dx10);
979             ty               = _mm256_mul_pd(fscal,dy10);
980             tz               = _mm256_mul_pd(fscal,dz10);
981
982             /* Update vectorial force */
983             fix1             = _mm256_add_pd(fix1,tx);
984             fiy1             = _mm256_add_pd(fiy1,ty);
985             fiz1             = _mm256_add_pd(fiz1,tz);
986
987             fjx0             = _mm256_add_pd(fjx0,tx);
988             fjy0             = _mm256_add_pd(fjy0,ty);
989             fjz0             = _mm256_add_pd(fjz0,tz);
990
991             /**************************
992              * CALCULATE INTERACTIONS *
993              **************************/
994
995             r11              = _mm256_mul_pd(rsq11,rinv11);
996             r11              = _mm256_andnot_pd(dummy_mask,r11);
997
998             /* Calculate table index by multiplying r with table scale and truncate to integer */
999             rt               = _mm256_mul_pd(r11,vftabscale);
1000             vfitab           = _mm256_cvttpd_epi32(rt);
1001             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1002             vfitab           = _mm_slli_epi32(vfitab,2);
1003
1004             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1005             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1006             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1007             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1008             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1009             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1010             Heps             = _mm256_mul_pd(vfeps,H);
1011             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1012             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1013             velec            = _mm256_mul_pd(qq11,VV);
1014             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1015             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1016
1017             /* Update potential sum for this i atom from the interaction with this j atom. */
1018             velec            = _mm256_andnot_pd(dummy_mask,velec);
1019             velecsum         = _mm256_add_pd(velecsum,velec);
1020
1021             fscal            = felec;
1022
1023             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1024
1025             /* Calculate temporary vectorial force */
1026             tx               = _mm256_mul_pd(fscal,dx11);
1027             ty               = _mm256_mul_pd(fscal,dy11);
1028             tz               = _mm256_mul_pd(fscal,dz11);
1029
1030             /* Update vectorial force */
1031             fix1             = _mm256_add_pd(fix1,tx);
1032             fiy1             = _mm256_add_pd(fiy1,ty);
1033             fiz1             = _mm256_add_pd(fiz1,tz);
1034
1035             fjx1             = _mm256_add_pd(fjx1,tx);
1036             fjy1             = _mm256_add_pd(fjy1,ty);
1037             fjz1             = _mm256_add_pd(fjz1,tz);
1038
1039             /**************************
1040              * CALCULATE INTERACTIONS *
1041              **************************/
1042
1043             r12              = _mm256_mul_pd(rsq12,rinv12);
1044             r12              = _mm256_andnot_pd(dummy_mask,r12);
1045
1046             /* Calculate table index by multiplying r with table scale and truncate to integer */
1047             rt               = _mm256_mul_pd(r12,vftabscale);
1048             vfitab           = _mm256_cvttpd_epi32(rt);
1049             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1050             vfitab           = _mm_slli_epi32(vfitab,2);
1051
1052             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1053             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1054             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1055             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1056             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1057             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1058             Heps             = _mm256_mul_pd(vfeps,H);
1059             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1060             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1061             velec            = _mm256_mul_pd(qq12,VV);
1062             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1063             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1064
1065             /* Update potential sum for this i atom from the interaction with this j atom. */
1066             velec            = _mm256_andnot_pd(dummy_mask,velec);
1067             velecsum         = _mm256_add_pd(velecsum,velec);
1068
1069             fscal            = felec;
1070
1071             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1072
1073             /* Calculate temporary vectorial force */
1074             tx               = _mm256_mul_pd(fscal,dx12);
1075             ty               = _mm256_mul_pd(fscal,dy12);
1076             tz               = _mm256_mul_pd(fscal,dz12);
1077
1078             /* Update vectorial force */
1079             fix1             = _mm256_add_pd(fix1,tx);
1080             fiy1             = _mm256_add_pd(fiy1,ty);
1081             fiz1             = _mm256_add_pd(fiz1,tz);
1082
1083             fjx2             = _mm256_add_pd(fjx2,tx);
1084             fjy2             = _mm256_add_pd(fjy2,ty);
1085             fjz2             = _mm256_add_pd(fjz2,tz);
1086
1087             /**************************
1088              * CALCULATE INTERACTIONS *
1089              **************************/
1090
1091             r20              = _mm256_mul_pd(rsq20,rinv20);
1092             r20              = _mm256_andnot_pd(dummy_mask,r20);
1093
1094             /* Calculate table index by multiplying r with table scale and truncate to integer */
1095             rt               = _mm256_mul_pd(r20,vftabscale);
1096             vfitab           = _mm256_cvttpd_epi32(rt);
1097             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1098             vfitab           = _mm_slli_epi32(vfitab,2);
1099
1100             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1101             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1102             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1103             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1104             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1105             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1106             Heps             = _mm256_mul_pd(vfeps,H);
1107             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1108             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1109             velec            = _mm256_mul_pd(qq20,VV);
1110             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1111             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1112
1113             /* Update potential sum for this i atom from the interaction with this j atom. */
1114             velec            = _mm256_andnot_pd(dummy_mask,velec);
1115             velecsum         = _mm256_add_pd(velecsum,velec);
1116
1117             fscal            = felec;
1118
1119             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1120
1121             /* Calculate temporary vectorial force */
1122             tx               = _mm256_mul_pd(fscal,dx20);
1123             ty               = _mm256_mul_pd(fscal,dy20);
1124             tz               = _mm256_mul_pd(fscal,dz20);
1125
1126             /* Update vectorial force */
1127             fix2             = _mm256_add_pd(fix2,tx);
1128             fiy2             = _mm256_add_pd(fiy2,ty);
1129             fiz2             = _mm256_add_pd(fiz2,tz);
1130
1131             fjx0             = _mm256_add_pd(fjx0,tx);
1132             fjy0             = _mm256_add_pd(fjy0,ty);
1133             fjz0             = _mm256_add_pd(fjz0,tz);
1134
1135             /**************************
1136              * CALCULATE INTERACTIONS *
1137              **************************/
1138
1139             r21              = _mm256_mul_pd(rsq21,rinv21);
1140             r21              = _mm256_andnot_pd(dummy_mask,r21);
1141
1142             /* Calculate table index by multiplying r with table scale and truncate to integer */
1143             rt               = _mm256_mul_pd(r21,vftabscale);
1144             vfitab           = _mm256_cvttpd_epi32(rt);
1145             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1146             vfitab           = _mm_slli_epi32(vfitab,2);
1147
1148             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1149             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1150             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1151             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1152             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1153             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1154             Heps             = _mm256_mul_pd(vfeps,H);
1155             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1156             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1157             velec            = _mm256_mul_pd(qq21,VV);
1158             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1159             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1160
1161             /* Update potential sum for this i atom from the interaction with this j atom. */
1162             velec            = _mm256_andnot_pd(dummy_mask,velec);
1163             velecsum         = _mm256_add_pd(velecsum,velec);
1164
1165             fscal            = felec;
1166
1167             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1168
1169             /* Calculate temporary vectorial force */
1170             tx               = _mm256_mul_pd(fscal,dx21);
1171             ty               = _mm256_mul_pd(fscal,dy21);
1172             tz               = _mm256_mul_pd(fscal,dz21);
1173
1174             /* Update vectorial force */
1175             fix2             = _mm256_add_pd(fix2,tx);
1176             fiy2             = _mm256_add_pd(fiy2,ty);
1177             fiz2             = _mm256_add_pd(fiz2,tz);
1178
1179             fjx1             = _mm256_add_pd(fjx1,tx);
1180             fjy1             = _mm256_add_pd(fjy1,ty);
1181             fjz1             = _mm256_add_pd(fjz1,tz);
1182
1183             /**************************
1184              * CALCULATE INTERACTIONS *
1185              **************************/
1186
1187             r22              = _mm256_mul_pd(rsq22,rinv22);
1188             r22              = _mm256_andnot_pd(dummy_mask,r22);
1189
1190             /* Calculate table index by multiplying r with table scale and truncate to integer */
1191             rt               = _mm256_mul_pd(r22,vftabscale);
1192             vfitab           = _mm256_cvttpd_epi32(rt);
1193             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1194             vfitab           = _mm_slli_epi32(vfitab,2);
1195
1196             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1197             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1198             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1199             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1200             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1201             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1202             Heps             = _mm256_mul_pd(vfeps,H);
1203             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1204             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1205             velec            = _mm256_mul_pd(qq22,VV);
1206             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1207             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1208
1209             /* Update potential sum for this i atom from the interaction with this j atom. */
1210             velec            = _mm256_andnot_pd(dummy_mask,velec);
1211             velecsum         = _mm256_add_pd(velecsum,velec);
1212
1213             fscal            = felec;
1214
1215             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1216
1217             /* Calculate temporary vectorial force */
1218             tx               = _mm256_mul_pd(fscal,dx22);
1219             ty               = _mm256_mul_pd(fscal,dy22);
1220             tz               = _mm256_mul_pd(fscal,dz22);
1221
1222             /* Update vectorial force */
1223             fix2             = _mm256_add_pd(fix2,tx);
1224             fiy2             = _mm256_add_pd(fiy2,ty);
1225             fiz2             = _mm256_add_pd(fiz2,tz);
1226
1227             fjx2             = _mm256_add_pd(fjx2,tx);
1228             fjy2             = _mm256_add_pd(fjy2,ty);
1229             fjz2             = _mm256_add_pd(fjz2,tz);
1230
1231             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1232             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1233             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1234             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1235
1236             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1237                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1238
1239             /* Inner loop uses 409 flops */
1240         }
1241
1242         /* End of innermost loop */
1243
1244         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1245                                                  f+i_coord_offset,fshift+i_shift_offset);
1246
1247         ggid                        = gid[iidx];
1248         /* Update potential energies */
1249         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1250         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1251
1252         /* Increment number of inner iterations */
1253         inneriter                  += j_index_end - j_index_start;
1254
1255         /* Outer loop uses 20 flops */
1256     }
1257
1258     /* Increment number of outer iterations */
1259     outeriter        += nri;
1260
1261     /* Update outer/inner flops */
1262
1263     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1264 }
1265 /*
1266  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_double
1267  * Electrostatics interaction: CubicSplineTable
1268  * VdW interaction:            LennardJones
1269  * Geometry:                   Water3-Water3
1270  * Calculate force/pot:        Force
1271  */
1272 void
1273 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_double
1274                     (t_nblist * gmx_restrict                nlist,
1275                      rvec * gmx_restrict                    xx,
1276                      rvec * gmx_restrict                    ff,
1277                      t_forcerec * gmx_restrict              fr,
1278                      t_mdatoms * gmx_restrict               mdatoms,
1279                      nb_kernel_data_t * gmx_restrict        kernel_data,
1280                      t_nrnb * gmx_restrict                  nrnb)
1281 {
1282     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop,
1283      * or just 0 for non-waters.
1284      * Suffixes A,B,C,D refer to the j loop unrolling done with AVX, i.e. the four different
1285      * jnr indices whose data is placed in the four positions of the SIMD register.
1286      */
1287     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1288     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1289     int              jnrA,jnrB,jnrC,jnrD;
1290     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1291     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1292     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1293     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1294     real             rcutoff_scalar;
1295     real             *shiftvec,*fshift,*x,*f;
1296     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1297     real             scratch[4*DIM];
1298     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1299     real *           vdwioffsetptr0;
1300     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1301     real *           vdwioffsetptr1;
1302     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1303     real *           vdwioffsetptr2;
1304     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1305     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1306     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1307     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1308     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1309     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1310     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1311     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1312     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1313     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1314     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1315     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1316     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1317     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1318     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1319     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1320     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1321     real             *charge;
1322     int              nvdwtype;
1323     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1324     int              *vdwtype;
1325     real             *vdwparam;
1326     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1327     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1328     __m128i          vfitab;
1329     __m128i          ifour       = _mm_set1_epi32(4);
1330     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1331     real             *vftab;
1332     __m256d          dummy_mask,cutoff_mask;
1333     __m128           tmpmask0,tmpmask1;
1334     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1335     __m256d          one     = _mm256_set1_pd(1.0);
1336     __m256d          two     = _mm256_set1_pd(2.0);
1337     x                = xx[0];
1338     f                = ff[0];
1339
1340     nri              = nlist->nri;
1341     iinr             = nlist->iinr;
1342     jindex           = nlist->jindex;
1343     jjnr             = nlist->jjnr;
1344     shiftidx         = nlist->shift;
1345     gid              = nlist->gid;
1346     shiftvec         = fr->shift_vec[0];
1347     fshift           = fr->fshift[0];
1348     facel            = _mm256_set1_pd(fr->epsfac);
1349     charge           = mdatoms->chargeA;
1350     nvdwtype         = fr->ntype;
1351     vdwparam         = fr->nbfp;
1352     vdwtype          = mdatoms->typeA;
1353
1354     vftab            = kernel_data->table_elec->data;
1355     vftabscale       = _mm256_set1_pd(kernel_data->table_elec->scale);
1356
1357     /* Setup water-specific parameters */
1358     inr              = nlist->iinr[0];
1359     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1360     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1361     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1362     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1363
1364     jq0              = _mm256_set1_pd(charge[inr+0]);
1365     jq1              = _mm256_set1_pd(charge[inr+1]);
1366     jq2              = _mm256_set1_pd(charge[inr+2]);
1367     vdwjidx0A        = 2*vdwtype[inr+0];
1368     qq00             = _mm256_mul_pd(iq0,jq0);
1369     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1370     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1371     qq01             = _mm256_mul_pd(iq0,jq1);
1372     qq02             = _mm256_mul_pd(iq0,jq2);
1373     qq10             = _mm256_mul_pd(iq1,jq0);
1374     qq11             = _mm256_mul_pd(iq1,jq1);
1375     qq12             = _mm256_mul_pd(iq1,jq2);
1376     qq20             = _mm256_mul_pd(iq2,jq0);
1377     qq21             = _mm256_mul_pd(iq2,jq1);
1378     qq22             = _mm256_mul_pd(iq2,jq2);
1379
1380     /* Avoid stupid compiler warnings */
1381     jnrA = jnrB = jnrC = jnrD = 0;
1382     j_coord_offsetA = 0;
1383     j_coord_offsetB = 0;
1384     j_coord_offsetC = 0;
1385     j_coord_offsetD = 0;
1386
1387     outeriter        = 0;
1388     inneriter        = 0;
1389
1390     for(iidx=0;iidx<4*DIM;iidx++)
1391     {
1392         scratch[iidx] = 0.0;
1393     }
1394
1395     /* Start outer loop over neighborlists */
1396     for(iidx=0; iidx<nri; iidx++)
1397     {
1398         /* Load shift vector for this list */
1399         i_shift_offset   = DIM*shiftidx[iidx];
1400
1401         /* Load limits for loop over neighbors */
1402         j_index_start    = jindex[iidx];
1403         j_index_end      = jindex[iidx+1];
1404
1405         /* Get outer coordinate index */
1406         inr              = iinr[iidx];
1407         i_coord_offset   = DIM*inr;
1408
1409         /* Load i particle coords and add shift vector */
1410         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1411                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1412
1413         fix0             = _mm256_setzero_pd();
1414         fiy0             = _mm256_setzero_pd();
1415         fiz0             = _mm256_setzero_pd();
1416         fix1             = _mm256_setzero_pd();
1417         fiy1             = _mm256_setzero_pd();
1418         fiz1             = _mm256_setzero_pd();
1419         fix2             = _mm256_setzero_pd();
1420         fiy2             = _mm256_setzero_pd();
1421         fiz2             = _mm256_setzero_pd();
1422
1423         /* Start inner kernel loop */
1424         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1425         {
1426
1427             /* Get j neighbor index, and coordinate index */
1428             jnrA             = jjnr[jidx];
1429             jnrB             = jjnr[jidx+1];
1430             jnrC             = jjnr[jidx+2];
1431             jnrD             = jjnr[jidx+3];
1432             j_coord_offsetA  = DIM*jnrA;
1433             j_coord_offsetB  = DIM*jnrB;
1434             j_coord_offsetC  = DIM*jnrC;
1435             j_coord_offsetD  = DIM*jnrD;
1436
1437             /* load j atom coordinates */
1438             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1439                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1440                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1441
1442             /* Calculate displacement vector */
1443             dx00             = _mm256_sub_pd(ix0,jx0);
1444             dy00             = _mm256_sub_pd(iy0,jy0);
1445             dz00             = _mm256_sub_pd(iz0,jz0);
1446             dx01             = _mm256_sub_pd(ix0,jx1);
1447             dy01             = _mm256_sub_pd(iy0,jy1);
1448             dz01             = _mm256_sub_pd(iz0,jz1);
1449             dx02             = _mm256_sub_pd(ix0,jx2);
1450             dy02             = _mm256_sub_pd(iy0,jy2);
1451             dz02             = _mm256_sub_pd(iz0,jz2);
1452             dx10             = _mm256_sub_pd(ix1,jx0);
1453             dy10             = _mm256_sub_pd(iy1,jy0);
1454             dz10             = _mm256_sub_pd(iz1,jz0);
1455             dx11             = _mm256_sub_pd(ix1,jx1);
1456             dy11             = _mm256_sub_pd(iy1,jy1);
1457             dz11             = _mm256_sub_pd(iz1,jz1);
1458             dx12             = _mm256_sub_pd(ix1,jx2);
1459             dy12             = _mm256_sub_pd(iy1,jy2);
1460             dz12             = _mm256_sub_pd(iz1,jz2);
1461             dx20             = _mm256_sub_pd(ix2,jx0);
1462             dy20             = _mm256_sub_pd(iy2,jy0);
1463             dz20             = _mm256_sub_pd(iz2,jz0);
1464             dx21             = _mm256_sub_pd(ix2,jx1);
1465             dy21             = _mm256_sub_pd(iy2,jy1);
1466             dz21             = _mm256_sub_pd(iz2,jz1);
1467             dx22             = _mm256_sub_pd(ix2,jx2);
1468             dy22             = _mm256_sub_pd(iy2,jy2);
1469             dz22             = _mm256_sub_pd(iz2,jz2);
1470
1471             /* Calculate squared distance and things based on it */
1472             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1473             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1474             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1475             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1476             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1477             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1478             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1479             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1480             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1481
1482             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1483             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
1484             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
1485             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
1486             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1487             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1488             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
1489             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1490             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1491
1492             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1493
1494             fjx0             = _mm256_setzero_pd();
1495             fjy0             = _mm256_setzero_pd();
1496             fjz0             = _mm256_setzero_pd();
1497             fjx1             = _mm256_setzero_pd();
1498             fjy1             = _mm256_setzero_pd();
1499             fjz1             = _mm256_setzero_pd();
1500             fjx2             = _mm256_setzero_pd();
1501             fjy2             = _mm256_setzero_pd();
1502             fjz2             = _mm256_setzero_pd();
1503
1504             /**************************
1505              * CALCULATE INTERACTIONS *
1506              **************************/
1507
1508             r00              = _mm256_mul_pd(rsq00,rinv00);
1509
1510             /* Calculate table index by multiplying r with table scale and truncate to integer */
1511             rt               = _mm256_mul_pd(r00,vftabscale);
1512             vfitab           = _mm256_cvttpd_epi32(rt);
1513             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1514             vfitab           = _mm_slli_epi32(vfitab,2);
1515
1516             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1517             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1518             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1519             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1520             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1521             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1522             Heps             = _mm256_mul_pd(vfeps,H);
1523             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1524             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1525             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
1526
1527             /* LENNARD-JONES DISPERSION/REPULSION */
1528
1529             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1530             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1531
1532             fscal            = _mm256_add_pd(felec,fvdw);
1533
1534             /* Calculate temporary vectorial force */
1535             tx               = _mm256_mul_pd(fscal,dx00);
1536             ty               = _mm256_mul_pd(fscal,dy00);
1537             tz               = _mm256_mul_pd(fscal,dz00);
1538
1539             /* Update vectorial force */
1540             fix0             = _mm256_add_pd(fix0,tx);
1541             fiy0             = _mm256_add_pd(fiy0,ty);
1542             fiz0             = _mm256_add_pd(fiz0,tz);
1543
1544             fjx0             = _mm256_add_pd(fjx0,tx);
1545             fjy0             = _mm256_add_pd(fjy0,ty);
1546             fjz0             = _mm256_add_pd(fjz0,tz);
1547
1548             /**************************
1549              * CALCULATE INTERACTIONS *
1550              **************************/
1551
1552             r01              = _mm256_mul_pd(rsq01,rinv01);
1553
1554             /* Calculate table index by multiplying r with table scale and truncate to integer */
1555             rt               = _mm256_mul_pd(r01,vftabscale);
1556             vfitab           = _mm256_cvttpd_epi32(rt);
1557             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1558             vfitab           = _mm_slli_epi32(vfitab,2);
1559
1560             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1561             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1562             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1563             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1564             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1565             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1566             Heps             = _mm256_mul_pd(vfeps,H);
1567             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1568             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1569             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
1570
1571             fscal            = felec;
1572
1573             /* Calculate temporary vectorial force */
1574             tx               = _mm256_mul_pd(fscal,dx01);
1575             ty               = _mm256_mul_pd(fscal,dy01);
1576             tz               = _mm256_mul_pd(fscal,dz01);
1577
1578             /* Update vectorial force */
1579             fix0             = _mm256_add_pd(fix0,tx);
1580             fiy0             = _mm256_add_pd(fiy0,ty);
1581             fiz0             = _mm256_add_pd(fiz0,tz);
1582
1583             fjx1             = _mm256_add_pd(fjx1,tx);
1584             fjy1             = _mm256_add_pd(fjy1,ty);
1585             fjz1             = _mm256_add_pd(fjz1,tz);
1586
1587             /**************************
1588              * CALCULATE INTERACTIONS *
1589              **************************/
1590
1591             r02              = _mm256_mul_pd(rsq02,rinv02);
1592
1593             /* Calculate table index by multiplying r with table scale and truncate to integer */
1594             rt               = _mm256_mul_pd(r02,vftabscale);
1595             vfitab           = _mm256_cvttpd_epi32(rt);
1596             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1597             vfitab           = _mm_slli_epi32(vfitab,2);
1598
1599             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1600             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1601             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1602             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1603             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1604             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1605             Heps             = _mm256_mul_pd(vfeps,H);
1606             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1607             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1608             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
1609
1610             fscal            = felec;
1611
1612             /* Calculate temporary vectorial force */
1613             tx               = _mm256_mul_pd(fscal,dx02);
1614             ty               = _mm256_mul_pd(fscal,dy02);
1615             tz               = _mm256_mul_pd(fscal,dz02);
1616
1617             /* Update vectorial force */
1618             fix0             = _mm256_add_pd(fix0,tx);
1619             fiy0             = _mm256_add_pd(fiy0,ty);
1620             fiz0             = _mm256_add_pd(fiz0,tz);
1621
1622             fjx2             = _mm256_add_pd(fjx2,tx);
1623             fjy2             = _mm256_add_pd(fjy2,ty);
1624             fjz2             = _mm256_add_pd(fjz2,tz);
1625
1626             /**************************
1627              * CALCULATE INTERACTIONS *
1628              **************************/
1629
1630             r10              = _mm256_mul_pd(rsq10,rinv10);
1631
1632             /* Calculate table index by multiplying r with table scale and truncate to integer */
1633             rt               = _mm256_mul_pd(r10,vftabscale);
1634             vfitab           = _mm256_cvttpd_epi32(rt);
1635             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1636             vfitab           = _mm_slli_epi32(vfitab,2);
1637
1638             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1639             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1640             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1641             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1642             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1643             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1644             Heps             = _mm256_mul_pd(vfeps,H);
1645             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1646             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1647             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
1648
1649             fscal            = felec;
1650
1651             /* Calculate temporary vectorial force */
1652             tx               = _mm256_mul_pd(fscal,dx10);
1653             ty               = _mm256_mul_pd(fscal,dy10);
1654             tz               = _mm256_mul_pd(fscal,dz10);
1655
1656             /* Update vectorial force */
1657             fix1             = _mm256_add_pd(fix1,tx);
1658             fiy1             = _mm256_add_pd(fiy1,ty);
1659             fiz1             = _mm256_add_pd(fiz1,tz);
1660
1661             fjx0             = _mm256_add_pd(fjx0,tx);
1662             fjy0             = _mm256_add_pd(fjy0,ty);
1663             fjz0             = _mm256_add_pd(fjz0,tz);
1664
1665             /**************************
1666              * CALCULATE INTERACTIONS *
1667              **************************/
1668
1669             r11              = _mm256_mul_pd(rsq11,rinv11);
1670
1671             /* Calculate table index by multiplying r with table scale and truncate to integer */
1672             rt               = _mm256_mul_pd(r11,vftabscale);
1673             vfitab           = _mm256_cvttpd_epi32(rt);
1674             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1675             vfitab           = _mm_slli_epi32(vfitab,2);
1676
1677             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1678             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1679             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1680             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1681             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1682             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1683             Heps             = _mm256_mul_pd(vfeps,H);
1684             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1685             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1686             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1687
1688             fscal            = felec;
1689
1690             /* Calculate temporary vectorial force */
1691             tx               = _mm256_mul_pd(fscal,dx11);
1692             ty               = _mm256_mul_pd(fscal,dy11);
1693             tz               = _mm256_mul_pd(fscal,dz11);
1694
1695             /* Update vectorial force */
1696             fix1             = _mm256_add_pd(fix1,tx);
1697             fiy1             = _mm256_add_pd(fiy1,ty);
1698             fiz1             = _mm256_add_pd(fiz1,tz);
1699
1700             fjx1             = _mm256_add_pd(fjx1,tx);
1701             fjy1             = _mm256_add_pd(fjy1,ty);
1702             fjz1             = _mm256_add_pd(fjz1,tz);
1703
1704             /**************************
1705              * CALCULATE INTERACTIONS *
1706              **************************/
1707
1708             r12              = _mm256_mul_pd(rsq12,rinv12);
1709
1710             /* Calculate table index by multiplying r with table scale and truncate to integer */
1711             rt               = _mm256_mul_pd(r12,vftabscale);
1712             vfitab           = _mm256_cvttpd_epi32(rt);
1713             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1714             vfitab           = _mm_slli_epi32(vfitab,2);
1715
1716             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1717             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1718             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1719             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1720             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1721             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1722             Heps             = _mm256_mul_pd(vfeps,H);
1723             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1724             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1725             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1726
1727             fscal            = felec;
1728
1729             /* Calculate temporary vectorial force */
1730             tx               = _mm256_mul_pd(fscal,dx12);
1731             ty               = _mm256_mul_pd(fscal,dy12);
1732             tz               = _mm256_mul_pd(fscal,dz12);
1733
1734             /* Update vectorial force */
1735             fix1             = _mm256_add_pd(fix1,tx);
1736             fiy1             = _mm256_add_pd(fiy1,ty);
1737             fiz1             = _mm256_add_pd(fiz1,tz);
1738
1739             fjx2             = _mm256_add_pd(fjx2,tx);
1740             fjy2             = _mm256_add_pd(fjy2,ty);
1741             fjz2             = _mm256_add_pd(fjz2,tz);
1742
1743             /**************************
1744              * CALCULATE INTERACTIONS *
1745              **************************/
1746
1747             r20              = _mm256_mul_pd(rsq20,rinv20);
1748
1749             /* Calculate table index by multiplying r with table scale and truncate to integer */
1750             rt               = _mm256_mul_pd(r20,vftabscale);
1751             vfitab           = _mm256_cvttpd_epi32(rt);
1752             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1753             vfitab           = _mm_slli_epi32(vfitab,2);
1754
1755             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1756             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1757             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1758             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1759             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1760             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1761             Heps             = _mm256_mul_pd(vfeps,H);
1762             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1763             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1764             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1765
1766             fscal            = felec;
1767
1768             /* Calculate temporary vectorial force */
1769             tx               = _mm256_mul_pd(fscal,dx20);
1770             ty               = _mm256_mul_pd(fscal,dy20);
1771             tz               = _mm256_mul_pd(fscal,dz20);
1772
1773             /* Update vectorial force */
1774             fix2             = _mm256_add_pd(fix2,tx);
1775             fiy2             = _mm256_add_pd(fiy2,ty);
1776             fiz2             = _mm256_add_pd(fiz2,tz);
1777
1778             fjx0             = _mm256_add_pd(fjx0,tx);
1779             fjy0             = _mm256_add_pd(fjy0,ty);
1780             fjz0             = _mm256_add_pd(fjz0,tz);
1781
1782             /**************************
1783              * CALCULATE INTERACTIONS *
1784              **************************/
1785
1786             r21              = _mm256_mul_pd(rsq21,rinv21);
1787
1788             /* Calculate table index by multiplying r with table scale and truncate to integer */
1789             rt               = _mm256_mul_pd(r21,vftabscale);
1790             vfitab           = _mm256_cvttpd_epi32(rt);
1791             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1792             vfitab           = _mm_slli_epi32(vfitab,2);
1793
1794             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1795             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1796             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1797             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1798             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1799             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1800             Heps             = _mm256_mul_pd(vfeps,H);
1801             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1802             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1803             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1804
1805             fscal            = felec;
1806
1807             /* Calculate temporary vectorial force */
1808             tx               = _mm256_mul_pd(fscal,dx21);
1809             ty               = _mm256_mul_pd(fscal,dy21);
1810             tz               = _mm256_mul_pd(fscal,dz21);
1811
1812             /* Update vectorial force */
1813             fix2             = _mm256_add_pd(fix2,tx);
1814             fiy2             = _mm256_add_pd(fiy2,ty);
1815             fiz2             = _mm256_add_pd(fiz2,tz);
1816
1817             fjx1             = _mm256_add_pd(fjx1,tx);
1818             fjy1             = _mm256_add_pd(fjy1,ty);
1819             fjz1             = _mm256_add_pd(fjz1,tz);
1820
1821             /**************************
1822              * CALCULATE INTERACTIONS *
1823              **************************/
1824
1825             r22              = _mm256_mul_pd(rsq22,rinv22);
1826
1827             /* Calculate table index by multiplying r with table scale and truncate to integer */
1828             rt               = _mm256_mul_pd(r22,vftabscale);
1829             vfitab           = _mm256_cvttpd_epi32(rt);
1830             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1831             vfitab           = _mm_slli_epi32(vfitab,2);
1832
1833             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1834             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1835             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1836             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1837             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1838             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1839             Heps             = _mm256_mul_pd(vfeps,H);
1840             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1841             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1842             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1843
1844             fscal            = felec;
1845
1846             /* Calculate temporary vectorial force */
1847             tx               = _mm256_mul_pd(fscal,dx22);
1848             ty               = _mm256_mul_pd(fscal,dy22);
1849             tz               = _mm256_mul_pd(fscal,dz22);
1850
1851             /* Update vectorial force */
1852             fix2             = _mm256_add_pd(fix2,tx);
1853             fiy2             = _mm256_add_pd(fiy2,ty);
1854             fiz2             = _mm256_add_pd(fiz2,tz);
1855
1856             fjx2             = _mm256_add_pd(fjx2,tx);
1857             fjy2             = _mm256_add_pd(fjy2,ty);
1858             fjz2             = _mm256_add_pd(fjz2,tz);
1859
1860             fjptrA             = f+j_coord_offsetA;
1861             fjptrB             = f+j_coord_offsetB;
1862             fjptrC             = f+j_coord_offsetC;
1863             fjptrD             = f+j_coord_offsetD;
1864
1865             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1866                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1867
1868             /* Inner loop uses 359 flops */
1869         }
1870
1871         if(jidx<j_index_end)
1872         {
1873
1874             /* Get j neighbor index, and coordinate index */
1875             jnrlistA         = jjnr[jidx];
1876             jnrlistB         = jjnr[jidx+1];
1877             jnrlistC         = jjnr[jidx+2];
1878             jnrlistD         = jjnr[jidx+3];
1879             /* Sign of each element will be negative for non-real atoms.
1880              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1881              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1882              */
1883             tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1884
1885             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1886             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1887             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1888
1889             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1890             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1891             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1892             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1893             j_coord_offsetA  = DIM*jnrA;
1894             j_coord_offsetB  = DIM*jnrB;
1895             j_coord_offsetC  = DIM*jnrC;
1896             j_coord_offsetD  = DIM*jnrD;
1897
1898             /* load j atom coordinates */
1899             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1900                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1901                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1902
1903             /* Calculate displacement vector */
1904             dx00             = _mm256_sub_pd(ix0,jx0);
1905             dy00             = _mm256_sub_pd(iy0,jy0);
1906             dz00             = _mm256_sub_pd(iz0,jz0);
1907             dx01             = _mm256_sub_pd(ix0,jx1);
1908             dy01             = _mm256_sub_pd(iy0,jy1);
1909             dz01             = _mm256_sub_pd(iz0,jz1);
1910             dx02             = _mm256_sub_pd(ix0,jx2);
1911             dy02             = _mm256_sub_pd(iy0,jy2);
1912             dz02             = _mm256_sub_pd(iz0,jz2);
1913             dx10             = _mm256_sub_pd(ix1,jx0);
1914             dy10             = _mm256_sub_pd(iy1,jy0);
1915             dz10             = _mm256_sub_pd(iz1,jz0);
1916             dx11             = _mm256_sub_pd(ix1,jx1);
1917             dy11             = _mm256_sub_pd(iy1,jy1);
1918             dz11             = _mm256_sub_pd(iz1,jz1);
1919             dx12             = _mm256_sub_pd(ix1,jx2);
1920             dy12             = _mm256_sub_pd(iy1,jy2);
1921             dz12             = _mm256_sub_pd(iz1,jz2);
1922             dx20             = _mm256_sub_pd(ix2,jx0);
1923             dy20             = _mm256_sub_pd(iy2,jy0);
1924             dz20             = _mm256_sub_pd(iz2,jz0);
1925             dx21             = _mm256_sub_pd(ix2,jx1);
1926             dy21             = _mm256_sub_pd(iy2,jy1);
1927             dz21             = _mm256_sub_pd(iz2,jz1);
1928             dx22             = _mm256_sub_pd(ix2,jx2);
1929             dy22             = _mm256_sub_pd(iy2,jy2);
1930             dz22             = _mm256_sub_pd(iz2,jz2);
1931
1932             /* Calculate squared distance and things based on it */
1933             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1934             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1935             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1936             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1937             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1938             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1939             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1940             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1941             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1942
1943             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1944             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
1945             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
1946             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
1947             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1948             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1949             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
1950             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1951             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1952
1953             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1954
1955             fjx0             = _mm256_setzero_pd();
1956             fjy0             = _mm256_setzero_pd();
1957             fjz0             = _mm256_setzero_pd();
1958             fjx1             = _mm256_setzero_pd();
1959             fjy1             = _mm256_setzero_pd();
1960             fjz1             = _mm256_setzero_pd();
1961             fjx2             = _mm256_setzero_pd();
1962             fjy2             = _mm256_setzero_pd();
1963             fjz2             = _mm256_setzero_pd();
1964
1965             /**************************
1966              * CALCULATE INTERACTIONS *
1967              **************************/
1968
1969             r00              = _mm256_mul_pd(rsq00,rinv00);
1970             r00              = _mm256_andnot_pd(dummy_mask,r00);
1971
1972             /* Calculate table index by multiplying r with table scale and truncate to integer */
1973             rt               = _mm256_mul_pd(r00,vftabscale);
1974             vfitab           = _mm256_cvttpd_epi32(rt);
1975             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1976             vfitab           = _mm_slli_epi32(vfitab,2);
1977
1978             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1979             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1980             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1981             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1982             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1983             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1984             Heps             = _mm256_mul_pd(vfeps,H);
1985             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1986             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1987             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
1988
1989             /* LENNARD-JONES DISPERSION/REPULSION */
1990
1991             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1992             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1993
1994             fscal            = _mm256_add_pd(felec,fvdw);
1995
1996             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1997
1998             /* Calculate temporary vectorial force */
1999             tx               = _mm256_mul_pd(fscal,dx00);
2000             ty               = _mm256_mul_pd(fscal,dy00);
2001             tz               = _mm256_mul_pd(fscal,dz00);
2002
2003             /* Update vectorial force */
2004             fix0             = _mm256_add_pd(fix0,tx);
2005             fiy0             = _mm256_add_pd(fiy0,ty);
2006             fiz0             = _mm256_add_pd(fiz0,tz);
2007
2008             fjx0             = _mm256_add_pd(fjx0,tx);
2009             fjy0             = _mm256_add_pd(fjy0,ty);
2010             fjz0             = _mm256_add_pd(fjz0,tz);
2011
2012             /**************************
2013              * CALCULATE INTERACTIONS *
2014              **************************/
2015
2016             r01              = _mm256_mul_pd(rsq01,rinv01);
2017             r01              = _mm256_andnot_pd(dummy_mask,r01);
2018
2019             /* Calculate table index by multiplying r with table scale and truncate to integer */
2020             rt               = _mm256_mul_pd(r01,vftabscale);
2021             vfitab           = _mm256_cvttpd_epi32(rt);
2022             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2023             vfitab           = _mm_slli_epi32(vfitab,2);
2024
2025             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2026             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2027             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2028             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2029             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2030             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2031             Heps             = _mm256_mul_pd(vfeps,H);
2032             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2033             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2034             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
2035
2036             fscal            = felec;
2037
2038             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2039
2040             /* Calculate temporary vectorial force */
2041             tx               = _mm256_mul_pd(fscal,dx01);
2042             ty               = _mm256_mul_pd(fscal,dy01);
2043             tz               = _mm256_mul_pd(fscal,dz01);
2044
2045             /* Update vectorial force */
2046             fix0             = _mm256_add_pd(fix0,tx);
2047             fiy0             = _mm256_add_pd(fiy0,ty);
2048             fiz0             = _mm256_add_pd(fiz0,tz);
2049
2050             fjx1             = _mm256_add_pd(fjx1,tx);
2051             fjy1             = _mm256_add_pd(fjy1,ty);
2052             fjz1             = _mm256_add_pd(fjz1,tz);
2053
2054             /**************************
2055              * CALCULATE INTERACTIONS *
2056              **************************/
2057
2058             r02              = _mm256_mul_pd(rsq02,rinv02);
2059             r02              = _mm256_andnot_pd(dummy_mask,r02);
2060
2061             /* Calculate table index by multiplying r with table scale and truncate to integer */
2062             rt               = _mm256_mul_pd(r02,vftabscale);
2063             vfitab           = _mm256_cvttpd_epi32(rt);
2064             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2065             vfitab           = _mm_slli_epi32(vfitab,2);
2066
2067             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2068             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2069             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2070             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2071             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2072             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2073             Heps             = _mm256_mul_pd(vfeps,H);
2074             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2075             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2076             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
2077
2078             fscal            = felec;
2079
2080             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2081
2082             /* Calculate temporary vectorial force */
2083             tx               = _mm256_mul_pd(fscal,dx02);
2084             ty               = _mm256_mul_pd(fscal,dy02);
2085             tz               = _mm256_mul_pd(fscal,dz02);
2086
2087             /* Update vectorial force */
2088             fix0             = _mm256_add_pd(fix0,tx);
2089             fiy0             = _mm256_add_pd(fiy0,ty);
2090             fiz0             = _mm256_add_pd(fiz0,tz);
2091
2092             fjx2             = _mm256_add_pd(fjx2,tx);
2093             fjy2             = _mm256_add_pd(fjy2,ty);
2094             fjz2             = _mm256_add_pd(fjz2,tz);
2095
2096             /**************************
2097              * CALCULATE INTERACTIONS *
2098              **************************/
2099
2100             r10              = _mm256_mul_pd(rsq10,rinv10);
2101             r10              = _mm256_andnot_pd(dummy_mask,r10);
2102
2103             /* Calculate table index by multiplying r with table scale and truncate to integer */
2104             rt               = _mm256_mul_pd(r10,vftabscale);
2105             vfitab           = _mm256_cvttpd_epi32(rt);
2106             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2107             vfitab           = _mm_slli_epi32(vfitab,2);
2108
2109             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2110             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2111             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2112             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2113             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2114             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2115             Heps             = _mm256_mul_pd(vfeps,H);
2116             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2117             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2118             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
2119
2120             fscal            = felec;
2121
2122             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2123
2124             /* Calculate temporary vectorial force */
2125             tx               = _mm256_mul_pd(fscal,dx10);
2126             ty               = _mm256_mul_pd(fscal,dy10);
2127             tz               = _mm256_mul_pd(fscal,dz10);
2128
2129             /* Update vectorial force */
2130             fix1             = _mm256_add_pd(fix1,tx);
2131             fiy1             = _mm256_add_pd(fiy1,ty);
2132             fiz1             = _mm256_add_pd(fiz1,tz);
2133
2134             fjx0             = _mm256_add_pd(fjx0,tx);
2135             fjy0             = _mm256_add_pd(fjy0,ty);
2136             fjz0             = _mm256_add_pd(fjz0,tz);
2137
2138             /**************************
2139              * CALCULATE INTERACTIONS *
2140              **************************/
2141
2142             r11              = _mm256_mul_pd(rsq11,rinv11);
2143             r11              = _mm256_andnot_pd(dummy_mask,r11);
2144
2145             /* Calculate table index by multiplying r with table scale and truncate to integer */
2146             rt               = _mm256_mul_pd(r11,vftabscale);
2147             vfitab           = _mm256_cvttpd_epi32(rt);
2148             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2149             vfitab           = _mm_slli_epi32(vfitab,2);
2150
2151             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2152             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2153             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2154             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2155             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2156             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2157             Heps             = _mm256_mul_pd(vfeps,H);
2158             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2159             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2160             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
2161
2162             fscal            = felec;
2163
2164             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2165
2166             /* Calculate temporary vectorial force */
2167             tx               = _mm256_mul_pd(fscal,dx11);
2168             ty               = _mm256_mul_pd(fscal,dy11);
2169             tz               = _mm256_mul_pd(fscal,dz11);
2170
2171             /* Update vectorial force */
2172             fix1             = _mm256_add_pd(fix1,tx);
2173             fiy1             = _mm256_add_pd(fiy1,ty);
2174             fiz1             = _mm256_add_pd(fiz1,tz);
2175
2176             fjx1             = _mm256_add_pd(fjx1,tx);
2177             fjy1             = _mm256_add_pd(fjy1,ty);
2178             fjz1             = _mm256_add_pd(fjz1,tz);
2179
2180             /**************************
2181              * CALCULATE INTERACTIONS *
2182              **************************/
2183
2184             r12              = _mm256_mul_pd(rsq12,rinv12);
2185             r12              = _mm256_andnot_pd(dummy_mask,r12);
2186
2187             /* Calculate table index by multiplying r with table scale and truncate to integer */
2188             rt               = _mm256_mul_pd(r12,vftabscale);
2189             vfitab           = _mm256_cvttpd_epi32(rt);
2190             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2191             vfitab           = _mm_slli_epi32(vfitab,2);
2192
2193             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2194             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2195             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2196             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2197             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2198             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2199             Heps             = _mm256_mul_pd(vfeps,H);
2200             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2201             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2202             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
2203
2204             fscal            = felec;
2205
2206             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2207
2208             /* Calculate temporary vectorial force */
2209             tx               = _mm256_mul_pd(fscal,dx12);
2210             ty               = _mm256_mul_pd(fscal,dy12);
2211             tz               = _mm256_mul_pd(fscal,dz12);
2212
2213             /* Update vectorial force */
2214             fix1             = _mm256_add_pd(fix1,tx);
2215             fiy1             = _mm256_add_pd(fiy1,ty);
2216             fiz1             = _mm256_add_pd(fiz1,tz);
2217
2218             fjx2             = _mm256_add_pd(fjx2,tx);
2219             fjy2             = _mm256_add_pd(fjy2,ty);
2220             fjz2             = _mm256_add_pd(fjz2,tz);
2221
2222             /**************************
2223              * CALCULATE INTERACTIONS *
2224              **************************/
2225
2226             r20              = _mm256_mul_pd(rsq20,rinv20);
2227             r20              = _mm256_andnot_pd(dummy_mask,r20);
2228
2229             /* Calculate table index by multiplying r with table scale and truncate to integer */
2230             rt               = _mm256_mul_pd(r20,vftabscale);
2231             vfitab           = _mm256_cvttpd_epi32(rt);
2232             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2233             vfitab           = _mm_slli_epi32(vfitab,2);
2234
2235             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2236             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2237             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2238             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2239             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2240             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2241             Heps             = _mm256_mul_pd(vfeps,H);
2242             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2243             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2244             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
2245
2246             fscal            = felec;
2247
2248             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2249
2250             /* Calculate temporary vectorial force */
2251             tx               = _mm256_mul_pd(fscal,dx20);
2252             ty               = _mm256_mul_pd(fscal,dy20);
2253             tz               = _mm256_mul_pd(fscal,dz20);
2254
2255             /* Update vectorial force */
2256             fix2             = _mm256_add_pd(fix2,tx);
2257             fiy2             = _mm256_add_pd(fiy2,ty);
2258             fiz2             = _mm256_add_pd(fiz2,tz);
2259
2260             fjx0             = _mm256_add_pd(fjx0,tx);
2261             fjy0             = _mm256_add_pd(fjy0,ty);
2262             fjz0             = _mm256_add_pd(fjz0,tz);
2263
2264             /**************************
2265              * CALCULATE INTERACTIONS *
2266              **************************/
2267
2268             r21              = _mm256_mul_pd(rsq21,rinv21);
2269             r21              = _mm256_andnot_pd(dummy_mask,r21);
2270
2271             /* Calculate table index by multiplying r with table scale and truncate to integer */
2272             rt               = _mm256_mul_pd(r21,vftabscale);
2273             vfitab           = _mm256_cvttpd_epi32(rt);
2274             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2275             vfitab           = _mm_slli_epi32(vfitab,2);
2276
2277             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2278             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2279             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2280             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2281             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2282             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2283             Heps             = _mm256_mul_pd(vfeps,H);
2284             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2285             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2286             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
2287
2288             fscal            = felec;
2289
2290             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2291
2292             /* Calculate temporary vectorial force */
2293             tx               = _mm256_mul_pd(fscal,dx21);
2294             ty               = _mm256_mul_pd(fscal,dy21);
2295             tz               = _mm256_mul_pd(fscal,dz21);
2296
2297             /* Update vectorial force */
2298             fix2             = _mm256_add_pd(fix2,tx);
2299             fiy2             = _mm256_add_pd(fiy2,ty);
2300             fiz2             = _mm256_add_pd(fiz2,tz);
2301
2302             fjx1             = _mm256_add_pd(fjx1,tx);
2303             fjy1             = _mm256_add_pd(fjy1,ty);
2304             fjz1             = _mm256_add_pd(fjz1,tz);
2305
2306             /**************************
2307              * CALCULATE INTERACTIONS *
2308              **************************/
2309
2310             r22              = _mm256_mul_pd(rsq22,rinv22);
2311             r22              = _mm256_andnot_pd(dummy_mask,r22);
2312
2313             /* Calculate table index by multiplying r with table scale and truncate to integer */
2314             rt               = _mm256_mul_pd(r22,vftabscale);
2315             vfitab           = _mm256_cvttpd_epi32(rt);
2316             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2317             vfitab           = _mm_slli_epi32(vfitab,2);
2318
2319             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2320             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2321             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2322             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2323             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2324             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2325             Heps             = _mm256_mul_pd(vfeps,H);
2326             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2327             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2328             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
2329
2330             fscal            = felec;
2331
2332             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2333
2334             /* Calculate temporary vectorial force */
2335             tx               = _mm256_mul_pd(fscal,dx22);
2336             ty               = _mm256_mul_pd(fscal,dy22);
2337             tz               = _mm256_mul_pd(fscal,dz22);
2338
2339             /* Update vectorial force */
2340             fix2             = _mm256_add_pd(fix2,tx);
2341             fiy2             = _mm256_add_pd(fiy2,ty);
2342             fiz2             = _mm256_add_pd(fiz2,tz);
2343
2344             fjx2             = _mm256_add_pd(fjx2,tx);
2345             fjy2             = _mm256_add_pd(fjy2,ty);
2346             fjz2             = _mm256_add_pd(fjz2,tz);
2347
2348             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2349             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2350             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2351             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2352
2353             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2354                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2355
2356             /* Inner loop uses 368 flops */
2357         }
2358
2359         /* End of innermost loop */
2360
2361         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2362                                                  f+i_coord_offset,fshift+i_shift_offset);
2363
2364         /* Increment number of inner iterations */
2365         inneriter                  += j_index_end - j_index_start;
2366
2367         /* Outer loop uses 18 flops */
2368     }
2369
2370     /* Increment number of outer iterations */
2371     outeriter        += nri;
2372
2373     /* Update outer/inner flops */
2374
2375     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);
2376 }