Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_double
51  * Electrostatics interaction: CubicSplineTable
52  * VdW interaction:            CubicSplineTable
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     real *           vdwioffsetptr0;
84     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     real *           vdwioffsetptr1;
86     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     real *           vdwioffsetptr2;
88     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     real *           vdwioffsetptr3;
90     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
110     real             *charge;
111     int              nvdwtype;
112     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113     int              *vdwtype;
114     real             *vdwparam;
115     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
116     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
117     __m128i          vfitab;
118     __m128i          ifour       = _mm_set1_epi32(4);
119     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
120     real             *vftab;
121     __m256d          dummy_mask,cutoff_mask;
122     __m128           tmpmask0,tmpmask1;
123     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
124     __m256d          one     = _mm256_set1_pd(1.0);
125     __m256d          two     = _mm256_set1_pd(2.0);
126     x                = xx[0];
127     f                = ff[0];
128
129     nri              = nlist->nri;
130     iinr             = nlist->iinr;
131     jindex           = nlist->jindex;
132     jjnr             = nlist->jjnr;
133     shiftidx         = nlist->shift;
134     gid              = nlist->gid;
135     shiftvec         = fr->shift_vec[0];
136     fshift           = fr->fshift[0];
137     facel            = _mm256_set1_pd(fr->ic->epsfac);
138     charge           = mdatoms->chargeA;
139     nvdwtype         = fr->ntype;
140     vdwparam         = fr->nbfp;
141     vdwtype          = mdatoms->typeA;
142
143     vftab            = kernel_data->table_elec_vdw->data;
144     vftabscale       = _mm256_set1_pd(kernel_data->table_elec_vdw->scale);
145
146     /* Setup water-specific parameters */
147     inr              = nlist->iinr[0];
148     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
149     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
150     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
151     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
152
153     jq1              = _mm256_set1_pd(charge[inr+1]);
154     jq2              = _mm256_set1_pd(charge[inr+2]);
155     jq3              = _mm256_set1_pd(charge[inr+3]);
156     vdwjidx0A        = 2*vdwtype[inr+0];
157     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
158     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
159     qq11             = _mm256_mul_pd(iq1,jq1);
160     qq12             = _mm256_mul_pd(iq1,jq2);
161     qq13             = _mm256_mul_pd(iq1,jq3);
162     qq21             = _mm256_mul_pd(iq2,jq1);
163     qq22             = _mm256_mul_pd(iq2,jq2);
164     qq23             = _mm256_mul_pd(iq2,jq3);
165     qq31             = _mm256_mul_pd(iq3,jq1);
166     qq32             = _mm256_mul_pd(iq3,jq2);
167     qq33             = _mm256_mul_pd(iq3,jq3);
168
169     /* Avoid stupid compiler warnings */
170     jnrA = jnrB = jnrC = jnrD = 0;
171     j_coord_offsetA = 0;
172     j_coord_offsetB = 0;
173     j_coord_offsetC = 0;
174     j_coord_offsetD = 0;
175
176     outeriter        = 0;
177     inneriter        = 0;
178
179     for(iidx=0;iidx<4*DIM;iidx++)
180     {
181         scratch[iidx] = 0.0;
182     }
183
184     /* Start outer loop over neighborlists */
185     for(iidx=0; iidx<nri; iidx++)
186     {
187         /* Load shift vector for this list */
188         i_shift_offset   = DIM*shiftidx[iidx];
189
190         /* Load limits for loop over neighbors */
191         j_index_start    = jindex[iidx];
192         j_index_end      = jindex[iidx+1];
193
194         /* Get outer coordinate index */
195         inr              = iinr[iidx];
196         i_coord_offset   = DIM*inr;
197
198         /* Load i particle coords and add shift vector */
199         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
200                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
201
202         fix0             = _mm256_setzero_pd();
203         fiy0             = _mm256_setzero_pd();
204         fiz0             = _mm256_setzero_pd();
205         fix1             = _mm256_setzero_pd();
206         fiy1             = _mm256_setzero_pd();
207         fiz1             = _mm256_setzero_pd();
208         fix2             = _mm256_setzero_pd();
209         fiy2             = _mm256_setzero_pd();
210         fiz2             = _mm256_setzero_pd();
211         fix3             = _mm256_setzero_pd();
212         fiy3             = _mm256_setzero_pd();
213         fiz3             = _mm256_setzero_pd();
214
215         /* Reset potential sums */
216         velecsum         = _mm256_setzero_pd();
217         vvdwsum          = _mm256_setzero_pd();
218
219         /* Start inner kernel loop */
220         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
221         {
222
223             /* Get j neighbor index, and coordinate index */
224             jnrA             = jjnr[jidx];
225             jnrB             = jjnr[jidx+1];
226             jnrC             = jjnr[jidx+2];
227             jnrD             = jjnr[jidx+3];
228             j_coord_offsetA  = DIM*jnrA;
229             j_coord_offsetB  = DIM*jnrB;
230             j_coord_offsetC  = DIM*jnrC;
231             j_coord_offsetD  = DIM*jnrD;
232
233             /* load j atom coordinates */
234             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
235                                                  x+j_coord_offsetC,x+j_coord_offsetD,
236                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
237                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
238
239             /* Calculate displacement vector */
240             dx00             = _mm256_sub_pd(ix0,jx0);
241             dy00             = _mm256_sub_pd(iy0,jy0);
242             dz00             = _mm256_sub_pd(iz0,jz0);
243             dx11             = _mm256_sub_pd(ix1,jx1);
244             dy11             = _mm256_sub_pd(iy1,jy1);
245             dz11             = _mm256_sub_pd(iz1,jz1);
246             dx12             = _mm256_sub_pd(ix1,jx2);
247             dy12             = _mm256_sub_pd(iy1,jy2);
248             dz12             = _mm256_sub_pd(iz1,jz2);
249             dx13             = _mm256_sub_pd(ix1,jx3);
250             dy13             = _mm256_sub_pd(iy1,jy3);
251             dz13             = _mm256_sub_pd(iz1,jz3);
252             dx21             = _mm256_sub_pd(ix2,jx1);
253             dy21             = _mm256_sub_pd(iy2,jy1);
254             dz21             = _mm256_sub_pd(iz2,jz1);
255             dx22             = _mm256_sub_pd(ix2,jx2);
256             dy22             = _mm256_sub_pd(iy2,jy2);
257             dz22             = _mm256_sub_pd(iz2,jz2);
258             dx23             = _mm256_sub_pd(ix2,jx3);
259             dy23             = _mm256_sub_pd(iy2,jy3);
260             dz23             = _mm256_sub_pd(iz2,jz3);
261             dx31             = _mm256_sub_pd(ix3,jx1);
262             dy31             = _mm256_sub_pd(iy3,jy1);
263             dz31             = _mm256_sub_pd(iz3,jz1);
264             dx32             = _mm256_sub_pd(ix3,jx2);
265             dy32             = _mm256_sub_pd(iy3,jy2);
266             dz32             = _mm256_sub_pd(iz3,jz2);
267             dx33             = _mm256_sub_pd(ix3,jx3);
268             dy33             = _mm256_sub_pd(iy3,jy3);
269             dz33             = _mm256_sub_pd(iz3,jz3);
270
271             /* Calculate squared distance and things based on it */
272             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
273             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
274             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
275             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
276             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
277             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
278             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
279             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
280             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
281             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
282
283             rinv00           = avx256_invsqrt_d(rsq00);
284             rinv11           = avx256_invsqrt_d(rsq11);
285             rinv12           = avx256_invsqrt_d(rsq12);
286             rinv13           = avx256_invsqrt_d(rsq13);
287             rinv21           = avx256_invsqrt_d(rsq21);
288             rinv22           = avx256_invsqrt_d(rsq22);
289             rinv23           = avx256_invsqrt_d(rsq23);
290             rinv31           = avx256_invsqrt_d(rsq31);
291             rinv32           = avx256_invsqrt_d(rsq32);
292             rinv33           = avx256_invsqrt_d(rsq33);
293
294             fjx0             = _mm256_setzero_pd();
295             fjy0             = _mm256_setzero_pd();
296             fjz0             = _mm256_setzero_pd();
297             fjx1             = _mm256_setzero_pd();
298             fjy1             = _mm256_setzero_pd();
299             fjz1             = _mm256_setzero_pd();
300             fjx2             = _mm256_setzero_pd();
301             fjy2             = _mm256_setzero_pd();
302             fjz2             = _mm256_setzero_pd();
303             fjx3             = _mm256_setzero_pd();
304             fjy3             = _mm256_setzero_pd();
305             fjz3             = _mm256_setzero_pd();
306
307             /**************************
308              * CALCULATE INTERACTIONS *
309              **************************/
310
311             r00              = _mm256_mul_pd(rsq00,rinv00);
312
313             /* Calculate table index by multiplying r with table scale and truncate to integer */
314             rt               = _mm256_mul_pd(r00,vftabscale);
315             vfitab           = _mm256_cvttpd_epi32(rt);
316             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
317             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
318
319             /* CUBIC SPLINE TABLE DISPERSION */
320             vfitab           = _mm_add_epi32(vfitab,ifour);
321             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
322             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
323             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
324             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
325             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
326             Heps             = _mm256_mul_pd(vfeps,H);
327             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
328             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
329             vvdw6            = _mm256_mul_pd(c6_00,VV);
330             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
331             fvdw6            = _mm256_mul_pd(c6_00,FF);
332
333             /* CUBIC SPLINE TABLE REPULSION */
334             vfitab           = _mm_add_epi32(vfitab,ifour);
335             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
336             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
337             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
338             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
339             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
340             Heps             = _mm256_mul_pd(vfeps,H);
341             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
342             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
343             vvdw12           = _mm256_mul_pd(c12_00,VV);
344             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
345             fvdw12           = _mm256_mul_pd(c12_00,FF);
346             vvdw             = _mm256_add_pd(vvdw12,vvdw6);
347             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
348
349             /* Update potential sum for this i atom from the interaction with this j atom. */
350             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
351
352             fscal            = fvdw;
353
354             /* Calculate temporary vectorial force */
355             tx               = _mm256_mul_pd(fscal,dx00);
356             ty               = _mm256_mul_pd(fscal,dy00);
357             tz               = _mm256_mul_pd(fscal,dz00);
358
359             /* Update vectorial force */
360             fix0             = _mm256_add_pd(fix0,tx);
361             fiy0             = _mm256_add_pd(fiy0,ty);
362             fiz0             = _mm256_add_pd(fiz0,tz);
363
364             fjx0             = _mm256_add_pd(fjx0,tx);
365             fjy0             = _mm256_add_pd(fjy0,ty);
366             fjz0             = _mm256_add_pd(fjz0,tz);
367
368             /**************************
369              * CALCULATE INTERACTIONS *
370              **************************/
371
372             r11              = _mm256_mul_pd(rsq11,rinv11);
373
374             /* Calculate table index by multiplying r with table scale and truncate to integer */
375             rt               = _mm256_mul_pd(r11,vftabscale);
376             vfitab           = _mm256_cvttpd_epi32(rt);
377             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
378             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
379
380             /* CUBIC SPLINE TABLE ELECTROSTATICS */
381             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
382             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
383             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
384             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
385             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
386             Heps             = _mm256_mul_pd(vfeps,H);
387             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
388             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
389             velec            = _mm256_mul_pd(qq11,VV);
390             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
391             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
392
393             /* Update potential sum for this i atom from the interaction with this j atom. */
394             velecsum         = _mm256_add_pd(velecsum,velec);
395
396             fscal            = felec;
397
398             /* Calculate temporary vectorial force */
399             tx               = _mm256_mul_pd(fscal,dx11);
400             ty               = _mm256_mul_pd(fscal,dy11);
401             tz               = _mm256_mul_pd(fscal,dz11);
402
403             /* Update vectorial force */
404             fix1             = _mm256_add_pd(fix1,tx);
405             fiy1             = _mm256_add_pd(fiy1,ty);
406             fiz1             = _mm256_add_pd(fiz1,tz);
407
408             fjx1             = _mm256_add_pd(fjx1,tx);
409             fjy1             = _mm256_add_pd(fjy1,ty);
410             fjz1             = _mm256_add_pd(fjz1,tz);
411
412             /**************************
413              * CALCULATE INTERACTIONS *
414              **************************/
415
416             r12              = _mm256_mul_pd(rsq12,rinv12);
417
418             /* Calculate table index by multiplying r with table scale and truncate to integer */
419             rt               = _mm256_mul_pd(r12,vftabscale);
420             vfitab           = _mm256_cvttpd_epi32(rt);
421             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
422             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
423
424             /* CUBIC SPLINE TABLE ELECTROSTATICS */
425             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
426             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
427             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
428             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
429             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
430             Heps             = _mm256_mul_pd(vfeps,H);
431             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
432             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
433             velec            = _mm256_mul_pd(qq12,VV);
434             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
435             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
436
437             /* Update potential sum for this i atom from the interaction with this j atom. */
438             velecsum         = _mm256_add_pd(velecsum,velec);
439
440             fscal            = felec;
441
442             /* Calculate temporary vectorial force */
443             tx               = _mm256_mul_pd(fscal,dx12);
444             ty               = _mm256_mul_pd(fscal,dy12);
445             tz               = _mm256_mul_pd(fscal,dz12);
446
447             /* Update vectorial force */
448             fix1             = _mm256_add_pd(fix1,tx);
449             fiy1             = _mm256_add_pd(fiy1,ty);
450             fiz1             = _mm256_add_pd(fiz1,tz);
451
452             fjx2             = _mm256_add_pd(fjx2,tx);
453             fjy2             = _mm256_add_pd(fjy2,ty);
454             fjz2             = _mm256_add_pd(fjz2,tz);
455
456             /**************************
457              * CALCULATE INTERACTIONS *
458              **************************/
459
460             r13              = _mm256_mul_pd(rsq13,rinv13);
461
462             /* Calculate table index by multiplying r with table scale and truncate to integer */
463             rt               = _mm256_mul_pd(r13,vftabscale);
464             vfitab           = _mm256_cvttpd_epi32(rt);
465             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
466             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
467
468             /* CUBIC SPLINE TABLE ELECTROSTATICS */
469             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
470             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
471             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
472             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
473             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
474             Heps             = _mm256_mul_pd(vfeps,H);
475             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
476             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
477             velec            = _mm256_mul_pd(qq13,VV);
478             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
479             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
480
481             /* Update potential sum for this i atom from the interaction with this j atom. */
482             velecsum         = _mm256_add_pd(velecsum,velec);
483
484             fscal            = felec;
485
486             /* Calculate temporary vectorial force */
487             tx               = _mm256_mul_pd(fscal,dx13);
488             ty               = _mm256_mul_pd(fscal,dy13);
489             tz               = _mm256_mul_pd(fscal,dz13);
490
491             /* Update vectorial force */
492             fix1             = _mm256_add_pd(fix1,tx);
493             fiy1             = _mm256_add_pd(fiy1,ty);
494             fiz1             = _mm256_add_pd(fiz1,tz);
495
496             fjx3             = _mm256_add_pd(fjx3,tx);
497             fjy3             = _mm256_add_pd(fjy3,ty);
498             fjz3             = _mm256_add_pd(fjz3,tz);
499
500             /**************************
501              * CALCULATE INTERACTIONS *
502              **************************/
503
504             r21              = _mm256_mul_pd(rsq21,rinv21);
505
506             /* Calculate table index by multiplying r with table scale and truncate to integer */
507             rt               = _mm256_mul_pd(r21,vftabscale);
508             vfitab           = _mm256_cvttpd_epi32(rt);
509             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
510             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
511
512             /* CUBIC SPLINE TABLE ELECTROSTATICS */
513             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
514             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
515             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
516             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
517             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
518             Heps             = _mm256_mul_pd(vfeps,H);
519             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
520             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
521             velec            = _mm256_mul_pd(qq21,VV);
522             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
523             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
524
525             /* Update potential sum for this i atom from the interaction with this j atom. */
526             velecsum         = _mm256_add_pd(velecsum,velec);
527
528             fscal            = felec;
529
530             /* Calculate temporary vectorial force */
531             tx               = _mm256_mul_pd(fscal,dx21);
532             ty               = _mm256_mul_pd(fscal,dy21);
533             tz               = _mm256_mul_pd(fscal,dz21);
534
535             /* Update vectorial force */
536             fix2             = _mm256_add_pd(fix2,tx);
537             fiy2             = _mm256_add_pd(fiy2,ty);
538             fiz2             = _mm256_add_pd(fiz2,tz);
539
540             fjx1             = _mm256_add_pd(fjx1,tx);
541             fjy1             = _mm256_add_pd(fjy1,ty);
542             fjz1             = _mm256_add_pd(fjz1,tz);
543
544             /**************************
545              * CALCULATE INTERACTIONS *
546              **************************/
547
548             r22              = _mm256_mul_pd(rsq22,rinv22);
549
550             /* Calculate table index by multiplying r with table scale and truncate to integer */
551             rt               = _mm256_mul_pd(r22,vftabscale);
552             vfitab           = _mm256_cvttpd_epi32(rt);
553             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
554             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
555
556             /* CUBIC SPLINE TABLE ELECTROSTATICS */
557             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
558             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
559             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
560             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
561             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
562             Heps             = _mm256_mul_pd(vfeps,H);
563             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
564             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
565             velec            = _mm256_mul_pd(qq22,VV);
566             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
567             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
568
569             /* Update potential sum for this i atom from the interaction with this j atom. */
570             velecsum         = _mm256_add_pd(velecsum,velec);
571
572             fscal            = felec;
573
574             /* Calculate temporary vectorial force */
575             tx               = _mm256_mul_pd(fscal,dx22);
576             ty               = _mm256_mul_pd(fscal,dy22);
577             tz               = _mm256_mul_pd(fscal,dz22);
578
579             /* Update vectorial force */
580             fix2             = _mm256_add_pd(fix2,tx);
581             fiy2             = _mm256_add_pd(fiy2,ty);
582             fiz2             = _mm256_add_pd(fiz2,tz);
583
584             fjx2             = _mm256_add_pd(fjx2,tx);
585             fjy2             = _mm256_add_pd(fjy2,ty);
586             fjz2             = _mm256_add_pd(fjz2,tz);
587
588             /**************************
589              * CALCULATE INTERACTIONS *
590              **************************/
591
592             r23              = _mm256_mul_pd(rsq23,rinv23);
593
594             /* Calculate table index by multiplying r with table scale and truncate to integer */
595             rt               = _mm256_mul_pd(r23,vftabscale);
596             vfitab           = _mm256_cvttpd_epi32(rt);
597             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
598             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
599
600             /* CUBIC SPLINE TABLE ELECTROSTATICS */
601             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
602             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
603             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
604             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
605             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
606             Heps             = _mm256_mul_pd(vfeps,H);
607             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
608             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
609             velec            = _mm256_mul_pd(qq23,VV);
610             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
611             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
612
613             /* Update potential sum for this i atom from the interaction with this j atom. */
614             velecsum         = _mm256_add_pd(velecsum,velec);
615
616             fscal            = felec;
617
618             /* Calculate temporary vectorial force */
619             tx               = _mm256_mul_pd(fscal,dx23);
620             ty               = _mm256_mul_pd(fscal,dy23);
621             tz               = _mm256_mul_pd(fscal,dz23);
622
623             /* Update vectorial force */
624             fix2             = _mm256_add_pd(fix2,tx);
625             fiy2             = _mm256_add_pd(fiy2,ty);
626             fiz2             = _mm256_add_pd(fiz2,tz);
627
628             fjx3             = _mm256_add_pd(fjx3,tx);
629             fjy3             = _mm256_add_pd(fjy3,ty);
630             fjz3             = _mm256_add_pd(fjz3,tz);
631
632             /**************************
633              * CALCULATE INTERACTIONS *
634              **************************/
635
636             r31              = _mm256_mul_pd(rsq31,rinv31);
637
638             /* Calculate table index by multiplying r with table scale and truncate to integer */
639             rt               = _mm256_mul_pd(r31,vftabscale);
640             vfitab           = _mm256_cvttpd_epi32(rt);
641             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
642             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
643
644             /* CUBIC SPLINE TABLE ELECTROSTATICS */
645             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
646             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
647             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
648             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
649             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
650             Heps             = _mm256_mul_pd(vfeps,H);
651             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
652             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
653             velec            = _mm256_mul_pd(qq31,VV);
654             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
655             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
656
657             /* Update potential sum for this i atom from the interaction with this j atom. */
658             velecsum         = _mm256_add_pd(velecsum,velec);
659
660             fscal            = felec;
661
662             /* Calculate temporary vectorial force */
663             tx               = _mm256_mul_pd(fscal,dx31);
664             ty               = _mm256_mul_pd(fscal,dy31);
665             tz               = _mm256_mul_pd(fscal,dz31);
666
667             /* Update vectorial force */
668             fix3             = _mm256_add_pd(fix3,tx);
669             fiy3             = _mm256_add_pd(fiy3,ty);
670             fiz3             = _mm256_add_pd(fiz3,tz);
671
672             fjx1             = _mm256_add_pd(fjx1,tx);
673             fjy1             = _mm256_add_pd(fjy1,ty);
674             fjz1             = _mm256_add_pd(fjz1,tz);
675
676             /**************************
677              * CALCULATE INTERACTIONS *
678              **************************/
679
680             r32              = _mm256_mul_pd(rsq32,rinv32);
681
682             /* Calculate table index by multiplying r with table scale and truncate to integer */
683             rt               = _mm256_mul_pd(r32,vftabscale);
684             vfitab           = _mm256_cvttpd_epi32(rt);
685             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
686             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
687
688             /* CUBIC SPLINE TABLE ELECTROSTATICS */
689             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
690             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
691             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
692             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
693             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
694             Heps             = _mm256_mul_pd(vfeps,H);
695             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
696             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
697             velec            = _mm256_mul_pd(qq32,VV);
698             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
699             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
700
701             /* Update potential sum for this i atom from the interaction with this j atom. */
702             velecsum         = _mm256_add_pd(velecsum,velec);
703
704             fscal            = felec;
705
706             /* Calculate temporary vectorial force */
707             tx               = _mm256_mul_pd(fscal,dx32);
708             ty               = _mm256_mul_pd(fscal,dy32);
709             tz               = _mm256_mul_pd(fscal,dz32);
710
711             /* Update vectorial force */
712             fix3             = _mm256_add_pd(fix3,tx);
713             fiy3             = _mm256_add_pd(fiy3,ty);
714             fiz3             = _mm256_add_pd(fiz3,tz);
715
716             fjx2             = _mm256_add_pd(fjx2,tx);
717             fjy2             = _mm256_add_pd(fjy2,ty);
718             fjz2             = _mm256_add_pd(fjz2,tz);
719
720             /**************************
721              * CALCULATE INTERACTIONS *
722              **************************/
723
724             r33              = _mm256_mul_pd(rsq33,rinv33);
725
726             /* Calculate table index by multiplying r with table scale and truncate to integer */
727             rt               = _mm256_mul_pd(r33,vftabscale);
728             vfitab           = _mm256_cvttpd_epi32(rt);
729             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
730             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
731
732             /* CUBIC SPLINE TABLE ELECTROSTATICS */
733             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
734             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
735             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
736             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
737             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
738             Heps             = _mm256_mul_pd(vfeps,H);
739             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
740             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
741             velec            = _mm256_mul_pd(qq33,VV);
742             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
743             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
744
745             /* Update potential sum for this i atom from the interaction with this j atom. */
746             velecsum         = _mm256_add_pd(velecsum,velec);
747
748             fscal            = felec;
749
750             /* Calculate temporary vectorial force */
751             tx               = _mm256_mul_pd(fscal,dx33);
752             ty               = _mm256_mul_pd(fscal,dy33);
753             tz               = _mm256_mul_pd(fscal,dz33);
754
755             /* Update vectorial force */
756             fix3             = _mm256_add_pd(fix3,tx);
757             fiy3             = _mm256_add_pd(fiy3,ty);
758             fiz3             = _mm256_add_pd(fiz3,tz);
759
760             fjx3             = _mm256_add_pd(fjx3,tx);
761             fjy3             = _mm256_add_pd(fjy3,ty);
762             fjz3             = _mm256_add_pd(fjz3,tz);
763
764             fjptrA             = f+j_coord_offsetA;
765             fjptrB             = f+j_coord_offsetB;
766             fjptrC             = f+j_coord_offsetC;
767             fjptrD             = f+j_coord_offsetD;
768
769             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
770                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
771                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
772
773             /* Inner loop uses 446 flops */
774         }
775
776         if(jidx<j_index_end)
777         {
778
779             /* Get j neighbor index, and coordinate index */
780             jnrlistA         = jjnr[jidx];
781             jnrlistB         = jjnr[jidx+1];
782             jnrlistC         = jjnr[jidx+2];
783             jnrlistD         = jjnr[jidx+3];
784             /* Sign of each element will be negative for non-real atoms.
785              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
786              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
787              */
788             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
789
790             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
791             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
792             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
793
794             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
795             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
796             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
797             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
798             j_coord_offsetA  = DIM*jnrA;
799             j_coord_offsetB  = DIM*jnrB;
800             j_coord_offsetC  = DIM*jnrC;
801             j_coord_offsetD  = DIM*jnrD;
802
803             /* load j atom coordinates */
804             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
805                                                  x+j_coord_offsetC,x+j_coord_offsetD,
806                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
807                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
808
809             /* Calculate displacement vector */
810             dx00             = _mm256_sub_pd(ix0,jx0);
811             dy00             = _mm256_sub_pd(iy0,jy0);
812             dz00             = _mm256_sub_pd(iz0,jz0);
813             dx11             = _mm256_sub_pd(ix1,jx1);
814             dy11             = _mm256_sub_pd(iy1,jy1);
815             dz11             = _mm256_sub_pd(iz1,jz1);
816             dx12             = _mm256_sub_pd(ix1,jx2);
817             dy12             = _mm256_sub_pd(iy1,jy2);
818             dz12             = _mm256_sub_pd(iz1,jz2);
819             dx13             = _mm256_sub_pd(ix1,jx3);
820             dy13             = _mm256_sub_pd(iy1,jy3);
821             dz13             = _mm256_sub_pd(iz1,jz3);
822             dx21             = _mm256_sub_pd(ix2,jx1);
823             dy21             = _mm256_sub_pd(iy2,jy1);
824             dz21             = _mm256_sub_pd(iz2,jz1);
825             dx22             = _mm256_sub_pd(ix2,jx2);
826             dy22             = _mm256_sub_pd(iy2,jy2);
827             dz22             = _mm256_sub_pd(iz2,jz2);
828             dx23             = _mm256_sub_pd(ix2,jx3);
829             dy23             = _mm256_sub_pd(iy2,jy3);
830             dz23             = _mm256_sub_pd(iz2,jz3);
831             dx31             = _mm256_sub_pd(ix3,jx1);
832             dy31             = _mm256_sub_pd(iy3,jy1);
833             dz31             = _mm256_sub_pd(iz3,jz1);
834             dx32             = _mm256_sub_pd(ix3,jx2);
835             dy32             = _mm256_sub_pd(iy3,jy2);
836             dz32             = _mm256_sub_pd(iz3,jz2);
837             dx33             = _mm256_sub_pd(ix3,jx3);
838             dy33             = _mm256_sub_pd(iy3,jy3);
839             dz33             = _mm256_sub_pd(iz3,jz3);
840
841             /* Calculate squared distance and things based on it */
842             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
843             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
844             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
845             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
846             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
847             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
848             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
849             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
850             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
851             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
852
853             rinv00           = avx256_invsqrt_d(rsq00);
854             rinv11           = avx256_invsqrt_d(rsq11);
855             rinv12           = avx256_invsqrt_d(rsq12);
856             rinv13           = avx256_invsqrt_d(rsq13);
857             rinv21           = avx256_invsqrt_d(rsq21);
858             rinv22           = avx256_invsqrt_d(rsq22);
859             rinv23           = avx256_invsqrt_d(rsq23);
860             rinv31           = avx256_invsqrt_d(rsq31);
861             rinv32           = avx256_invsqrt_d(rsq32);
862             rinv33           = avx256_invsqrt_d(rsq33);
863
864             fjx0             = _mm256_setzero_pd();
865             fjy0             = _mm256_setzero_pd();
866             fjz0             = _mm256_setzero_pd();
867             fjx1             = _mm256_setzero_pd();
868             fjy1             = _mm256_setzero_pd();
869             fjz1             = _mm256_setzero_pd();
870             fjx2             = _mm256_setzero_pd();
871             fjy2             = _mm256_setzero_pd();
872             fjz2             = _mm256_setzero_pd();
873             fjx3             = _mm256_setzero_pd();
874             fjy3             = _mm256_setzero_pd();
875             fjz3             = _mm256_setzero_pd();
876
877             /**************************
878              * CALCULATE INTERACTIONS *
879              **************************/
880
881             r00              = _mm256_mul_pd(rsq00,rinv00);
882             r00              = _mm256_andnot_pd(dummy_mask,r00);
883
884             /* Calculate table index by multiplying r with table scale and truncate to integer */
885             rt               = _mm256_mul_pd(r00,vftabscale);
886             vfitab           = _mm256_cvttpd_epi32(rt);
887             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
888             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
889
890             /* CUBIC SPLINE TABLE DISPERSION */
891             vfitab           = _mm_add_epi32(vfitab,ifour);
892             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
893             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
894             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
895             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
896             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
897             Heps             = _mm256_mul_pd(vfeps,H);
898             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
899             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
900             vvdw6            = _mm256_mul_pd(c6_00,VV);
901             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
902             fvdw6            = _mm256_mul_pd(c6_00,FF);
903
904             /* CUBIC SPLINE TABLE REPULSION */
905             vfitab           = _mm_add_epi32(vfitab,ifour);
906             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
907             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
908             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
909             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
910             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
911             Heps             = _mm256_mul_pd(vfeps,H);
912             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
913             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
914             vvdw12           = _mm256_mul_pd(c12_00,VV);
915             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
916             fvdw12           = _mm256_mul_pd(c12_00,FF);
917             vvdw             = _mm256_add_pd(vvdw12,vvdw6);
918             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
919
920             /* Update potential sum for this i atom from the interaction with this j atom. */
921             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
922             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
923
924             fscal            = fvdw;
925
926             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
927
928             /* Calculate temporary vectorial force */
929             tx               = _mm256_mul_pd(fscal,dx00);
930             ty               = _mm256_mul_pd(fscal,dy00);
931             tz               = _mm256_mul_pd(fscal,dz00);
932
933             /* Update vectorial force */
934             fix0             = _mm256_add_pd(fix0,tx);
935             fiy0             = _mm256_add_pd(fiy0,ty);
936             fiz0             = _mm256_add_pd(fiz0,tz);
937
938             fjx0             = _mm256_add_pd(fjx0,tx);
939             fjy0             = _mm256_add_pd(fjy0,ty);
940             fjz0             = _mm256_add_pd(fjz0,tz);
941
942             /**************************
943              * CALCULATE INTERACTIONS *
944              **************************/
945
946             r11              = _mm256_mul_pd(rsq11,rinv11);
947             r11              = _mm256_andnot_pd(dummy_mask,r11);
948
949             /* Calculate table index by multiplying r with table scale and truncate to integer */
950             rt               = _mm256_mul_pd(r11,vftabscale);
951             vfitab           = _mm256_cvttpd_epi32(rt);
952             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
953             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
954
955             /* CUBIC SPLINE TABLE ELECTROSTATICS */
956             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
957             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
958             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
959             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
960             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
961             Heps             = _mm256_mul_pd(vfeps,H);
962             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
963             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
964             velec            = _mm256_mul_pd(qq11,VV);
965             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
966             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
967
968             /* Update potential sum for this i atom from the interaction with this j atom. */
969             velec            = _mm256_andnot_pd(dummy_mask,velec);
970             velecsum         = _mm256_add_pd(velecsum,velec);
971
972             fscal            = felec;
973
974             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
975
976             /* Calculate temporary vectorial force */
977             tx               = _mm256_mul_pd(fscal,dx11);
978             ty               = _mm256_mul_pd(fscal,dy11);
979             tz               = _mm256_mul_pd(fscal,dz11);
980
981             /* Update vectorial force */
982             fix1             = _mm256_add_pd(fix1,tx);
983             fiy1             = _mm256_add_pd(fiy1,ty);
984             fiz1             = _mm256_add_pd(fiz1,tz);
985
986             fjx1             = _mm256_add_pd(fjx1,tx);
987             fjy1             = _mm256_add_pd(fjy1,ty);
988             fjz1             = _mm256_add_pd(fjz1,tz);
989
990             /**************************
991              * CALCULATE INTERACTIONS *
992              **************************/
993
994             r12              = _mm256_mul_pd(rsq12,rinv12);
995             r12              = _mm256_andnot_pd(dummy_mask,r12);
996
997             /* Calculate table index by multiplying r with table scale and truncate to integer */
998             rt               = _mm256_mul_pd(r12,vftabscale);
999             vfitab           = _mm256_cvttpd_epi32(rt);
1000             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1001             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1002
1003             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1004             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1005             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1006             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1007             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1008             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1009             Heps             = _mm256_mul_pd(vfeps,H);
1010             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1011             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1012             velec            = _mm256_mul_pd(qq12,VV);
1013             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1014             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1015
1016             /* Update potential sum for this i atom from the interaction with this j atom. */
1017             velec            = _mm256_andnot_pd(dummy_mask,velec);
1018             velecsum         = _mm256_add_pd(velecsum,velec);
1019
1020             fscal            = felec;
1021
1022             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1023
1024             /* Calculate temporary vectorial force */
1025             tx               = _mm256_mul_pd(fscal,dx12);
1026             ty               = _mm256_mul_pd(fscal,dy12);
1027             tz               = _mm256_mul_pd(fscal,dz12);
1028
1029             /* Update vectorial force */
1030             fix1             = _mm256_add_pd(fix1,tx);
1031             fiy1             = _mm256_add_pd(fiy1,ty);
1032             fiz1             = _mm256_add_pd(fiz1,tz);
1033
1034             fjx2             = _mm256_add_pd(fjx2,tx);
1035             fjy2             = _mm256_add_pd(fjy2,ty);
1036             fjz2             = _mm256_add_pd(fjz2,tz);
1037
1038             /**************************
1039              * CALCULATE INTERACTIONS *
1040              **************************/
1041
1042             r13              = _mm256_mul_pd(rsq13,rinv13);
1043             r13              = _mm256_andnot_pd(dummy_mask,r13);
1044
1045             /* Calculate table index by multiplying r with table scale and truncate to integer */
1046             rt               = _mm256_mul_pd(r13,vftabscale);
1047             vfitab           = _mm256_cvttpd_epi32(rt);
1048             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1049             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1050
1051             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1052             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1053             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1054             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1055             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1056             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1057             Heps             = _mm256_mul_pd(vfeps,H);
1058             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1059             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1060             velec            = _mm256_mul_pd(qq13,VV);
1061             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1062             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
1063
1064             /* Update potential sum for this i atom from the interaction with this j atom. */
1065             velec            = _mm256_andnot_pd(dummy_mask,velec);
1066             velecsum         = _mm256_add_pd(velecsum,velec);
1067
1068             fscal            = felec;
1069
1070             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1071
1072             /* Calculate temporary vectorial force */
1073             tx               = _mm256_mul_pd(fscal,dx13);
1074             ty               = _mm256_mul_pd(fscal,dy13);
1075             tz               = _mm256_mul_pd(fscal,dz13);
1076
1077             /* Update vectorial force */
1078             fix1             = _mm256_add_pd(fix1,tx);
1079             fiy1             = _mm256_add_pd(fiy1,ty);
1080             fiz1             = _mm256_add_pd(fiz1,tz);
1081
1082             fjx3             = _mm256_add_pd(fjx3,tx);
1083             fjy3             = _mm256_add_pd(fjy3,ty);
1084             fjz3             = _mm256_add_pd(fjz3,tz);
1085
1086             /**************************
1087              * CALCULATE INTERACTIONS *
1088              **************************/
1089
1090             r21              = _mm256_mul_pd(rsq21,rinv21);
1091             r21              = _mm256_andnot_pd(dummy_mask,r21);
1092
1093             /* Calculate table index by multiplying r with table scale and truncate to integer */
1094             rt               = _mm256_mul_pd(r21,vftabscale);
1095             vfitab           = _mm256_cvttpd_epi32(rt);
1096             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1097             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1098
1099             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1100             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1101             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1102             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1103             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1104             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1105             Heps             = _mm256_mul_pd(vfeps,H);
1106             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1107             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1108             velec            = _mm256_mul_pd(qq21,VV);
1109             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1110             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1111
1112             /* Update potential sum for this i atom from the interaction with this j atom. */
1113             velec            = _mm256_andnot_pd(dummy_mask,velec);
1114             velecsum         = _mm256_add_pd(velecsum,velec);
1115
1116             fscal            = felec;
1117
1118             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1119
1120             /* Calculate temporary vectorial force */
1121             tx               = _mm256_mul_pd(fscal,dx21);
1122             ty               = _mm256_mul_pd(fscal,dy21);
1123             tz               = _mm256_mul_pd(fscal,dz21);
1124
1125             /* Update vectorial force */
1126             fix2             = _mm256_add_pd(fix2,tx);
1127             fiy2             = _mm256_add_pd(fiy2,ty);
1128             fiz2             = _mm256_add_pd(fiz2,tz);
1129
1130             fjx1             = _mm256_add_pd(fjx1,tx);
1131             fjy1             = _mm256_add_pd(fjy1,ty);
1132             fjz1             = _mm256_add_pd(fjz1,tz);
1133
1134             /**************************
1135              * CALCULATE INTERACTIONS *
1136              **************************/
1137
1138             r22              = _mm256_mul_pd(rsq22,rinv22);
1139             r22              = _mm256_andnot_pd(dummy_mask,r22);
1140
1141             /* Calculate table index by multiplying r with table scale and truncate to integer */
1142             rt               = _mm256_mul_pd(r22,vftabscale);
1143             vfitab           = _mm256_cvttpd_epi32(rt);
1144             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1145             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1146
1147             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1148             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1149             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1150             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1151             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1152             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1153             Heps             = _mm256_mul_pd(vfeps,H);
1154             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1155             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1156             velec            = _mm256_mul_pd(qq22,VV);
1157             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1158             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1159
1160             /* Update potential sum for this i atom from the interaction with this j atom. */
1161             velec            = _mm256_andnot_pd(dummy_mask,velec);
1162             velecsum         = _mm256_add_pd(velecsum,velec);
1163
1164             fscal            = felec;
1165
1166             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1167
1168             /* Calculate temporary vectorial force */
1169             tx               = _mm256_mul_pd(fscal,dx22);
1170             ty               = _mm256_mul_pd(fscal,dy22);
1171             tz               = _mm256_mul_pd(fscal,dz22);
1172
1173             /* Update vectorial force */
1174             fix2             = _mm256_add_pd(fix2,tx);
1175             fiy2             = _mm256_add_pd(fiy2,ty);
1176             fiz2             = _mm256_add_pd(fiz2,tz);
1177
1178             fjx2             = _mm256_add_pd(fjx2,tx);
1179             fjy2             = _mm256_add_pd(fjy2,ty);
1180             fjz2             = _mm256_add_pd(fjz2,tz);
1181
1182             /**************************
1183              * CALCULATE INTERACTIONS *
1184              **************************/
1185
1186             r23              = _mm256_mul_pd(rsq23,rinv23);
1187             r23              = _mm256_andnot_pd(dummy_mask,r23);
1188
1189             /* Calculate table index by multiplying r with table scale and truncate to integer */
1190             rt               = _mm256_mul_pd(r23,vftabscale);
1191             vfitab           = _mm256_cvttpd_epi32(rt);
1192             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1193             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1194
1195             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1196             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1197             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1198             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1199             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1200             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1201             Heps             = _mm256_mul_pd(vfeps,H);
1202             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1203             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1204             velec            = _mm256_mul_pd(qq23,VV);
1205             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1206             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
1207
1208             /* Update potential sum for this i atom from the interaction with this j atom. */
1209             velec            = _mm256_andnot_pd(dummy_mask,velec);
1210             velecsum         = _mm256_add_pd(velecsum,velec);
1211
1212             fscal            = felec;
1213
1214             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1215
1216             /* Calculate temporary vectorial force */
1217             tx               = _mm256_mul_pd(fscal,dx23);
1218             ty               = _mm256_mul_pd(fscal,dy23);
1219             tz               = _mm256_mul_pd(fscal,dz23);
1220
1221             /* Update vectorial force */
1222             fix2             = _mm256_add_pd(fix2,tx);
1223             fiy2             = _mm256_add_pd(fiy2,ty);
1224             fiz2             = _mm256_add_pd(fiz2,tz);
1225
1226             fjx3             = _mm256_add_pd(fjx3,tx);
1227             fjy3             = _mm256_add_pd(fjy3,ty);
1228             fjz3             = _mm256_add_pd(fjz3,tz);
1229
1230             /**************************
1231              * CALCULATE INTERACTIONS *
1232              **************************/
1233
1234             r31              = _mm256_mul_pd(rsq31,rinv31);
1235             r31              = _mm256_andnot_pd(dummy_mask,r31);
1236
1237             /* Calculate table index by multiplying r with table scale and truncate to integer */
1238             rt               = _mm256_mul_pd(r31,vftabscale);
1239             vfitab           = _mm256_cvttpd_epi32(rt);
1240             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1241             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1242
1243             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1244             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1245             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1246             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1247             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1248             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1249             Heps             = _mm256_mul_pd(vfeps,H);
1250             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1251             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1252             velec            = _mm256_mul_pd(qq31,VV);
1253             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1254             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
1255
1256             /* Update potential sum for this i atom from the interaction with this j atom. */
1257             velec            = _mm256_andnot_pd(dummy_mask,velec);
1258             velecsum         = _mm256_add_pd(velecsum,velec);
1259
1260             fscal            = felec;
1261
1262             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1263
1264             /* Calculate temporary vectorial force */
1265             tx               = _mm256_mul_pd(fscal,dx31);
1266             ty               = _mm256_mul_pd(fscal,dy31);
1267             tz               = _mm256_mul_pd(fscal,dz31);
1268
1269             /* Update vectorial force */
1270             fix3             = _mm256_add_pd(fix3,tx);
1271             fiy3             = _mm256_add_pd(fiy3,ty);
1272             fiz3             = _mm256_add_pd(fiz3,tz);
1273
1274             fjx1             = _mm256_add_pd(fjx1,tx);
1275             fjy1             = _mm256_add_pd(fjy1,ty);
1276             fjz1             = _mm256_add_pd(fjz1,tz);
1277
1278             /**************************
1279              * CALCULATE INTERACTIONS *
1280              **************************/
1281
1282             r32              = _mm256_mul_pd(rsq32,rinv32);
1283             r32              = _mm256_andnot_pd(dummy_mask,r32);
1284
1285             /* Calculate table index by multiplying r with table scale and truncate to integer */
1286             rt               = _mm256_mul_pd(r32,vftabscale);
1287             vfitab           = _mm256_cvttpd_epi32(rt);
1288             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1289             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1290
1291             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1292             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1293             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1294             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1295             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1296             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1297             Heps             = _mm256_mul_pd(vfeps,H);
1298             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1299             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1300             velec            = _mm256_mul_pd(qq32,VV);
1301             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1302             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
1303
1304             /* Update potential sum for this i atom from the interaction with this j atom. */
1305             velec            = _mm256_andnot_pd(dummy_mask,velec);
1306             velecsum         = _mm256_add_pd(velecsum,velec);
1307
1308             fscal            = felec;
1309
1310             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1311
1312             /* Calculate temporary vectorial force */
1313             tx               = _mm256_mul_pd(fscal,dx32);
1314             ty               = _mm256_mul_pd(fscal,dy32);
1315             tz               = _mm256_mul_pd(fscal,dz32);
1316
1317             /* Update vectorial force */
1318             fix3             = _mm256_add_pd(fix3,tx);
1319             fiy3             = _mm256_add_pd(fiy3,ty);
1320             fiz3             = _mm256_add_pd(fiz3,tz);
1321
1322             fjx2             = _mm256_add_pd(fjx2,tx);
1323             fjy2             = _mm256_add_pd(fjy2,ty);
1324             fjz2             = _mm256_add_pd(fjz2,tz);
1325
1326             /**************************
1327              * CALCULATE INTERACTIONS *
1328              **************************/
1329
1330             r33              = _mm256_mul_pd(rsq33,rinv33);
1331             r33              = _mm256_andnot_pd(dummy_mask,r33);
1332
1333             /* Calculate table index by multiplying r with table scale and truncate to integer */
1334             rt               = _mm256_mul_pd(r33,vftabscale);
1335             vfitab           = _mm256_cvttpd_epi32(rt);
1336             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1337             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1338
1339             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1340             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1341             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1342             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1343             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1344             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1345             Heps             = _mm256_mul_pd(vfeps,H);
1346             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1347             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1348             velec            = _mm256_mul_pd(qq33,VV);
1349             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1350             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
1351
1352             /* Update potential sum for this i atom from the interaction with this j atom. */
1353             velec            = _mm256_andnot_pd(dummy_mask,velec);
1354             velecsum         = _mm256_add_pd(velecsum,velec);
1355
1356             fscal            = felec;
1357
1358             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1359
1360             /* Calculate temporary vectorial force */
1361             tx               = _mm256_mul_pd(fscal,dx33);
1362             ty               = _mm256_mul_pd(fscal,dy33);
1363             tz               = _mm256_mul_pd(fscal,dz33);
1364
1365             /* Update vectorial force */
1366             fix3             = _mm256_add_pd(fix3,tx);
1367             fiy3             = _mm256_add_pd(fiy3,ty);
1368             fiz3             = _mm256_add_pd(fiz3,tz);
1369
1370             fjx3             = _mm256_add_pd(fjx3,tx);
1371             fjy3             = _mm256_add_pd(fjy3,ty);
1372             fjz3             = _mm256_add_pd(fjz3,tz);
1373
1374             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1375             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1376             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1377             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1378
1379             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1380                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1381                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1382
1383             /* Inner loop uses 456 flops */
1384         }
1385
1386         /* End of innermost loop */
1387
1388         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1389                                                  f+i_coord_offset,fshift+i_shift_offset);
1390
1391         ggid                        = gid[iidx];
1392         /* Update potential energies */
1393         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1394         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1395
1396         /* Increment number of inner iterations */
1397         inneriter                  += j_index_end - j_index_start;
1398
1399         /* Outer loop uses 26 flops */
1400     }
1401
1402     /* Increment number of outer iterations */
1403     outeriter        += nri;
1404
1405     /* Update outer/inner flops */
1406
1407     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*456);
1408 }
1409 /*
1410  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_double
1411  * Electrostatics interaction: CubicSplineTable
1412  * VdW interaction:            CubicSplineTable
1413  * Geometry:                   Water4-Water4
1414  * Calculate force/pot:        Force
1415  */
1416 void
1417 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_double
1418                     (t_nblist                    * gmx_restrict       nlist,
1419                      rvec                        * gmx_restrict          xx,
1420                      rvec                        * gmx_restrict          ff,
1421                      struct t_forcerec           * gmx_restrict          fr,
1422                      t_mdatoms                   * gmx_restrict     mdatoms,
1423                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1424                      t_nrnb                      * gmx_restrict        nrnb)
1425 {
1426     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1427      * just 0 for non-waters.
1428      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1429      * jnr indices corresponding to data put in the four positions in the SIMD register.
1430      */
1431     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1432     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1433     int              jnrA,jnrB,jnrC,jnrD;
1434     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1435     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1436     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1437     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1438     real             rcutoff_scalar;
1439     real             *shiftvec,*fshift,*x,*f;
1440     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1441     real             scratch[4*DIM];
1442     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1443     real *           vdwioffsetptr0;
1444     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1445     real *           vdwioffsetptr1;
1446     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1447     real *           vdwioffsetptr2;
1448     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1449     real *           vdwioffsetptr3;
1450     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1451     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1452     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1453     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1454     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1455     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1456     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1457     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1458     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1459     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1460     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1461     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1462     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1463     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1464     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1465     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1466     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1467     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1468     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1469     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1470     real             *charge;
1471     int              nvdwtype;
1472     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1473     int              *vdwtype;
1474     real             *vdwparam;
1475     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1476     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1477     __m128i          vfitab;
1478     __m128i          ifour       = _mm_set1_epi32(4);
1479     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1480     real             *vftab;
1481     __m256d          dummy_mask,cutoff_mask;
1482     __m128           tmpmask0,tmpmask1;
1483     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1484     __m256d          one     = _mm256_set1_pd(1.0);
1485     __m256d          two     = _mm256_set1_pd(2.0);
1486     x                = xx[0];
1487     f                = ff[0];
1488
1489     nri              = nlist->nri;
1490     iinr             = nlist->iinr;
1491     jindex           = nlist->jindex;
1492     jjnr             = nlist->jjnr;
1493     shiftidx         = nlist->shift;
1494     gid              = nlist->gid;
1495     shiftvec         = fr->shift_vec[0];
1496     fshift           = fr->fshift[0];
1497     facel            = _mm256_set1_pd(fr->ic->epsfac);
1498     charge           = mdatoms->chargeA;
1499     nvdwtype         = fr->ntype;
1500     vdwparam         = fr->nbfp;
1501     vdwtype          = mdatoms->typeA;
1502
1503     vftab            = kernel_data->table_elec_vdw->data;
1504     vftabscale       = _mm256_set1_pd(kernel_data->table_elec_vdw->scale);
1505
1506     /* Setup water-specific parameters */
1507     inr              = nlist->iinr[0];
1508     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1509     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1510     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1511     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1512
1513     jq1              = _mm256_set1_pd(charge[inr+1]);
1514     jq2              = _mm256_set1_pd(charge[inr+2]);
1515     jq3              = _mm256_set1_pd(charge[inr+3]);
1516     vdwjidx0A        = 2*vdwtype[inr+0];
1517     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1518     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1519     qq11             = _mm256_mul_pd(iq1,jq1);
1520     qq12             = _mm256_mul_pd(iq1,jq2);
1521     qq13             = _mm256_mul_pd(iq1,jq3);
1522     qq21             = _mm256_mul_pd(iq2,jq1);
1523     qq22             = _mm256_mul_pd(iq2,jq2);
1524     qq23             = _mm256_mul_pd(iq2,jq3);
1525     qq31             = _mm256_mul_pd(iq3,jq1);
1526     qq32             = _mm256_mul_pd(iq3,jq2);
1527     qq33             = _mm256_mul_pd(iq3,jq3);
1528
1529     /* Avoid stupid compiler warnings */
1530     jnrA = jnrB = jnrC = jnrD = 0;
1531     j_coord_offsetA = 0;
1532     j_coord_offsetB = 0;
1533     j_coord_offsetC = 0;
1534     j_coord_offsetD = 0;
1535
1536     outeriter        = 0;
1537     inneriter        = 0;
1538
1539     for(iidx=0;iidx<4*DIM;iidx++)
1540     {
1541         scratch[iidx] = 0.0;
1542     }
1543
1544     /* Start outer loop over neighborlists */
1545     for(iidx=0; iidx<nri; iidx++)
1546     {
1547         /* Load shift vector for this list */
1548         i_shift_offset   = DIM*shiftidx[iidx];
1549
1550         /* Load limits for loop over neighbors */
1551         j_index_start    = jindex[iidx];
1552         j_index_end      = jindex[iidx+1];
1553
1554         /* Get outer coordinate index */
1555         inr              = iinr[iidx];
1556         i_coord_offset   = DIM*inr;
1557
1558         /* Load i particle coords and add shift vector */
1559         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1560                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1561
1562         fix0             = _mm256_setzero_pd();
1563         fiy0             = _mm256_setzero_pd();
1564         fiz0             = _mm256_setzero_pd();
1565         fix1             = _mm256_setzero_pd();
1566         fiy1             = _mm256_setzero_pd();
1567         fiz1             = _mm256_setzero_pd();
1568         fix2             = _mm256_setzero_pd();
1569         fiy2             = _mm256_setzero_pd();
1570         fiz2             = _mm256_setzero_pd();
1571         fix3             = _mm256_setzero_pd();
1572         fiy3             = _mm256_setzero_pd();
1573         fiz3             = _mm256_setzero_pd();
1574
1575         /* Start inner kernel loop */
1576         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1577         {
1578
1579             /* Get j neighbor index, and coordinate index */
1580             jnrA             = jjnr[jidx];
1581             jnrB             = jjnr[jidx+1];
1582             jnrC             = jjnr[jidx+2];
1583             jnrD             = jjnr[jidx+3];
1584             j_coord_offsetA  = DIM*jnrA;
1585             j_coord_offsetB  = DIM*jnrB;
1586             j_coord_offsetC  = DIM*jnrC;
1587             j_coord_offsetD  = DIM*jnrD;
1588
1589             /* load j atom coordinates */
1590             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1591                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1592                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1593                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1594
1595             /* Calculate displacement vector */
1596             dx00             = _mm256_sub_pd(ix0,jx0);
1597             dy00             = _mm256_sub_pd(iy0,jy0);
1598             dz00             = _mm256_sub_pd(iz0,jz0);
1599             dx11             = _mm256_sub_pd(ix1,jx1);
1600             dy11             = _mm256_sub_pd(iy1,jy1);
1601             dz11             = _mm256_sub_pd(iz1,jz1);
1602             dx12             = _mm256_sub_pd(ix1,jx2);
1603             dy12             = _mm256_sub_pd(iy1,jy2);
1604             dz12             = _mm256_sub_pd(iz1,jz2);
1605             dx13             = _mm256_sub_pd(ix1,jx3);
1606             dy13             = _mm256_sub_pd(iy1,jy3);
1607             dz13             = _mm256_sub_pd(iz1,jz3);
1608             dx21             = _mm256_sub_pd(ix2,jx1);
1609             dy21             = _mm256_sub_pd(iy2,jy1);
1610             dz21             = _mm256_sub_pd(iz2,jz1);
1611             dx22             = _mm256_sub_pd(ix2,jx2);
1612             dy22             = _mm256_sub_pd(iy2,jy2);
1613             dz22             = _mm256_sub_pd(iz2,jz2);
1614             dx23             = _mm256_sub_pd(ix2,jx3);
1615             dy23             = _mm256_sub_pd(iy2,jy3);
1616             dz23             = _mm256_sub_pd(iz2,jz3);
1617             dx31             = _mm256_sub_pd(ix3,jx1);
1618             dy31             = _mm256_sub_pd(iy3,jy1);
1619             dz31             = _mm256_sub_pd(iz3,jz1);
1620             dx32             = _mm256_sub_pd(ix3,jx2);
1621             dy32             = _mm256_sub_pd(iy3,jy2);
1622             dz32             = _mm256_sub_pd(iz3,jz2);
1623             dx33             = _mm256_sub_pd(ix3,jx3);
1624             dy33             = _mm256_sub_pd(iy3,jy3);
1625             dz33             = _mm256_sub_pd(iz3,jz3);
1626
1627             /* Calculate squared distance and things based on it */
1628             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1629             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1630             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1631             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1632             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1633             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1634             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1635             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1636             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1637             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1638
1639             rinv00           = avx256_invsqrt_d(rsq00);
1640             rinv11           = avx256_invsqrt_d(rsq11);
1641             rinv12           = avx256_invsqrt_d(rsq12);
1642             rinv13           = avx256_invsqrt_d(rsq13);
1643             rinv21           = avx256_invsqrt_d(rsq21);
1644             rinv22           = avx256_invsqrt_d(rsq22);
1645             rinv23           = avx256_invsqrt_d(rsq23);
1646             rinv31           = avx256_invsqrt_d(rsq31);
1647             rinv32           = avx256_invsqrt_d(rsq32);
1648             rinv33           = avx256_invsqrt_d(rsq33);
1649
1650             fjx0             = _mm256_setzero_pd();
1651             fjy0             = _mm256_setzero_pd();
1652             fjz0             = _mm256_setzero_pd();
1653             fjx1             = _mm256_setzero_pd();
1654             fjy1             = _mm256_setzero_pd();
1655             fjz1             = _mm256_setzero_pd();
1656             fjx2             = _mm256_setzero_pd();
1657             fjy2             = _mm256_setzero_pd();
1658             fjz2             = _mm256_setzero_pd();
1659             fjx3             = _mm256_setzero_pd();
1660             fjy3             = _mm256_setzero_pd();
1661             fjz3             = _mm256_setzero_pd();
1662
1663             /**************************
1664              * CALCULATE INTERACTIONS *
1665              **************************/
1666
1667             r00              = _mm256_mul_pd(rsq00,rinv00);
1668
1669             /* Calculate table index by multiplying r with table scale and truncate to integer */
1670             rt               = _mm256_mul_pd(r00,vftabscale);
1671             vfitab           = _mm256_cvttpd_epi32(rt);
1672             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1673             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1674
1675             /* CUBIC SPLINE TABLE DISPERSION */
1676             vfitab           = _mm_add_epi32(vfitab,ifour);
1677             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1678             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1679             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1680             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1681             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1682             Heps             = _mm256_mul_pd(vfeps,H);
1683             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1684             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1685             fvdw6            = _mm256_mul_pd(c6_00,FF);
1686
1687             /* CUBIC SPLINE TABLE REPULSION */
1688             vfitab           = _mm_add_epi32(vfitab,ifour);
1689             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1690             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1691             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1692             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1693             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1694             Heps             = _mm256_mul_pd(vfeps,H);
1695             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1696             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1697             fvdw12           = _mm256_mul_pd(c12_00,FF);
1698             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1699
1700             fscal            = fvdw;
1701
1702             /* Calculate temporary vectorial force */
1703             tx               = _mm256_mul_pd(fscal,dx00);
1704             ty               = _mm256_mul_pd(fscal,dy00);
1705             tz               = _mm256_mul_pd(fscal,dz00);
1706
1707             /* Update vectorial force */
1708             fix0             = _mm256_add_pd(fix0,tx);
1709             fiy0             = _mm256_add_pd(fiy0,ty);
1710             fiz0             = _mm256_add_pd(fiz0,tz);
1711
1712             fjx0             = _mm256_add_pd(fjx0,tx);
1713             fjy0             = _mm256_add_pd(fjy0,ty);
1714             fjz0             = _mm256_add_pd(fjz0,tz);
1715
1716             /**************************
1717              * CALCULATE INTERACTIONS *
1718              **************************/
1719
1720             r11              = _mm256_mul_pd(rsq11,rinv11);
1721
1722             /* Calculate table index by multiplying r with table scale and truncate to integer */
1723             rt               = _mm256_mul_pd(r11,vftabscale);
1724             vfitab           = _mm256_cvttpd_epi32(rt);
1725             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1726             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1727
1728             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1729             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1730             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1731             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1732             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1733             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1734             Heps             = _mm256_mul_pd(vfeps,H);
1735             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1736             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1737             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1738
1739             fscal            = felec;
1740
1741             /* Calculate temporary vectorial force */
1742             tx               = _mm256_mul_pd(fscal,dx11);
1743             ty               = _mm256_mul_pd(fscal,dy11);
1744             tz               = _mm256_mul_pd(fscal,dz11);
1745
1746             /* Update vectorial force */
1747             fix1             = _mm256_add_pd(fix1,tx);
1748             fiy1             = _mm256_add_pd(fiy1,ty);
1749             fiz1             = _mm256_add_pd(fiz1,tz);
1750
1751             fjx1             = _mm256_add_pd(fjx1,tx);
1752             fjy1             = _mm256_add_pd(fjy1,ty);
1753             fjz1             = _mm256_add_pd(fjz1,tz);
1754
1755             /**************************
1756              * CALCULATE INTERACTIONS *
1757              **************************/
1758
1759             r12              = _mm256_mul_pd(rsq12,rinv12);
1760
1761             /* Calculate table index by multiplying r with table scale and truncate to integer */
1762             rt               = _mm256_mul_pd(r12,vftabscale);
1763             vfitab           = _mm256_cvttpd_epi32(rt);
1764             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1765             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1766
1767             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1768             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1769             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1770             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1771             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1772             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1773             Heps             = _mm256_mul_pd(vfeps,H);
1774             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1775             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1776             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1777
1778             fscal            = felec;
1779
1780             /* Calculate temporary vectorial force */
1781             tx               = _mm256_mul_pd(fscal,dx12);
1782             ty               = _mm256_mul_pd(fscal,dy12);
1783             tz               = _mm256_mul_pd(fscal,dz12);
1784
1785             /* Update vectorial force */
1786             fix1             = _mm256_add_pd(fix1,tx);
1787             fiy1             = _mm256_add_pd(fiy1,ty);
1788             fiz1             = _mm256_add_pd(fiz1,tz);
1789
1790             fjx2             = _mm256_add_pd(fjx2,tx);
1791             fjy2             = _mm256_add_pd(fjy2,ty);
1792             fjz2             = _mm256_add_pd(fjz2,tz);
1793
1794             /**************************
1795              * CALCULATE INTERACTIONS *
1796              **************************/
1797
1798             r13              = _mm256_mul_pd(rsq13,rinv13);
1799
1800             /* Calculate table index by multiplying r with table scale and truncate to integer */
1801             rt               = _mm256_mul_pd(r13,vftabscale);
1802             vfitab           = _mm256_cvttpd_epi32(rt);
1803             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1804             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1805
1806             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1807             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1808             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1809             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1810             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1811             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1812             Heps             = _mm256_mul_pd(vfeps,H);
1813             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1814             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1815             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
1816
1817             fscal            = felec;
1818
1819             /* Calculate temporary vectorial force */
1820             tx               = _mm256_mul_pd(fscal,dx13);
1821             ty               = _mm256_mul_pd(fscal,dy13);
1822             tz               = _mm256_mul_pd(fscal,dz13);
1823
1824             /* Update vectorial force */
1825             fix1             = _mm256_add_pd(fix1,tx);
1826             fiy1             = _mm256_add_pd(fiy1,ty);
1827             fiz1             = _mm256_add_pd(fiz1,tz);
1828
1829             fjx3             = _mm256_add_pd(fjx3,tx);
1830             fjy3             = _mm256_add_pd(fjy3,ty);
1831             fjz3             = _mm256_add_pd(fjz3,tz);
1832
1833             /**************************
1834              * CALCULATE INTERACTIONS *
1835              **************************/
1836
1837             r21              = _mm256_mul_pd(rsq21,rinv21);
1838
1839             /* Calculate table index by multiplying r with table scale and truncate to integer */
1840             rt               = _mm256_mul_pd(r21,vftabscale);
1841             vfitab           = _mm256_cvttpd_epi32(rt);
1842             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1843             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1844
1845             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1846             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1847             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1848             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1849             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1850             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1851             Heps             = _mm256_mul_pd(vfeps,H);
1852             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1853             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1854             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1855
1856             fscal            = felec;
1857
1858             /* Calculate temporary vectorial force */
1859             tx               = _mm256_mul_pd(fscal,dx21);
1860             ty               = _mm256_mul_pd(fscal,dy21);
1861             tz               = _mm256_mul_pd(fscal,dz21);
1862
1863             /* Update vectorial force */
1864             fix2             = _mm256_add_pd(fix2,tx);
1865             fiy2             = _mm256_add_pd(fiy2,ty);
1866             fiz2             = _mm256_add_pd(fiz2,tz);
1867
1868             fjx1             = _mm256_add_pd(fjx1,tx);
1869             fjy1             = _mm256_add_pd(fjy1,ty);
1870             fjz1             = _mm256_add_pd(fjz1,tz);
1871
1872             /**************************
1873              * CALCULATE INTERACTIONS *
1874              **************************/
1875
1876             r22              = _mm256_mul_pd(rsq22,rinv22);
1877
1878             /* Calculate table index by multiplying r with table scale and truncate to integer */
1879             rt               = _mm256_mul_pd(r22,vftabscale);
1880             vfitab           = _mm256_cvttpd_epi32(rt);
1881             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1882             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1883
1884             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1885             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1886             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1887             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1888             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1889             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1890             Heps             = _mm256_mul_pd(vfeps,H);
1891             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1892             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1893             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1894
1895             fscal            = felec;
1896
1897             /* Calculate temporary vectorial force */
1898             tx               = _mm256_mul_pd(fscal,dx22);
1899             ty               = _mm256_mul_pd(fscal,dy22);
1900             tz               = _mm256_mul_pd(fscal,dz22);
1901
1902             /* Update vectorial force */
1903             fix2             = _mm256_add_pd(fix2,tx);
1904             fiy2             = _mm256_add_pd(fiy2,ty);
1905             fiz2             = _mm256_add_pd(fiz2,tz);
1906
1907             fjx2             = _mm256_add_pd(fjx2,tx);
1908             fjy2             = _mm256_add_pd(fjy2,ty);
1909             fjz2             = _mm256_add_pd(fjz2,tz);
1910
1911             /**************************
1912              * CALCULATE INTERACTIONS *
1913              **************************/
1914
1915             r23              = _mm256_mul_pd(rsq23,rinv23);
1916
1917             /* Calculate table index by multiplying r with table scale and truncate to integer */
1918             rt               = _mm256_mul_pd(r23,vftabscale);
1919             vfitab           = _mm256_cvttpd_epi32(rt);
1920             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1921             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1922
1923             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1924             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1925             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1926             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1927             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1928             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1929             Heps             = _mm256_mul_pd(vfeps,H);
1930             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1931             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1932             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
1933
1934             fscal            = felec;
1935
1936             /* Calculate temporary vectorial force */
1937             tx               = _mm256_mul_pd(fscal,dx23);
1938             ty               = _mm256_mul_pd(fscal,dy23);
1939             tz               = _mm256_mul_pd(fscal,dz23);
1940
1941             /* Update vectorial force */
1942             fix2             = _mm256_add_pd(fix2,tx);
1943             fiy2             = _mm256_add_pd(fiy2,ty);
1944             fiz2             = _mm256_add_pd(fiz2,tz);
1945
1946             fjx3             = _mm256_add_pd(fjx3,tx);
1947             fjy3             = _mm256_add_pd(fjy3,ty);
1948             fjz3             = _mm256_add_pd(fjz3,tz);
1949
1950             /**************************
1951              * CALCULATE INTERACTIONS *
1952              **************************/
1953
1954             r31              = _mm256_mul_pd(rsq31,rinv31);
1955
1956             /* Calculate table index by multiplying r with table scale and truncate to integer */
1957             rt               = _mm256_mul_pd(r31,vftabscale);
1958             vfitab           = _mm256_cvttpd_epi32(rt);
1959             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1960             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1961
1962             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1963             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1964             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1965             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1966             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1967             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1968             Heps             = _mm256_mul_pd(vfeps,H);
1969             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1970             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1971             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
1972
1973             fscal            = felec;
1974
1975             /* Calculate temporary vectorial force */
1976             tx               = _mm256_mul_pd(fscal,dx31);
1977             ty               = _mm256_mul_pd(fscal,dy31);
1978             tz               = _mm256_mul_pd(fscal,dz31);
1979
1980             /* Update vectorial force */
1981             fix3             = _mm256_add_pd(fix3,tx);
1982             fiy3             = _mm256_add_pd(fiy3,ty);
1983             fiz3             = _mm256_add_pd(fiz3,tz);
1984
1985             fjx1             = _mm256_add_pd(fjx1,tx);
1986             fjy1             = _mm256_add_pd(fjy1,ty);
1987             fjz1             = _mm256_add_pd(fjz1,tz);
1988
1989             /**************************
1990              * CALCULATE INTERACTIONS *
1991              **************************/
1992
1993             r32              = _mm256_mul_pd(rsq32,rinv32);
1994
1995             /* Calculate table index by multiplying r with table scale and truncate to integer */
1996             rt               = _mm256_mul_pd(r32,vftabscale);
1997             vfitab           = _mm256_cvttpd_epi32(rt);
1998             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1999             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2000
2001             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2002             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2003             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2004             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2005             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2006             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2007             Heps             = _mm256_mul_pd(vfeps,H);
2008             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2009             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2010             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
2011
2012             fscal            = felec;
2013
2014             /* Calculate temporary vectorial force */
2015             tx               = _mm256_mul_pd(fscal,dx32);
2016             ty               = _mm256_mul_pd(fscal,dy32);
2017             tz               = _mm256_mul_pd(fscal,dz32);
2018
2019             /* Update vectorial force */
2020             fix3             = _mm256_add_pd(fix3,tx);
2021             fiy3             = _mm256_add_pd(fiy3,ty);
2022             fiz3             = _mm256_add_pd(fiz3,tz);
2023
2024             fjx2             = _mm256_add_pd(fjx2,tx);
2025             fjy2             = _mm256_add_pd(fjy2,ty);
2026             fjz2             = _mm256_add_pd(fjz2,tz);
2027
2028             /**************************
2029              * CALCULATE INTERACTIONS *
2030              **************************/
2031
2032             r33              = _mm256_mul_pd(rsq33,rinv33);
2033
2034             /* Calculate table index by multiplying r with table scale and truncate to integer */
2035             rt               = _mm256_mul_pd(r33,vftabscale);
2036             vfitab           = _mm256_cvttpd_epi32(rt);
2037             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2038             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2039
2040             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2041             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2042             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2043             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2044             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2045             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2046             Heps             = _mm256_mul_pd(vfeps,H);
2047             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2048             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2049             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
2050
2051             fscal            = felec;
2052
2053             /* Calculate temporary vectorial force */
2054             tx               = _mm256_mul_pd(fscal,dx33);
2055             ty               = _mm256_mul_pd(fscal,dy33);
2056             tz               = _mm256_mul_pd(fscal,dz33);
2057
2058             /* Update vectorial force */
2059             fix3             = _mm256_add_pd(fix3,tx);
2060             fiy3             = _mm256_add_pd(fiy3,ty);
2061             fiz3             = _mm256_add_pd(fiz3,tz);
2062
2063             fjx3             = _mm256_add_pd(fjx3,tx);
2064             fjy3             = _mm256_add_pd(fjy3,ty);
2065             fjz3             = _mm256_add_pd(fjz3,tz);
2066
2067             fjptrA             = f+j_coord_offsetA;
2068             fjptrB             = f+j_coord_offsetB;
2069             fjptrC             = f+j_coord_offsetC;
2070             fjptrD             = f+j_coord_offsetD;
2071
2072             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2073                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2074                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2075
2076             /* Inner loop uses 402 flops */
2077         }
2078
2079         if(jidx<j_index_end)
2080         {
2081
2082             /* Get j neighbor index, and coordinate index */
2083             jnrlistA         = jjnr[jidx];
2084             jnrlistB         = jjnr[jidx+1];
2085             jnrlistC         = jjnr[jidx+2];
2086             jnrlistD         = jjnr[jidx+3];
2087             /* Sign of each element will be negative for non-real atoms.
2088              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2089              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2090              */
2091             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2092
2093             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2094             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2095             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2096
2097             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2098             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2099             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2100             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2101             j_coord_offsetA  = DIM*jnrA;
2102             j_coord_offsetB  = DIM*jnrB;
2103             j_coord_offsetC  = DIM*jnrC;
2104             j_coord_offsetD  = DIM*jnrD;
2105
2106             /* load j atom coordinates */
2107             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2108                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2109                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2110                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
2111
2112             /* Calculate displacement vector */
2113             dx00             = _mm256_sub_pd(ix0,jx0);
2114             dy00             = _mm256_sub_pd(iy0,jy0);
2115             dz00             = _mm256_sub_pd(iz0,jz0);
2116             dx11             = _mm256_sub_pd(ix1,jx1);
2117             dy11             = _mm256_sub_pd(iy1,jy1);
2118             dz11             = _mm256_sub_pd(iz1,jz1);
2119             dx12             = _mm256_sub_pd(ix1,jx2);
2120             dy12             = _mm256_sub_pd(iy1,jy2);
2121             dz12             = _mm256_sub_pd(iz1,jz2);
2122             dx13             = _mm256_sub_pd(ix1,jx3);
2123             dy13             = _mm256_sub_pd(iy1,jy3);
2124             dz13             = _mm256_sub_pd(iz1,jz3);
2125             dx21             = _mm256_sub_pd(ix2,jx1);
2126             dy21             = _mm256_sub_pd(iy2,jy1);
2127             dz21             = _mm256_sub_pd(iz2,jz1);
2128             dx22             = _mm256_sub_pd(ix2,jx2);
2129             dy22             = _mm256_sub_pd(iy2,jy2);
2130             dz22             = _mm256_sub_pd(iz2,jz2);
2131             dx23             = _mm256_sub_pd(ix2,jx3);
2132             dy23             = _mm256_sub_pd(iy2,jy3);
2133             dz23             = _mm256_sub_pd(iz2,jz3);
2134             dx31             = _mm256_sub_pd(ix3,jx1);
2135             dy31             = _mm256_sub_pd(iy3,jy1);
2136             dz31             = _mm256_sub_pd(iz3,jz1);
2137             dx32             = _mm256_sub_pd(ix3,jx2);
2138             dy32             = _mm256_sub_pd(iy3,jy2);
2139             dz32             = _mm256_sub_pd(iz3,jz2);
2140             dx33             = _mm256_sub_pd(ix3,jx3);
2141             dy33             = _mm256_sub_pd(iy3,jy3);
2142             dz33             = _mm256_sub_pd(iz3,jz3);
2143
2144             /* Calculate squared distance and things based on it */
2145             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2146             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2147             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2148             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2149             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2150             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2151             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2152             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2153             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2154             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2155
2156             rinv00           = avx256_invsqrt_d(rsq00);
2157             rinv11           = avx256_invsqrt_d(rsq11);
2158             rinv12           = avx256_invsqrt_d(rsq12);
2159             rinv13           = avx256_invsqrt_d(rsq13);
2160             rinv21           = avx256_invsqrt_d(rsq21);
2161             rinv22           = avx256_invsqrt_d(rsq22);
2162             rinv23           = avx256_invsqrt_d(rsq23);
2163             rinv31           = avx256_invsqrt_d(rsq31);
2164             rinv32           = avx256_invsqrt_d(rsq32);
2165             rinv33           = avx256_invsqrt_d(rsq33);
2166
2167             fjx0             = _mm256_setzero_pd();
2168             fjy0             = _mm256_setzero_pd();
2169             fjz0             = _mm256_setzero_pd();
2170             fjx1             = _mm256_setzero_pd();
2171             fjy1             = _mm256_setzero_pd();
2172             fjz1             = _mm256_setzero_pd();
2173             fjx2             = _mm256_setzero_pd();
2174             fjy2             = _mm256_setzero_pd();
2175             fjz2             = _mm256_setzero_pd();
2176             fjx3             = _mm256_setzero_pd();
2177             fjy3             = _mm256_setzero_pd();
2178             fjz3             = _mm256_setzero_pd();
2179
2180             /**************************
2181              * CALCULATE INTERACTIONS *
2182              **************************/
2183
2184             r00              = _mm256_mul_pd(rsq00,rinv00);
2185             r00              = _mm256_andnot_pd(dummy_mask,r00);
2186
2187             /* Calculate table index by multiplying r with table scale and truncate to integer */
2188             rt               = _mm256_mul_pd(r00,vftabscale);
2189             vfitab           = _mm256_cvttpd_epi32(rt);
2190             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2191             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2192
2193             /* CUBIC SPLINE TABLE DISPERSION */
2194             vfitab           = _mm_add_epi32(vfitab,ifour);
2195             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2196             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2197             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2198             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2199             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2200             Heps             = _mm256_mul_pd(vfeps,H);
2201             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2202             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2203             fvdw6            = _mm256_mul_pd(c6_00,FF);
2204
2205             /* CUBIC SPLINE TABLE REPULSION */
2206             vfitab           = _mm_add_epi32(vfitab,ifour);
2207             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2208             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2209             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2210             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2211             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2212             Heps             = _mm256_mul_pd(vfeps,H);
2213             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2214             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2215             fvdw12           = _mm256_mul_pd(c12_00,FF);
2216             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
2217
2218             fscal            = fvdw;
2219
2220             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2221
2222             /* Calculate temporary vectorial force */
2223             tx               = _mm256_mul_pd(fscal,dx00);
2224             ty               = _mm256_mul_pd(fscal,dy00);
2225             tz               = _mm256_mul_pd(fscal,dz00);
2226
2227             /* Update vectorial force */
2228             fix0             = _mm256_add_pd(fix0,tx);
2229             fiy0             = _mm256_add_pd(fiy0,ty);
2230             fiz0             = _mm256_add_pd(fiz0,tz);
2231
2232             fjx0             = _mm256_add_pd(fjx0,tx);
2233             fjy0             = _mm256_add_pd(fjy0,ty);
2234             fjz0             = _mm256_add_pd(fjz0,tz);
2235
2236             /**************************
2237              * CALCULATE INTERACTIONS *
2238              **************************/
2239
2240             r11              = _mm256_mul_pd(rsq11,rinv11);
2241             r11              = _mm256_andnot_pd(dummy_mask,r11);
2242
2243             /* Calculate table index by multiplying r with table scale and truncate to integer */
2244             rt               = _mm256_mul_pd(r11,vftabscale);
2245             vfitab           = _mm256_cvttpd_epi32(rt);
2246             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2247             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2248
2249             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2250             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2251             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2252             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2253             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2254             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2255             Heps             = _mm256_mul_pd(vfeps,H);
2256             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2257             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2258             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
2259
2260             fscal            = felec;
2261
2262             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2263
2264             /* Calculate temporary vectorial force */
2265             tx               = _mm256_mul_pd(fscal,dx11);
2266             ty               = _mm256_mul_pd(fscal,dy11);
2267             tz               = _mm256_mul_pd(fscal,dz11);
2268
2269             /* Update vectorial force */
2270             fix1             = _mm256_add_pd(fix1,tx);
2271             fiy1             = _mm256_add_pd(fiy1,ty);
2272             fiz1             = _mm256_add_pd(fiz1,tz);
2273
2274             fjx1             = _mm256_add_pd(fjx1,tx);
2275             fjy1             = _mm256_add_pd(fjy1,ty);
2276             fjz1             = _mm256_add_pd(fjz1,tz);
2277
2278             /**************************
2279              * CALCULATE INTERACTIONS *
2280              **************************/
2281
2282             r12              = _mm256_mul_pd(rsq12,rinv12);
2283             r12              = _mm256_andnot_pd(dummy_mask,r12);
2284
2285             /* Calculate table index by multiplying r with table scale and truncate to integer */
2286             rt               = _mm256_mul_pd(r12,vftabscale);
2287             vfitab           = _mm256_cvttpd_epi32(rt);
2288             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2289             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2290
2291             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2292             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2293             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2294             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2295             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2296             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2297             Heps             = _mm256_mul_pd(vfeps,H);
2298             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2299             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2300             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
2301
2302             fscal            = felec;
2303
2304             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2305
2306             /* Calculate temporary vectorial force */
2307             tx               = _mm256_mul_pd(fscal,dx12);
2308             ty               = _mm256_mul_pd(fscal,dy12);
2309             tz               = _mm256_mul_pd(fscal,dz12);
2310
2311             /* Update vectorial force */
2312             fix1             = _mm256_add_pd(fix1,tx);
2313             fiy1             = _mm256_add_pd(fiy1,ty);
2314             fiz1             = _mm256_add_pd(fiz1,tz);
2315
2316             fjx2             = _mm256_add_pd(fjx2,tx);
2317             fjy2             = _mm256_add_pd(fjy2,ty);
2318             fjz2             = _mm256_add_pd(fjz2,tz);
2319
2320             /**************************
2321              * CALCULATE INTERACTIONS *
2322              **************************/
2323
2324             r13              = _mm256_mul_pd(rsq13,rinv13);
2325             r13              = _mm256_andnot_pd(dummy_mask,r13);
2326
2327             /* Calculate table index by multiplying r with table scale and truncate to integer */
2328             rt               = _mm256_mul_pd(r13,vftabscale);
2329             vfitab           = _mm256_cvttpd_epi32(rt);
2330             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2331             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2332
2333             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2334             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2335             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2336             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2337             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2338             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2339             Heps             = _mm256_mul_pd(vfeps,H);
2340             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2341             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2342             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
2343
2344             fscal            = felec;
2345
2346             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2347
2348             /* Calculate temporary vectorial force */
2349             tx               = _mm256_mul_pd(fscal,dx13);
2350             ty               = _mm256_mul_pd(fscal,dy13);
2351             tz               = _mm256_mul_pd(fscal,dz13);
2352
2353             /* Update vectorial force */
2354             fix1             = _mm256_add_pd(fix1,tx);
2355             fiy1             = _mm256_add_pd(fiy1,ty);
2356             fiz1             = _mm256_add_pd(fiz1,tz);
2357
2358             fjx3             = _mm256_add_pd(fjx3,tx);
2359             fjy3             = _mm256_add_pd(fjy3,ty);
2360             fjz3             = _mm256_add_pd(fjz3,tz);
2361
2362             /**************************
2363              * CALCULATE INTERACTIONS *
2364              **************************/
2365
2366             r21              = _mm256_mul_pd(rsq21,rinv21);
2367             r21              = _mm256_andnot_pd(dummy_mask,r21);
2368
2369             /* Calculate table index by multiplying r with table scale and truncate to integer */
2370             rt               = _mm256_mul_pd(r21,vftabscale);
2371             vfitab           = _mm256_cvttpd_epi32(rt);
2372             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2373             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2374
2375             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2376             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2377             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2378             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2379             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2380             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2381             Heps             = _mm256_mul_pd(vfeps,H);
2382             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2383             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2384             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
2385
2386             fscal            = felec;
2387
2388             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2389
2390             /* Calculate temporary vectorial force */
2391             tx               = _mm256_mul_pd(fscal,dx21);
2392             ty               = _mm256_mul_pd(fscal,dy21);
2393             tz               = _mm256_mul_pd(fscal,dz21);
2394
2395             /* Update vectorial force */
2396             fix2             = _mm256_add_pd(fix2,tx);
2397             fiy2             = _mm256_add_pd(fiy2,ty);
2398             fiz2             = _mm256_add_pd(fiz2,tz);
2399
2400             fjx1             = _mm256_add_pd(fjx1,tx);
2401             fjy1             = _mm256_add_pd(fjy1,ty);
2402             fjz1             = _mm256_add_pd(fjz1,tz);
2403
2404             /**************************
2405              * CALCULATE INTERACTIONS *
2406              **************************/
2407
2408             r22              = _mm256_mul_pd(rsq22,rinv22);
2409             r22              = _mm256_andnot_pd(dummy_mask,r22);
2410
2411             /* Calculate table index by multiplying r with table scale and truncate to integer */
2412             rt               = _mm256_mul_pd(r22,vftabscale);
2413             vfitab           = _mm256_cvttpd_epi32(rt);
2414             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2415             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2416
2417             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2418             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2419             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2420             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2421             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2422             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2423             Heps             = _mm256_mul_pd(vfeps,H);
2424             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2425             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2426             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
2427
2428             fscal            = felec;
2429
2430             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2431
2432             /* Calculate temporary vectorial force */
2433             tx               = _mm256_mul_pd(fscal,dx22);
2434             ty               = _mm256_mul_pd(fscal,dy22);
2435             tz               = _mm256_mul_pd(fscal,dz22);
2436
2437             /* Update vectorial force */
2438             fix2             = _mm256_add_pd(fix2,tx);
2439             fiy2             = _mm256_add_pd(fiy2,ty);
2440             fiz2             = _mm256_add_pd(fiz2,tz);
2441
2442             fjx2             = _mm256_add_pd(fjx2,tx);
2443             fjy2             = _mm256_add_pd(fjy2,ty);
2444             fjz2             = _mm256_add_pd(fjz2,tz);
2445
2446             /**************************
2447              * CALCULATE INTERACTIONS *
2448              **************************/
2449
2450             r23              = _mm256_mul_pd(rsq23,rinv23);
2451             r23              = _mm256_andnot_pd(dummy_mask,r23);
2452
2453             /* Calculate table index by multiplying r with table scale and truncate to integer */
2454             rt               = _mm256_mul_pd(r23,vftabscale);
2455             vfitab           = _mm256_cvttpd_epi32(rt);
2456             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2457             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2458
2459             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2460             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2461             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2462             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2463             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2464             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2465             Heps             = _mm256_mul_pd(vfeps,H);
2466             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2467             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2468             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
2469
2470             fscal            = felec;
2471
2472             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2473
2474             /* Calculate temporary vectorial force */
2475             tx               = _mm256_mul_pd(fscal,dx23);
2476             ty               = _mm256_mul_pd(fscal,dy23);
2477             tz               = _mm256_mul_pd(fscal,dz23);
2478
2479             /* Update vectorial force */
2480             fix2             = _mm256_add_pd(fix2,tx);
2481             fiy2             = _mm256_add_pd(fiy2,ty);
2482             fiz2             = _mm256_add_pd(fiz2,tz);
2483
2484             fjx3             = _mm256_add_pd(fjx3,tx);
2485             fjy3             = _mm256_add_pd(fjy3,ty);
2486             fjz3             = _mm256_add_pd(fjz3,tz);
2487
2488             /**************************
2489              * CALCULATE INTERACTIONS *
2490              **************************/
2491
2492             r31              = _mm256_mul_pd(rsq31,rinv31);
2493             r31              = _mm256_andnot_pd(dummy_mask,r31);
2494
2495             /* Calculate table index by multiplying r with table scale and truncate to integer */
2496             rt               = _mm256_mul_pd(r31,vftabscale);
2497             vfitab           = _mm256_cvttpd_epi32(rt);
2498             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2499             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2500
2501             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2502             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2503             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2504             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2505             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2506             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2507             Heps             = _mm256_mul_pd(vfeps,H);
2508             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2509             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2510             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
2511
2512             fscal            = felec;
2513
2514             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2515
2516             /* Calculate temporary vectorial force */
2517             tx               = _mm256_mul_pd(fscal,dx31);
2518             ty               = _mm256_mul_pd(fscal,dy31);
2519             tz               = _mm256_mul_pd(fscal,dz31);
2520
2521             /* Update vectorial force */
2522             fix3             = _mm256_add_pd(fix3,tx);
2523             fiy3             = _mm256_add_pd(fiy3,ty);
2524             fiz3             = _mm256_add_pd(fiz3,tz);
2525
2526             fjx1             = _mm256_add_pd(fjx1,tx);
2527             fjy1             = _mm256_add_pd(fjy1,ty);
2528             fjz1             = _mm256_add_pd(fjz1,tz);
2529
2530             /**************************
2531              * CALCULATE INTERACTIONS *
2532              **************************/
2533
2534             r32              = _mm256_mul_pd(rsq32,rinv32);
2535             r32              = _mm256_andnot_pd(dummy_mask,r32);
2536
2537             /* Calculate table index by multiplying r with table scale and truncate to integer */
2538             rt               = _mm256_mul_pd(r32,vftabscale);
2539             vfitab           = _mm256_cvttpd_epi32(rt);
2540             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2541             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2542
2543             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2544             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2545             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2546             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2547             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2548             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2549             Heps             = _mm256_mul_pd(vfeps,H);
2550             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2551             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2552             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
2553
2554             fscal            = felec;
2555
2556             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2557
2558             /* Calculate temporary vectorial force */
2559             tx               = _mm256_mul_pd(fscal,dx32);
2560             ty               = _mm256_mul_pd(fscal,dy32);
2561             tz               = _mm256_mul_pd(fscal,dz32);
2562
2563             /* Update vectorial force */
2564             fix3             = _mm256_add_pd(fix3,tx);
2565             fiy3             = _mm256_add_pd(fiy3,ty);
2566             fiz3             = _mm256_add_pd(fiz3,tz);
2567
2568             fjx2             = _mm256_add_pd(fjx2,tx);
2569             fjy2             = _mm256_add_pd(fjy2,ty);
2570             fjz2             = _mm256_add_pd(fjz2,tz);
2571
2572             /**************************
2573              * CALCULATE INTERACTIONS *
2574              **************************/
2575
2576             r33              = _mm256_mul_pd(rsq33,rinv33);
2577             r33              = _mm256_andnot_pd(dummy_mask,r33);
2578
2579             /* Calculate table index by multiplying r with table scale and truncate to integer */
2580             rt               = _mm256_mul_pd(r33,vftabscale);
2581             vfitab           = _mm256_cvttpd_epi32(rt);
2582             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2583             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2584
2585             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2586             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2587             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2588             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2589             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2590             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2591             Heps             = _mm256_mul_pd(vfeps,H);
2592             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2593             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2594             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
2595
2596             fscal            = felec;
2597
2598             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2599
2600             /* Calculate temporary vectorial force */
2601             tx               = _mm256_mul_pd(fscal,dx33);
2602             ty               = _mm256_mul_pd(fscal,dy33);
2603             tz               = _mm256_mul_pd(fscal,dz33);
2604
2605             /* Update vectorial force */
2606             fix3             = _mm256_add_pd(fix3,tx);
2607             fiy3             = _mm256_add_pd(fiy3,ty);
2608             fiz3             = _mm256_add_pd(fiz3,tz);
2609
2610             fjx3             = _mm256_add_pd(fjx3,tx);
2611             fjy3             = _mm256_add_pd(fjy3,ty);
2612             fjz3             = _mm256_add_pd(fjz3,tz);
2613
2614             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2615             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2616             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2617             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2618
2619             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2620                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2621                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2622
2623             /* Inner loop uses 412 flops */
2624         }
2625
2626         /* End of innermost loop */
2627
2628         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2629                                                  f+i_coord_offset,fshift+i_shift_offset);
2630
2631         /* Increment number of inner iterations */
2632         inneriter                  += j_index_end - j_index_start;
2633
2634         /* Outer loop uses 24 flops */
2635     }
2636
2637     /* Increment number of outer iterations */
2638     outeriter        += nri;
2639
2640     /* Update outer/inner flops */
2641
2642     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*412);
2643 }