made errors during GPU detection non-fatal
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_avx_256_double.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_double
38  * Electrostatics interaction: CubicSplineTable
39  * VdW interaction:            CubicSplineTable
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
65     real             rcutoff_scalar;
66     real             *shiftvec,*fshift,*x,*f;
67     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68     real             scratch[4*DIM];
69     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70     real *           vdwioffsetptr0;
71     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72     real *           vdwioffsetptr1;
73     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74     real *           vdwioffsetptr2;
75     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76     real *           vdwioffsetptr3;
77     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
79     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
81     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
83     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
85     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
86     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
87     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
88     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
89     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
90     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
91     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
92     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
93     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
94     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
95     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
96     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
97     real             *charge;
98     int              nvdwtype;
99     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
100     int              *vdwtype;
101     real             *vdwparam;
102     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
103     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
104     __m128i          vfitab;
105     __m128i          ifour       = _mm_set1_epi32(4);
106     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
107     real             *vftab;
108     __m256d          dummy_mask,cutoff_mask;
109     __m128           tmpmask0,tmpmask1;
110     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
111     __m256d          one     = _mm256_set1_pd(1.0);
112     __m256d          two     = _mm256_set1_pd(2.0);
113     x                = xx[0];
114     f                = ff[0];
115
116     nri              = nlist->nri;
117     iinr             = nlist->iinr;
118     jindex           = nlist->jindex;
119     jjnr             = nlist->jjnr;
120     shiftidx         = nlist->shift;
121     gid              = nlist->gid;
122     shiftvec         = fr->shift_vec[0];
123     fshift           = fr->fshift[0];
124     facel            = _mm256_set1_pd(fr->epsfac);
125     charge           = mdatoms->chargeA;
126     nvdwtype         = fr->ntype;
127     vdwparam         = fr->nbfp;
128     vdwtype          = mdatoms->typeA;
129
130     vftab            = kernel_data->table_elec_vdw->data;
131     vftabscale       = _mm256_set1_pd(kernel_data->table_elec_vdw->scale);
132
133     /* Setup water-specific parameters */
134     inr              = nlist->iinr[0];
135     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
136     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
137     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
138     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
139
140     jq1              = _mm256_set1_pd(charge[inr+1]);
141     jq2              = _mm256_set1_pd(charge[inr+2]);
142     jq3              = _mm256_set1_pd(charge[inr+3]);
143     vdwjidx0A        = 2*vdwtype[inr+0];
144     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
145     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
146     qq11             = _mm256_mul_pd(iq1,jq1);
147     qq12             = _mm256_mul_pd(iq1,jq2);
148     qq13             = _mm256_mul_pd(iq1,jq3);
149     qq21             = _mm256_mul_pd(iq2,jq1);
150     qq22             = _mm256_mul_pd(iq2,jq2);
151     qq23             = _mm256_mul_pd(iq2,jq3);
152     qq31             = _mm256_mul_pd(iq3,jq1);
153     qq32             = _mm256_mul_pd(iq3,jq2);
154     qq33             = _mm256_mul_pd(iq3,jq3);
155
156     /* Avoid stupid compiler warnings */
157     jnrA = jnrB = jnrC = jnrD = 0;
158     j_coord_offsetA = 0;
159     j_coord_offsetB = 0;
160     j_coord_offsetC = 0;
161     j_coord_offsetD = 0;
162
163     outeriter        = 0;
164     inneriter        = 0;
165
166     for(iidx=0;iidx<4*DIM;iidx++)
167     {
168         scratch[iidx] = 0.0;
169     }
170
171     /* Start outer loop over neighborlists */
172     for(iidx=0; iidx<nri; iidx++)
173     {
174         /* Load shift vector for this list */
175         i_shift_offset   = DIM*shiftidx[iidx];
176
177         /* Load limits for loop over neighbors */
178         j_index_start    = jindex[iidx];
179         j_index_end      = jindex[iidx+1];
180
181         /* Get outer coordinate index */
182         inr              = iinr[iidx];
183         i_coord_offset   = DIM*inr;
184
185         /* Load i particle coords and add shift vector */
186         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
187                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
188
189         fix0             = _mm256_setzero_pd();
190         fiy0             = _mm256_setzero_pd();
191         fiz0             = _mm256_setzero_pd();
192         fix1             = _mm256_setzero_pd();
193         fiy1             = _mm256_setzero_pd();
194         fiz1             = _mm256_setzero_pd();
195         fix2             = _mm256_setzero_pd();
196         fiy2             = _mm256_setzero_pd();
197         fiz2             = _mm256_setzero_pd();
198         fix3             = _mm256_setzero_pd();
199         fiy3             = _mm256_setzero_pd();
200         fiz3             = _mm256_setzero_pd();
201
202         /* Reset potential sums */
203         velecsum         = _mm256_setzero_pd();
204         vvdwsum          = _mm256_setzero_pd();
205
206         /* Start inner kernel loop */
207         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
208         {
209
210             /* Get j neighbor index, and coordinate index */
211             jnrA             = jjnr[jidx];
212             jnrB             = jjnr[jidx+1];
213             jnrC             = jjnr[jidx+2];
214             jnrD             = jjnr[jidx+3];
215             j_coord_offsetA  = DIM*jnrA;
216             j_coord_offsetB  = DIM*jnrB;
217             j_coord_offsetC  = DIM*jnrC;
218             j_coord_offsetD  = DIM*jnrD;
219
220             /* load j atom coordinates */
221             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
222                                                  x+j_coord_offsetC,x+j_coord_offsetD,
223                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
224                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
225
226             /* Calculate displacement vector */
227             dx00             = _mm256_sub_pd(ix0,jx0);
228             dy00             = _mm256_sub_pd(iy0,jy0);
229             dz00             = _mm256_sub_pd(iz0,jz0);
230             dx11             = _mm256_sub_pd(ix1,jx1);
231             dy11             = _mm256_sub_pd(iy1,jy1);
232             dz11             = _mm256_sub_pd(iz1,jz1);
233             dx12             = _mm256_sub_pd(ix1,jx2);
234             dy12             = _mm256_sub_pd(iy1,jy2);
235             dz12             = _mm256_sub_pd(iz1,jz2);
236             dx13             = _mm256_sub_pd(ix1,jx3);
237             dy13             = _mm256_sub_pd(iy1,jy3);
238             dz13             = _mm256_sub_pd(iz1,jz3);
239             dx21             = _mm256_sub_pd(ix2,jx1);
240             dy21             = _mm256_sub_pd(iy2,jy1);
241             dz21             = _mm256_sub_pd(iz2,jz1);
242             dx22             = _mm256_sub_pd(ix2,jx2);
243             dy22             = _mm256_sub_pd(iy2,jy2);
244             dz22             = _mm256_sub_pd(iz2,jz2);
245             dx23             = _mm256_sub_pd(ix2,jx3);
246             dy23             = _mm256_sub_pd(iy2,jy3);
247             dz23             = _mm256_sub_pd(iz2,jz3);
248             dx31             = _mm256_sub_pd(ix3,jx1);
249             dy31             = _mm256_sub_pd(iy3,jy1);
250             dz31             = _mm256_sub_pd(iz3,jz1);
251             dx32             = _mm256_sub_pd(ix3,jx2);
252             dy32             = _mm256_sub_pd(iy3,jy2);
253             dz32             = _mm256_sub_pd(iz3,jz2);
254             dx33             = _mm256_sub_pd(ix3,jx3);
255             dy33             = _mm256_sub_pd(iy3,jy3);
256             dz33             = _mm256_sub_pd(iz3,jz3);
257
258             /* Calculate squared distance and things based on it */
259             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
260             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
261             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
262             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
263             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
264             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
265             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
266             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
267             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
268             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
269
270             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
271             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
272             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
273             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
274             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
275             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
276             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
277             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
278             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
279             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
280
281             fjx0             = _mm256_setzero_pd();
282             fjy0             = _mm256_setzero_pd();
283             fjz0             = _mm256_setzero_pd();
284             fjx1             = _mm256_setzero_pd();
285             fjy1             = _mm256_setzero_pd();
286             fjz1             = _mm256_setzero_pd();
287             fjx2             = _mm256_setzero_pd();
288             fjy2             = _mm256_setzero_pd();
289             fjz2             = _mm256_setzero_pd();
290             fjx3             = _mm256_setzero_pd();
291             fjy3             = _mm256_setzero_pd();
292             fjz3             = _mm256_setzero_pd();
293
294             /**************************
295              * CALCULATE INTERACTIONS *
296              **************************/
297
298             r00              = _mm256_mul_pd(rsq00,rinv00);
299
300             /* Calculate table index by multiplying r with table scale and truncate to integer */
301             rt               = _mm256_mul_pd(r00,vftabscale);
302             vfitab           = _mm256_cvttpd_epi32(rt);
303             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
304             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
305
306             /* CUBIC SPLINE TABLE DISPERSION */
307             vfitab           = _mm_add_epi32(vfitab,ifour);
308             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
309             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
310             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
311             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
312             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
313             Heps             = _mm256_mul_pd(vfeps,H);
314             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
315             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
316             vvdw6            = _mm256_mul_pd(c6_00,VV);
317             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
318             fvdw6            = _mm256_mul_pd(c6_00,FF);
319
320             /* CUBIC SPLINE TABLE REPULSION */
321             vfitab           = _mm_add_epi32(vfitab,ifour);
322             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
323             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
324             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
325             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
326             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
327             Heps             = _mm256_mul_pd(vfeps,H);
328             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
329             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
330             vvdw12           = _mm256_mul_pd(c12_00,VV);
331             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
332             fvdw12           = _mm256_mul_pd(c12_00,FF);
333             vvdw             = _mm256_add_pd(vvdw12,vvdw6);
334             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
335
336             /* Update potential sum for this i atom from the interaction with this j atom. */
337             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
338
339             fscal            = fvdw;
340
341             /* Calculate temporary vectorial force */
342             tx               = _mm256_mul_pd(fscal,dx00);
343             ty               = _mm256_mul_pd(fscal,dy00);
344             tz               = _mm256_mul_pd(fscal,dz00);
345
346             /* Update vectorial force */
347             fix0             = _mm256_add_pd(fix0,tx);
348             fiy0             = _mm256_add_pd(fiy0,ty);
349             fiz0             = _mm256_add_pd(fiz0,tz);
350
351             fjx0             = _mm256_add_pd(fjx0,tx);
352             fjy0             = _mm256_add_pd(fjy0,ty);
353             fjz0             = _mm256_add_pd(fjz0,tz);
354
355             /**************************
356              * CALCULATE INTERACTIONS *
357              **************************/
358
359             r11              = _mm256_mul_pd(rsq11,rinv11);
360
361             /* Calculate table index by multiplying r with table scale and truncate to integer */
362             rt               = _mm256_mul_pd(r11,vftabscale);
363             vfitab           = _mm256_cvttpd_epi32(rt);
364             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
365             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
366
367             /* CUBIC SPLINE TABLE ELECTROSTATICS */
368             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
369             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
370             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
371             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
372             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
373             Heps             = _mm256_mul_pd(vfeps,H);
374             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
375             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
376             velec            = _mm256_mul_pd(qq11,VV);
377             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
378             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
379
380             /* Update potential sum for this i atom from the interaction with this j atom. */
381             velecsum         = _mm256_add_pd(velecsum,velec);
382
383             fscal            = felec;
384
385             /* Calculate temporary vectorial force */
386             tx               = _mm256_mul_pd(fscal,dx11);
387             ty               = _mm256_mul_pd(fscal,dy11);
388             tz               = _mm256_mul_pd(fscal,dz11);
389
390             /* Update vectorial force */
391             fix1             = _mm256_add_pd(fix1,tx);
392             fiy1             = _mm256_add_pd(fiy1,ty);
393             fiz1             = _mm256_add_pd(fiz1,tz);
394
395             fjx1             = _mm256_add_pd(fjx1,tx);
396             fjy1             = _mm256_add_pd(fjy1,ty);
397             fjz1             = _mm256_add_pd(fjz1,tz);
398
399             /**************************
400              * CALCULATE INTERACTIONS *
401              **************************/
402
403             r12              = _mm256_mul_pd(rsq12,rinv12);
404
405             /* Calculate table index by multiplying r with table scale and truncate to integer */
406             rt               = _mm256_mul_pd(r12,vftabscale);
407             vfitab           = _mm256_cvttpd_epi32(rt);
408             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
409             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
410
411             /* CUBIC SPLINE TABLE ELECTROSTATICS */
412             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
413             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
414             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
415             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
416             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
417             Heps             = _mm256_mul_pd(vfeps,H);
418             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
419             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
420             velec            = _mm256_mul_pd(qq12,VV);
421             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
422             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
423
424             /* Update potential sum for this i atom from the interaction with this j atom. */
425             velecsum         = _mm256_add_pd(velecsum,velec);
426
427             fscal            = felec;
428
429             /* Calculate temporary vectorial force */
430             tx               = _mm256_mul_pd(fscal,dx12);
431             ty               = _mm256_mul_pd(fscal,dy12);
432             tz               = _mm256_mul_pd(fscal,dz12);
433
434             /* Update vectorial force */
435             fix1             = _mm256_add_pd(fix1,tx);
436             fiy1             = _mm256_add_pd(fiy1,ty);
437             fiz1             = _mm256_add_pd(fiz1,tz);
438
439             fjx2             = _mm256_add_pd(fjx2,tx);
440             fjy2             = _mm256_add_pd(fjy2,ty);
441             fjz2             = _mm256_add_pd(fjz2,tz);
442
443             /**************************
444              * CALCULATE INTERACTIONS *
445              **************************/
446
447             r13              = _mm256_mul_pd(rsq13,rinv13);
448
449             /* Calculate table index by multiplying r with table scale and truncate to integer */
450             rt               = _mm256_mul_pd(r13,vftabscale);
451             vfitab           = _mm256_cvttpd_epi32(rt);
452             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
453             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
454
455             /* CUBIC SPLINE TABLE ELECTROSTATICS */
456             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
457             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
458             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
459             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
460             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
461             Heps             = _mm256_mul_pd(vfeps,H);
462             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
463             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
464             velec            = _mm256_mul_pd(qq13,VV);
465             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
466             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
467
468             /* Update potential sum for this i atom from the interaction with this j atom. */
469             velecsum         = _mm256_add_pd(velecsum,velec);
470
471             fscal            = felec;
472
473             /* Calculate temporary vectorial force */
474             tx               = _mm256_mul_pd(fscal,dx13);
475             ty               = _mm256_mul_pd(fscal,dy13);
476             tz               = _mm256_mul_pd(fscal,dz13);
477
478             /* Update vectorial force */
479             fix1             = _mm256_add_pd(fix1,tx);
480             fiy1             = _mm256_add_pd(fiy1,ty);
481             fiz1             = _mm256_add_pd(fiz1,tz);
482
483             fjx3             = _mm256_add_pd(fjx3,tx);
484             fjy3             = _mm256_add_pd(fjy3,ty);
485             fjz3             = _mm256_add_pd(fjz3,tz);
486
487             /**************************
488              * CALCULATE INTERACTIONS *
489              **************************/
490
491             r21              = _mm256_mul_pd(rsq21,rinv21);
492
493             /* Calculate table index by multiplying r with table scale and truncate to integer */
494             rt               = _mm256_mul_pd(r21,vftabscale);
495             vfitab           = _mm256_cvttpd_epi32(rt);
496             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
497             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
498
499             /* CUBIC SPLINE TABLE ELECTROSTATICS */
500             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
501             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
502             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
503             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
504             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
505             Heps             = _mm256_mul_pd(vfeps,H);
506             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
507             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
508             velec            = _mm256_mul_pd(qq21,VV);
509             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
510             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
511
512             /* Update potential sum for this i atom from the interaction with this j atom. */
513             velecsum         = _mm256_add_pd(velecsum,velec);
514
515             fscal            = felec;
516
517             /* Calculate temporary vectorial force */
518             tx               = _mm256_mul_pd(fscal,dx21);
519             ty               = _mm256_mul_pd(fscal,dy21);
520             tz               = _mm256_mul_pd(fscal,dz21);
521
522             /* Update vectorial force */
523             fix2             = _mm256_add_pd(fix2,tx);
524             fiy2             = _mm256_add_pd(fiy2,ty);
525             fiz2             = _mm256_add_pd(fiz2,tz);
526
527             fjx1             = _mm256_add_pd(fjx1,tx);
528             fjy1             = _mm256_add_pd(fjy1,ty);
529             fjz1             = _mm256_add_pd(fjz1,tz);
530
531             /**************************
532              * CALCULATE INTERACTIONS *
533              **************************/
534
535             r22              = _mm256_mul_pd(rsq22,rinv22);
536
537             /* Calculate table index by multiplying r with table scale and truncate to integer */
538             rt               = _mm256_mul_pd(r22,vftabscale);
539             vfitab           = _mm256_cvttpd_epi32(rt);
540             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
541             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
542
543             /* CUBIC SPLINE TABLE ELECTROSTATICS */
544             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
545             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
546             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
547             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
548             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
549             Heps             = _mm256_mul_pd(vfeps,H);
550             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
551             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
552             velec            = _mm256_mul_pd(qq22,VV);
553             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
554             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
555
556             /* Update potential sum for this i atom from the interaction with this j atom. */
557             velecsum         = _mm256_add_pd(velecsum,velec);
558
559             fscal            = felec;
560
561             /* Calculate temporary vectorial force */
562             tx               = _mm256_mul_pd(fscal,dx22);
563             ty               = _mm256_mul_pd(fscal,dy22);
564             tz               = _mm256_mul_pd(fscal,dz22);
565
566             /* Update vectorial force */
567             fix2             = _mm256_add_pd(fix2,tx);
568             fiy2             = _mm256_add_pd(fiy2,ty);
569             fiz2             = _mm256_add_pd(fiz2,tz);
570
571             fjx2             = _mm256_add_pd(fjx2,tx);
572             fjy2             = _mm256_add_pd(fjy2,ty);
573             fjz2             = _mm256_add_pd(fjz2,tz);
574
575             /**************************
576              * CALCULATE INTERACTIONS *
577              **************************/
578
579             r23              = _mm256_mul_pd(rsq23,rinv23);
580
581             /* Calculate table index by multiplying r with table scale and truncate to integer */
582             rt               = _mm256_mul_pd(r23,vftabscale);
583             vfitab           = _mm256_cvttpd_epi32(rt);
584             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
585             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
586
587             /* CUBIC SPLINE TABLE ELECTROSTATICS */
588             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
589             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
590             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
591             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
592             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
593             Heps             = _mm256_mul_pd(vfeps,H);
594             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
595             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
596             velec            = _mm256_mul_pd(qq23,VV);
597             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
598             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
599
600             /* Update potential sum for this i atom from the interaction with this j atom. */
601             velecsum         = _mm256_add_pd(velecsum,velec);
602
603             fscal            = felec;
604
605             /* Calculate temporary vectorial force */
606             tx               = _mm256_mul_pd(fscal,dx23);
607             ty               = _mm256_mul_pd(fscal,dy23);
608             tz               = _mm256_mul_pd(fscal,dz23);
609
610             /* Update vectorial force */
611             fix2             = _mm256_add_pd(fix2,tx);
612             fiy2             = _mm256_add_pd(fiy2,ty);
613             fiz2             = _mm256_add_pd(fiz2,tz);
614
615             fjx3             = _mm256_add_pd(fjx3,tx);
616             fjy3             = _mm256_add_pd(fjy3,ty);
617             fjz3             = _mm256_add_pd(fjz3,tz);
618
619             /**************************
620              * CALCULATE INTERACTIONS *
621              **************************/
622
623             r31              = _mm256_mul_pd(rsq31,rinv31);
624
625             /* Calculate table index by multiplying r with table scale and truncate to integer */
626             rt               = _mm256_mul_pd(r31,vftabscale);
627             vfitab           = _mm256_cvttpd_epi32(rt);
628             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
629             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
630
631             /* CUBIC SPLINE TABLE ELECTROSTATICS */
632             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
633             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
634             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
635             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
636             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
637             Heps             = _mm256_mul_pd(vfeps,H);
638             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
639             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
640             velec            = _mm256_mul_pd(qq31,VV);
641             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
642             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
643
644             /* Update potential sum for this i atom from the interaction with this j atom. */
645             velecsum         = _mm256_add_pd(velecsum,velec);
646
647             fscal            = felec;
648
649             /* Calculate temporary vectorial force */
650             tx               = _mm256_mul_pd(fscal,dx31);
651             ty               = _mm256_mul_pd(fscal,dy31);
652             tz               = _mm256_mul_pd(fscal,dz31);
653
654             /* Update vectorial force */
655             fix3             = _mm256_add_pd(fix3,tx);
656             fiy3             = _mm256_add_pd(fiy3,ty);
657             fiz3             = _mm256_add_pd(fiz3,tz);
658
659             fjx1             = _mm256_add_pd(fjx1,tx);
660             fjy1             = _mm256_add_pd(fjy1,ty);
661             fjz1             = _mm256_add_pd(fjz1,tz);
662
663             /**************************
664              * CALCULATE INTERACTIONS *
665              **************************/
666
667             r32              = _mm256_mul_pd(rsq32,rinv32);
668
669             /* Calculate table index by multiplying r with table scale and truncate to integer */
670             rt               = _mm256_mul_pd(r32,vftabscale);
671             vfitab           = _mm256_cvttpd_epi32(rt);
672             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
673             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
674
675             /* CUBIC SPLINE TABLE ELECTROSTATICS */
676             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
677             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
678             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
679             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
680             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
681             Heps             = _mm256_mul_pd(vfeps,H);
682             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
683             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
684             velec            = _mm256_mul_pd(qq32,VV);
685             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
686             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
687
688             /* Update potential sum for this i atom from the interaction with this j atom. */
689             velecsum         = _mm256_add_pd(velecsum,velec);
690
691             fscal            = felec;
692
693             /* Calculate temporary vectorial force */
694             tx               = _mm256_mul_pd(fscal,dx32);
695             ty               = _mm256_mul_pd(fscal,dy32);
696             tz               = _mm256_mul_pd(fscal,dz32);
697
698             /* Update vectorial force */
699             fix3             = _mm256_add_pd(fix3,tx);
700             fiy3             = _mm256_add_pd(fiy3,ty);
701             fiz3             = _mm256_add_pd(fiz3,tz);
702
703             fjx2             = _mm256_add_pd(fjx2,tx);
704             fjy2             = _mm256_add_pd(fjy2,ty);
705             fjz2             = _mm256_add_pd(fjz2,tz);
706
707             /**************************
708              * CALCULATE INTERACTIONS *
709              **************************/
710
711             r33              = _mm256_mul_pd(rsq33,rinv33);
712
713             /* Calculate table index by multiplying r with table scale and truncate to integer */
714             rt               = _mm256_mul_pd(r33,vftabscale);
715             vfitab           = _mm256_cvttpd_epi32(rt);
716             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
717             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
718
719             /* CUBIC SPLINE TABLE ELECTROSTATICS */
720             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
721             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
722             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
723             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
724             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
725             Heps             = _mm256_mul_pd(vfeps,H);
726             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
727             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
728             velec            = _mm256_mul_pd(qq33,VV);
729             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
730             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
731
732             /* Update potential sum for this i atom from the interaction with this j atom. */
733             velecsum         = _mm256_add_pd(velecsum,velec);
734
735             fscal            = felec;
736
737             /* Calculate temporary vectorial force */
738             tx               = _mm256_mul_pd(fscal,dx33);
739             ty               = _mm256_mul_pd(fscal,dy33);
740             tz               = _mm256_mul_pd(fscal,dz33);
741
742             /* Update vectorial force */
743             fix3             = _mm256_add_pd(fix3,tx);
744             fiy3             = _mm256_add_pd(fiy3,ty);
745             fiz3             = _mm256_add_pd(fiz3,tz);
746
747             fjx3             = _mm256_add_pd(fjx3,tx);
748             fjy3             = _mm256_add_pd(fjy3,ty);
749             fjz3             = _mm256_add_pd(fjz3,tz);
750
751             fjptrA             = f+j_coord_offsetA;
752             fjptrB             = f+j_coord_offsetB;
753             fjptrC             = f+j_coord_offsetC;
754             fjptrD             = f+j_coord_offsetD;
755
756             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
757                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
758                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
759
760             /* Inner loop uses 446 flops */
761         }
762
763         if(jidx<j_index_end)
764         {
765
766             /* Get j neighbor index, and coordinate index */
767             jnrlistA         = jjnr[jidx];
768             jnrlistB         = jjnr[jidx+1];
769             jnrlistC         = jjnr[jidx+2];
770             jnrlistD         = jjnr[jidx+3];
771             /* Sign of each element will be negative for non-real atoms.
772              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
773              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
774              */
775             tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
776
777             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
778             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
779             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
780
781             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
782             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
783             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
784             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
785             j_coord_offsetA  = DIM*jnrA;
786             j_coord_offsetB  = DIM*jnrB;
787             j_coord_offsetC  = DIM*jnrC;
788             j_coord_offsetD  = DIM*jnrD;
789
790             /* load j atom coordinates */
791             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
792                                                  x+j_coord_offsetC,x+j_coord_offsetD,
793                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
794                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
795
796             /* Calculate displacement vector */
797             dx00             = _mm256_sub_pd(ix0,jx0);
798             dy00             = _mm256_sub_pd(iy0,jy0);
799             dz00             = _mm256_sub_pd(iz0,jz0);
800             dx11             = _mm256_sub_pd(ix1,jx1);
801             dy11             = _mm256_sub_pd(iy1,jy1);
802             dz11             = _mm256_sub_pd(iz1,jz1);
803             dx12             = _mm256_sub_pd(ix1,jx2);
804             dy12             = _mm256_sub_pd(iy1,jy2);
805             dz12             = _mm256_sub_pd(iz1,jz2);
806             dx13             = _mm256_sub_pd(ix1,jx3);
807             dy13             = _mm256_sub_pd(iy1,jy3);
808             dz13             = _mm256_sub_pd(iz1,jz3);
809             dx21             = _mm256_sub_pd(ix2,jx1);
810             dy21             = _mm256_sub_pd(iy2,jy1);
811             dz21             = _mm256_sub_pd(iz2,jz1);
812             dx22             = _mm256_sub_pd(ix2,jx2);
813             dy22             = _mm256_sub_pd(iy2,jy2);
814             dz22             = _mm256_sub_pd(iz2,jz2);
815             dx23             = _mm256_sub_pd(ix2,jx3);
816             dy23             = _mm256_sub_pd(iy2,jy3);
817             dz23             = _mm256_sub_pd(iz2,jz3);
818             dx31             = _mm256_sub_pd(ix3,jx1);
819             dy31             = _mm256_sub_pd(iy3,jy1);
820             dz31             = _mm256_sub_pd(iz3,jz1);
821             dx32             = _mm256_sub_pd(ix3,jx2);
822             dy32             = _mm256_sub_pd(iy3,jy2);
823             dz32             = _mm256_sub_pd(iz3,jz2);
824             dx33             = _mm256_sub_pd(ix3,jx3);
825             dy33             = _mm256_sub_pd(iy3,jy3);
826             dz33             = _mm256_sub_pd(iz3,jz3);
827
828             /* Calculate squared distance and things based on it */
829             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
830             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
831             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
832             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
833             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
834             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
835             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
836             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
837             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
838             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
839
840             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
841             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
842             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
843             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
844             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
845             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
846             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
847             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
848             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
849             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
850
851             fjx0             = _mm256_setzero_pd();
852             fjy0             = _mm256_setzero_pd();
853             fjz0             = _mm256_setzero_pd();
854             fjx1             = _mm256_setzero_pd();
855             fjy1             = _mm256_setzero_pd();
856             fjz1             = _mm256_setzero_pd();
857             fjx2             = _mm256_setzero_pd();
858             fjy2             = _mm256_setzero_pd();
859             fjz2             = _mm256_setzero_pd();
860             fjx3             = _mm256_setzero_pd();
861             fjy3             = _mm256_setzero_pd();
862             fjz3             = _mm256_setzero_pd();
863
864             /**************************
865              * CALCULATE INTERACTIONS *
866              **************************/
867
868             r00              = _mm256_mul_pd(rsq00,rinv00);
869             r00              = _mm256_andnot_pd(dummy_mask,r00);
870
871             /* Calculate table index by multiplying r with table scale and truncate to integer */
872             rt               = _mm256_mul_pd(r00,vftabscale);
873             vfitab           = _mm256_cvttpd_epi32(rt);
874             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
875             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
876
877             /* CUBIC SPLINE TABLE DISPERSION */
878             vfitab           = _mm_add_epi32(vfitab,ifour);
879             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
880             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
881             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
882             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
883             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
884             Heps             = _mm256_mul_pd(vfeps,H);
885             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
886             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
887             vvdw6            = _mm256_mul_pd(c6_00,VV);
888             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
889             fvdw6            = _mm256_mul_pd(c6_00,FF);
890
891             /* CUBIC SPLINE TABLE REPULSION */
892             vfitab           = _mm_add_epi32(vfitab,ifour);
893             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
894             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
895             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
896             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
897             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
898             Heps             = _mm256_mul_pd(vfeps,H);
899             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
900             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
901             vvdw12           = _mm256_mul_pd(c12_00,VV);
902             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
903             fvdw12           = _mm256_mul_pd(c12_00,FF);
904             vvdw             = _mm256_add_pd(vvdw12,vvdw6);
905             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
906
907             /* Update potential sum for this i atom from the interaction with this j atom. */
908             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
909             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
910
911             fscal            = fvdw;
912
913             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
914
915             /* Calculate temporary vectorial force */
916             tx               = _mm256_mul_pd(fscal,dx00);
917             ty               = _mm256_mul_pd(fscal,dy00);
918             tz               = _mm256_mul_pd(fscal,dz00);
919
920             /* Update vectorial force */
921             fix0             = _mm256_add_pd(fix0,tx);
922             fiy0             = _mm256_add_pd(fiy0,ty);
923             fiz0             = _mm256_add_pd(fiz0,tz);
924
925             fjx0             = _mm256_add_pd(fjx0,tx);
926             fjy0             = _mm256_add_pd(fjy0,ty);
927             fjz0             = _mm256_add_pd(fjz0,tz);
928
929             /**************************
930              * CALCULATE INTERACTIONS *
931              **************************/
932
933             r11              = _mm256_mul_pd(rsq11,rinv11);
934             r11              = _mm256_andnot_pd(dummy_mask,r11);
935
936             /* Calculate table index by multiplying r with table scale and truncate to integer */
937             rt               = _mm256_mul_pd(r11,vftabscale);
938             vfitab           = _mm256_cvttpd_epi32(rt);
939             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
940             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
941
942             /* CUBIC SPLINE TABLE ELECTROSTATICS */
943             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
944             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
945             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
946             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
947             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
948             Heps             = _mm256_mul_pd(vfeps,H);
949             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
950             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
951             velec            = _mm256_mul_pd(qq11,VV);
952             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
953             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
954
955             /* Update potential sum for this i atom from the interaction with this j atom. */
956             velec            = _mm256_andnot_pd(dummy_mask,velec);
957             velecsum         = _mm256_add_pd(velecsum,velec);
958
959             fscal            = felec;
960
961             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
962
963             /* Calculate temporary vectorial force */
964             tx               = _mm256_mul_pd(fscal,dx11);
965             ty               = _mm256_mul_pd(fscal,dy11);
966             tz               = _mm256_mul_pd(fscal,dz11);
967
968             /* Update vectorial force */
969             fix1             = _mm256_add_pd(fix1,tx);
970             fiy1             = _mm256_add_pd(fiy1,ty);
971             fiz1             = _mm256_add_pd(fiz1,tz);
972
973             fjx1             = _mm256_add_pd(fjx1,tx);
974             fjy1             = _mm256_add_pd(fjy1,ty);
975             fjz1             = _mm256_add_pd(fjz1,tz);
976
977             /**************************
978              * CALCULATE INTERACTIONS *
979              **************************/
980
981             r12              = _mm256_mul_pd(rsq12,rinv12);
982             r12              = _mm256_andnot_pd(dummy_mask,r12);
983
984             /* Calculate table index by multiplying r with table scale and truncate to integer */
985             rt               = _mm256_mul_pd(r12,vftabscale);
986             vfitab           = _mm256_cvttpd_epi32(rt);
987             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
988             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
989
990             /* CUBIC SPLINE TABLE ELECTROSTATICS */
991             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
992             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
993             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
994             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
995             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
996             Heps             = _mm256_mul_pd(vfeps,H);
997             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
998             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
999             velec            = _mm256_mul_pd(qq12,VV);
1000             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1001             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1002
1003             /* Update potential sum for this i atom from the interaction with this j atom. */
1004             velec            = _mm256_andnot_pd(dummy_mask,velec);
1005             velecsum         = _mm256_add_pd(velecsum,velec);
1006
1007             fscal            = felec;
1008
1009             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1010
1011             /* Calculate temporary vectorial force */
1012             tx               = _mm256_mul_pd(fscal,dx12);
1013             ty               = _mm256_mul_pd(fscal,dy12);
1014             tz               = _mm256_mul_pd(fscal,dz12);
1015
1016             /* Update vectorial force */
1017             fix1             = _mm256_add_pd(fix1,tx);
1018             fiy1             = _mm256_add_pd(fiy1,ty);
1019             fiz1             = _mm256_add_pd(fiz1,tz);
1020
1021             fjx2             = _mm256_add_pd(fjx2,tx);
1022             fjy2             = _mm256_add_pd(fjy2,ty);
1023             fjz2             = _mm256_add_pd(fjz2,tz);
1024
1025             /**************************
1026              * CALCULATE INTERACTIONS *
1027              **************************/
1028
1029             r13              = _mm256_mul_pd(rsq13,rinv13);
1030             r13              = _mm256_andnot_pd(dummy_mask,r13);
1031
1032             /* Calculate table index by multiplying r with table scale and truncate to integer */
1033             rt               = _mm256_mul_pd(r13,vftabscale);
1034             vfitab           = _mm256_cvttpd_epi32(rt);
1035             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1036             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1037
1038             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1039             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1040             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1041             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1042             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1043             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1044             Heps             = _mm256_mul_pd(vfeps,H);
1045             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1046             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1047             velec            = _mm256_mul_pd(qq13,VV);
1048             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1049             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
1050
1051             /* Update potential sum for this i atom from the interaction with this j atom. */
1052             velec            = _mm256_andnot_pd(dummy_mask,velec);
1053             velecsum         = _mm256_add_pd(velecsum,velec);
1054
1055             fscal            = felec;
1056
1057             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1058
1059             /* Calculate temporary vectorial force */
1060             tx               = _mm256_mul_pd(fscal,dx13);
1061             ty               = _mm256_mul_pd(fscal,dy13);
1062             tz               = _mm256_mul_pd(fscal,dz13);
1063
1064             /* Update vectorial force */
1065             fix1             = _mm256_add_pd(fix1,tx);
1066             fiy1             = _mm256_add_pd(fiy1,ty);
1067             fiz1             = _mm256_add_pd(fiz1,tz);
1068
1069             fjx3             = _mm256_add_pd(fjx3,tx);
1070             fjy3             = _mm256_add_pd(fjy3,ty);
1071             fjz3             = _mm256_add_pd(fjz3,tz);
1072
1073             /**************************
1074              * CALCULATE INTERACTIONS *
1075              **************************/
1076
1077             r21              = _mm256_mul_pd(rsq21,rinv21);
1078             r21              = _mm256_andnot_pd(dummy_mask,r21);
1079
1080             /* Calculate table index by multiplying r with table scale and truncate to integer */
1081             rt               = _mm256_mul_pd(r21,vftabscale);
1082             vfitab           = _mm256_cvttpd_epi32(rt);
1083             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1084             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1085
1086             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1087             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1088             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1089             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1090             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1091             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1092             Heps             = _mm256_mul_pd(vfeps,H);
1093             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1094             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1095             velec            = _mm256_mul_pd(qq21,VV);
1096             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1097             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1098
1099             /* Update potential sum for this i atom from the interaction with this j atom. */
1100             velec            = _mm256_andnot_pd(dummy_mask,velec);
1101             velecsum         = _mm256_add_pd(velecsum,velec);
1102
1103             fscal            = felec;
1104
1105             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1106
1107             /* Calculate temporary vectorial force */
1108             tx               = _mm256_mul_pd(fscal,dx21);
1109             ty               = _mm256_mul_pd(fscal,dy21);
1110             tz               = _mm256_mul_pd(fscal,dz21);
1111
1112             /* Update vectorial force */
1113             fix2             = _mm256_add_pd(fix2,tx);
1114             fiy2             = _mm256_add_pd(fiy2,ty);
1115             fiz2             = _mm256_add_pd(fiz2,tz);
1116
1117             fjx1             = _mm256_add_pd(fjx1,tx);
1118             fjy1             = _mm256_add_pd(fjy1,ty);
1119             fjz1             = _mm256_add_pd(fjz1,tz);
1120
1121             /**************************
1122              * CALCULATE INTERACTIONS *
1123              **************************/
1124
1125             r22              = _mm256_mul_pd(rsq22,rinv22);
1126             r22              = _mm256_andnot_pd(dummy_mask,r22);
1127
1128             /* Calculate table index by multiplying r with table scale and truncate to integer */
1129             rt               = _mm256_mul_pd(r22,vftabscale);
1130             vfitab           = _mm256_cvttpd_epi32(rt);
1131             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1132             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1133
1134             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1135             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1136             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1137             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1138             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1139             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1140             Heps             = _mm256_mul_pd(vfeps,H);
1141             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1142             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1143             velec            = _mm256_mul_pd(qq22,VV);
1144             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1145             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1146
1147             /* Update potential sum for this i atom from the interaction with this j atom. */
1148             velec            = _mm256_andnot_pd(dummy_mask,velec);
1149             velecsum         = _mm256_add_pd(velecsum,velec);
1150
1151             fscal            = felec;
1152
1153             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1154
1155             /* Calculate temporary vectorial force */
1156             tx               = _mm256_mul_pd(fscal,dx22);
1157             ty               = _mm256_mul_pd(fscal,dy22);
1158             tz               = _mm256_mul_pd(fscal,dz22);
1159
1160             /* Update vectorial force */
1161             fix2             = _mm256_add_pd(fix2,tx);
1162             fiy2             = _mm256_add_pd(fiy2,ty);
1163             fiz2             = _mm256_add_pd(fiz2,tz);
1164
1165             fjx2             = _mm256_add_pd(fjx2,tx);
1166             fjy2             = _mm256_add_pd(fjy2,ty);
1167             fjz2             = _mm256_add_pd(fjz2,tz);
1168
1169             /**************************
1170              * CALCULATE INTERACTIONS *
1171              **************************/
1172
1173             r23              = _mm256_mul_pd(rsq23,rinv23);
1174             r23              = _mm256_andnot_pd(dummy_mask,r23);
1175
1176             /* Calculate table index by multiplying r with table scale and truncate to integer */
1177             rt               = _mm256_mul_pd(r23,vftabscale);
1178             vfitab           = _mm256_cvttpd_epi32(rt);
1179             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1180             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1181
1182             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1183             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1184             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1185             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1186             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1187             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1188             Heps             = _mm256_mul_pd(vfeps,H);
1189             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1190             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1191             velec            = _mm256_mul_pd(qq23,VV);
1192             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1193             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
1194
1195             /* Update potential sum for this i atom from the interaction with this j atom. */
1196             velec            = _mm256_andnot_pd(dummy_mask,velec);
1197             velecsum         = _mm256_add_pd(velecsum,velec);
1198
1199             fscal            = felec;
1200
1201             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1202
1203             /* Calculate temporary vectorial force */
1204             tx               = _mm256_mul_pd(fscal,dx23);
1205             ty               = _mm256_mul_pd(fscal,dy23);
1206             tz               = _mm256_mul_pd(fscal,dz23);
1207
1208             /* Update vectorial force */
1209             fix2             = _mm256_add_pd(fix2,tx);
1210             fiy2             = _mm256_add_pd(fiy2,ty);
1211             fiz2             = _mm256_add_pd(fiz2,tz);
1212
1213             fjx3             = _mm256_add_pd(fjx3,tx);
1214             fjy3             = _mm256_add_pd(fjy3,ty);
1215             fjz3             = _mm256_add_pd(fjz3,tz);
1216
1217             /**************************
1218              * CALCULATE INTERACTIONS *
1219              **************************/
1220
1221             r31              = _mm256_mul_pd(rsq31,rinv31);
1222             r31              = _mm256_andnot_pd(dummy_mask,r31);
1223
1224             /* Calculate table index by multiplying r with table scale and truncate to integer */
1225             rt               = _mm256_mul_pd(r31,vftabscale);
1226             vfitab           = _mm256_cvttpd_epi32(rt);
1227             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1228             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1229
1230             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1231             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1232             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1233             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1234             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1235             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1236             Heps             = _mm256_mul_pd(vfeps,H);
1237             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1238             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1239             velec            = _mm256_mul_pd(qq31,VV);
1240             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1241             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
1242
1243             /* Update potential sum for this i atom from the interaction with this j atom. */
1244             velec            = _mm256_andnot_pd(dummy_mask,velec);
1245             velecsum         = _mm256_add_pd(velecsum,velec);
1246
1247             fscal            = felec;
1248
1249             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1250
1251             /* Calculate temporary vectorial force */
1252             tx               = _mm256_mul_pd(fscal,dx31);
1253             ty               = _mm256_mul_pd(fscal,dy31);
1254             tz               = _mm256_mul_pd(fscal,dz31);
1255
1256             /* Update vectorial force */
1257             fix3             = _mm256_add_pd(fix3,tx);
1258             fiy3             = _mm256_add_pd(fiy3,ty);
1259             fiz3             = _mm256_add_pd(fiz3,tz);
1260
1261             fjx1             = _mm256_add_pd(fjx1,tx);
1262             fjy1             = _mm256_add_pd(fjy1,ty);
1263             fjz1             = _mm256_add_pd(fjz1,tz);
1264
1265             /**************************
1266              * CALCULATE INTERACTIONS *
1267              **************************/
1268
1269             r32              = _mm256_mul_pd(rsq32,rinv32);
1270             r32              = _mm256_andnot_pd(dummy_mask,r32);
1271
1272             /* Calculate table index by multiplying r with table scale and truncate to integer */
1273             rt               = _mm256_mul_pd(r32,vftabscale);
1274             vfitab           = _mm256_cvttpd_epi32(rt);
1275             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1276             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1277
1278             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1279             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1280             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1281             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1282             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1283             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1284             Heps             = _mm256_mul_pd(vfeps,H);
1285             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1286             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1287             velec            = _mm256_mul_pd(qq32,VV);
1288             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1289             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
1290
1291             /* Update potential sum for this i atom from the interaction with this j atom. */
1292             velec            = _mm256_andnot_pd(dummy_mask,velec);
1293             velecsum         = _mm256_add_pd(velecsum,velec);
1294
1295             fscal            = felec;
1296
1297             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1298
1299             /* Calculate temporary vectorial force */
1300             tx               = _mm256_mul_pd(fscal,dx32);
1301             ty               = _mm256_mul_pd(fscal,dy32);
1302             tz               = _mm256_mul_pd(fscal,dz32);
1303
1304             /* Update vectorial force */
1305             fix3             = _mm256_add_pd(fix3,tx);
1306             fiy3             = _mm256_add_pd(fiy3,ty);
1307             fiz3             = _mm256_add_pd(fiz3,tz);
1308
1309             fjx2             = _mm256_add_pd(fjx2,tx);
1310             fjy2             = _mm256_add_pd(fjy2,ty);
1311             fjz2             = _mm256_add_pd(fjz2,tz);
1312
1313             /**************************
1314              * CALCULATE INTERACTIONS *
1315              **************************/
1316
1317             r33              = _mm256_mul_pd(rsq33,rinv33);
1318             r33              = _mm256_andnot_pd(dummy_mask,r33);
1319
1320             /* Calculate table index by multiplying r with table scale and truncate to integer */
1321             rt               = _mm256_mul_pd(r33,vftabscale);
1322             vfitab           = _mm256_cvttpd_epi32(rt);
1323             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1324             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1325
1326             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1327             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1328             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1329             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1330             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1331             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1332             Heps             = _mm256_mul_pd(vfeps,H);
1333             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1334             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1335             velec            = _mm256_mul_pd(qq33,VV);
1336             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1337             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
1338
1339             /* Update potential sum for this i atom from the interaction with this j atom. */
1340             velec            = _mm256_andnot_pd(dummy_mask,velec);
1341             velecsum         = _mm256_add_pd(velecsum,velec);
1342
1343             fscal            = felec;
1344
1345             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1346
1347             /* Calculate temporary vectorial force */
1348             tx               = _mm256_mul_pd(fscal,dx33);
1349             ty               = _mm256_mul_pd(fscal,dy33);
1350             tz               = _mm256_mul_pd(fscal,dz33);
1351
1352             /* Update vectorial force */
1353             fix3             = _mm256_add_pd(fix3,tx);
1354             fiy3             = _mm256_add_pd(fiy3,ty);
1355             fiz3             = _mm256_add_pd(fiz3,tz);
1356
1357             fjx3             = _mm256_add_pd(fjx3,tx);
1358             fjy3             = _mm256_add_pd(fjy3,ty);
1359             fjz3             = _mm256_add_pd(fjz3,tz);
1360
1361             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1362             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1363             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1364             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1365
1366             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1367                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1368                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1369
1370             /* Inner loop uses 456 flops */
1371         }
1372
1373         /* End of innermost loop */
1374
1375         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1376                                                  f+i_coord_offset,fshift+i_shift_offset);
1377
1378         ggid                        = gid[iidx];
1379         /* Update potential energies */
1380         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1381         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1382
1383         /* Increment number of inner iterations */
1384         inneriter                  += j_index_end - j_index_start;
1385
1386         /* Outer loop uses 26 flops */
1387     }
1388
1389     /* Increment number of outer iterations */
1390     outeriter        += nri;
1391
1392     /* Update outer/inner flops */
1393
1394     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*456);
1395 }
1396 /*
1397  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_double
1398  * Electrostatics interaction: CubicSplineTable
1399  * VdW interaction:            CubicSplineTable
1400  * Geometry:                   Water4-Water4
1401  * Calculate force/pot:        Force
1402  */
1403 void
1404 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_double
1405                     (t_nblist * gmx_restrict                nlist,
1406                      rvec * gmx_restrict                    xx,
1407                      rvec * gmx_restrict                    ff,
1408                      t_forcerec * gmx_restrict              fr,
1409                      t_mdatoms * gmx_restrict               mdatoms,
1410                      nb_kernel_data_t * gmx_restrict        kernel_data,
1411                      t_nrnb * gmx_restrict                  nrnb)
1412 {
1413     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1414      * just 0 for non-waters.
1415      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1416      * jnr indices corresponding to data put in the four positions in the SIMD register.
1417      */
1418     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1419     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1420     int              jnrA,jnrB,jnrC,jnrD;
1421     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1422     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1423     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1424     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1425     real             rcutoff_scalar;
1426     real             *shiftvec,*fshift,*x,*f;
1427     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1428     real             scratch[4*DIM];
1429     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1430     real *           vdwioffsetptr0;
1431     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1432     real *           vdwioffsetptr1;
1433     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1434     real *           vdwioffsetptr2;
1435     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1436     real *           vdwioffsetptr3;
1437     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1438     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1439     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1440     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1441     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1442     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1443     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1444     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1445     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1446     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1447     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1448     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1449     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1450     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1451     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1452     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1453     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1454     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1455     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1456     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1457     real             *charge;
1458     int              nvdwtype;
1459     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1460     int              *vdwtype;
1461     real             *vdwparam;
1462     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1463     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1464     __m128i          vfitab;
1465     __m128i          ifour       = _mm_set1_epi32(4);
1466     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1467     real             *vftab;
1468     __m256d          dummy_mask,cutoff_mask;
1469     __m128           tmpmask0,tmpmask1;
1470     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1471     __m256d          one     = _mm256_set1_pd(1.0);
1472     __m256d          two     = _mm256_set1_pd(2.0);
1473     x                = xx[0];
1474     f                = ff[0];
1475
1476     nri              = nlist->nri;
1477     iinr             = nlist->iinr;
1478     jindex           = nlist->jindex;
1479     jjnr             = nlist->jjnr;
1480     shiftidx         = nlist->shift;
1481     gid              = nlist->gid;
1482     shiftvec         = fr->shift_vec[0];
1483     fshift           = fr->fshift[0];
1484     facel            = _mm256_set1_pd(fr->epsfac);
1485     charge           = mdatoms->chargeA;
1486     nvdwtype         = fr->ntype;
1487     vdwparam         = fr->nbfp;
1488     vdwtype          = mdatoms->typeA;
1489
1490     vftab            = kernel_data->table_elec_vdw->data;
1491     vftabscale       = _mm256_set1_pd(kernel_data->table_elec_vdw->scale);
1492
1493     /* Setup water-specific parameters */
1494     inr              = nlist->iinr[0];
1495     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1496     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1497     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1498     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1499
1500     jq1              = _mm256_set1_pd(charge[inr+1]);
1501     jq2              = _mm256_set1_pd(charge[inr+2]);
1502     jq3              = _mm256_set1_pd(charge[inr+3]);
1503     vdwjidx0A        = 2*vdwtype[inr+0];
1504     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1505     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1506     qq11             = _mm256_mul_pd(iq1,jq1);
1507     qq12             = _mm256_mul_pd(iq1,jq2);
1508     qq13             = _mm256_mul_pd(iq1,jq3);
1509     qq21             = _mm256_mul_pd(iq2,jq1);
1510     qq22             = _mm256_mul_pd(iq2,jq2);
1511     qq23             = _mm256_mul_pd(iq2,jq3);
1512     qq31             = _mm256_mul_pd(iq3,jq1);
1513     qq32             = _mm256_mul_pd(iq3,jq2);
1514     qq33             = _mm256_mul_pd(iq3,jq3);
1515
1516     /* Avoid stupid compiler warnings */
1517     jnrA = jnrB = jnrC = jnrD = 0;
1518     j_coord_offsetA = 0;
1519     j_coord_offsetB = 0;
1520     j_coord_offsetC = 0;
1521     j_coord_offsetD = 0;
1522
1523     outeriter        = 0;
1524     inneriter        = 0;
1525
1526     for(iidx=0;iidx<4*DIM;iidx++)
1527     {
1528         scratch[iidx] = 0.0;
1529     }
1530
1531     /* Start outer loop over neighborlists */
1532     for(iidx=0; iidx<nri; iidx++)
1533     {
1534         /* Load shift vector for this list */
1535         i_shift_offset   = DIM*shiftidx[iidx];
1536
1537         /* Load limits for loop over neighbors */
1538         j_index_start    = jindex[iidx];
1539         j_index_end      = jindex[iidx+1];
1540
1541         /* Get outer coordinate index */
1542         inr              = iinr[iidx];
1543         i_coord_offset   = DIM*inr;
1544
1545         /* Load i particle coords and add shift vector */
1546         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1547                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1548
1549         fix0             = _mm256_setzero_pd();
1550         fiy0             = _mm256_setzero_pd();
1551         fiz0             = _mm256_setzero_pd();
1552         fix1             = _mm256_setzero_pd();
1553         fiy1             = _mm256_setzero_pd();
1554         fiz1             = _mm256_setzero_pd();
1555         fix2             = _mm256_setzero_pd();
1556         fiy2             = _mm256_setzero_pd();
1557         fiz2             = _mm256_setzero_pd();
1558         fix3             = _mm256_setzero_pd();
1559         fiy3             = _mm256_setzero_pd();
1560         fiz3             = _mm256_setzero_pd();
1561
1562         /* Start inner kernel loop */
1563         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1564         {
1565
1566             /* Get j neighbor index, and coordinate index */
1567             jnrA             = jjnr[jidx];
1568             jnrB             = jjnr[jidx+1];
1569             jnrC             = jjnr[jidx+2];
1570             jnrD             = jjnr[jidx+3];
1571             j_coord_offsetA  = DIM*jnrA;
1572             j_coord_offsetB  = DIM*jnrB;
1573             j_coord_offsetC  = DIM*jnrC;
1574             j_coord_offsetD  = DIM*jnrD;
1575
1576             /* load j atom coordinates */
1577             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1578                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1579                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1580                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1581
1582             /* Calculate displacement vector */
1583             dx00             = _mm256_sub_pd(ix0,jx0);
1584             dy00             = _mm256_sub_pd(iy0,jy0);
1585             dz00             = _mm256_sub_pd(iz0,jz0);
1586             dx11             = _mm256_sub_pd(ix1,jx1);
1587             dy11             = _mm256_sub_pd(iy1,jy1);
1588             dz11             = _mm256_sub_pd(iz1,jz1);
1589             dx12             = _mm256_sub_pd(ix1,jx2);
1590             dy12             = _mm256_sub_pd(iy1,jy2);
1591             dz12             = _mm256_sub_pd(iz1,jz2);
1592             dx13             = _mm256_sub_pd(ix1,jx3);
1593             dy13             = _mm256_sub_pd(iy1,jy3);
1594             dz13             = _mm256_sub_pd(iz1,jz3);
1595             dx21             = _mm256_sub_pd(ix2,jx1);
1596             dy21             = _mm256_sub_pd(iy2,jy1);
1597             dz21             = _mm256_sub_pd(iz2,jz1);
1598             dx22             = _mm256_sub_pd(ix2,jx2);
1599             dy22             = _mm256_sub_pd(iy2,jy2);
1600             dz22             = _mm256_sub_pd(iz2,jz2);
1601             dx23             = _mm256_sub_pd(ix2,jx3);
1602             dy23             = _mm256_sub_pd(iy2,jy3);
1603             dz23             = _mm256_sub_pd(iz2,jz3);
1604             dx31             = _mm256_sub_pd(ix3,jx1);
1605             dy31             = _mm256_sub_pd(iy3,jy1);
1606             dz31             = _mm256_sub_pd(iz3,jz1);
1607             dx32             = _mm256_sub_pd(ix3,jx2);
1608             dy32             = _mm256_sub_pd(iy3,jy2);
1609             dz32             = _mm256_sub_pd(iz3,jz2);
1610             dx33             = _mm256_sub_pd(ix3,jx3);
1611             dy33             = _mm256_sub_pd(iy3,jy3);
1612             dz33             = _mm256_sub_pd(iz3,jz3);
1613
1614             /* Calculate squared distance and things based on it */
1615             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1616             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1617             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1618             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1619             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1620             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1621             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1622             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1623             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1624             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1625
1626             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1627             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1628             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1629             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
1630             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1631             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1632             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
1633             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
1634             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
1635             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
1636
1637             fjx0             = _mm256_setzero_pd();
1638             fjy0             = _mm256_setzero_pd();
1639             fjz0             = _mm256_setzero_pd();
1640             fjx1             = _mm256_setzero_pd();
1641             fjy1             = _mm256_setzero_pd();
1642             fjz1             = _mm256_setzero_pd();
1643             fjx2             = _mm256_setzero_pd();
1644             fjy2             = _mm256_setzero_pd();
1645             fjz2             = _mm256_setzero_pd();
1646             fjx3             = _mm256_setzero_pd();
1647             fjy3             = _mm256_setzero_pd();
1648             fjz3             = _mm256_setzero_pd();
1649
1650             /**************************
1651              * CALCULATE INTERACTIONS *
1652              **************************/
1653
1654             r00              = _mm256_mul_pd(rsq00,rinv00);
1655
1656             /* Calculate table index by multiplying r with table scale and truncate to integer */
1657             rt               = _mm256_mul_pd(r00,vftabscale);
1658             vfitab           = _mm256_cvttpd_epi32(rt);
1659             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1660             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1661
1662             /* CUBIC SPLINE TABLE DISPERSION */
1663             vfitab           = _mm_add_epi32(vfitab,ifour);
1664             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1665             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1666             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1667             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1668             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1669             Heps             = _mm256_mul_pd(vfeps,H);
1670             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1671             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1672             fvdw6            = _mm256_mul_pd(c6_00,FF);
1673
1674             /* CUBIC SPLINE TABLE REPULSION */
1675             vfitab           = _mm_add_epi32(vfitab,ifour);
1676             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1677             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1678             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1679             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1680             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1681             Heps             = _mm256_mul_pd(vfeps,H);
1682             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1683             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1684             fvdw12           = _mm256_mul_pd(c12_00,FF);
1685             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1686
1687             fscal            = fvdw;
1688
1689             /* Calculate temporary vectorial force */
1690             tx               = _mm256_mul_pd(fscal,dx00);
1691             ty               = _mm256_mul_pd(fscal,dy00);
1692             tz               = _mm256_mul_pd(fscal,dz00);
1693
1694             /* Update vectorial force */
1695             fix0             = _mm256_add_pd(fix0,tx);
1696             fiy0             = _mm256_add_pd(fiy0,ty);
1697             fiz0             = _mm256_add_pd(fiz0,tz);
1698
1699             fjx0             = _mm256_add_pd(fjx0,tx);
1700             fjy0             = _mm256_add_pd(fjy0,ty);
1701             fjz0             = _mm256_add_pd(fjz0,tz);
1702
1703             /**************************
1704              * CALCULATE INTERACTIONS *
1705              **************************/
1706
1707             r11              = _mm256_mul_pd(rsq11,rinv11);
1708
1709             /* Calculate table index by multiplying r with table scale and truncate to integer */
1710             rt               = _mm256_mul_pd(r11,vftabscale);
1711             vfitab           = _mm256_cvttpd_epi32(rt);
1712             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1713             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1714
1715             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1716             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1717             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1718             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1719             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1720             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1721             Heps             = _mm256_mul_pd(vfeps,H);
1722             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1723             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1724             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1725
1726             fscal            = felec;
1727
1728             /* Calculate temporary vectorial force */
1729             tx               = _mm256_mul_pd(fscal,dx11);
1730             ty               = _mm256_mul_pd(fscal,dy11);
1731             tz               = _mm256_mul_pd(fscal,dz11);
1732
1733             /* Update vectorial force */
1734             fix1             = _mm256_add_pd(fix1,tx);
1735             fiy1             = _mm256_add_pd(fiy1,ty);
1736             fiz1             = _mm256_add_pd(fiz1,tz);
1737
1738             fjx1             = _mm256_add_pd(fjx1,tx);
1739             fjy1             = _mm256_add_pd(fjy1,ty);
1740             fjz1             = _mm256_add_pd(fjz1,tz);
1741
1742             /**************************
1743              * CALCULATE INTERACTIONS *
1744              **************************/
1745
1746             r12              = _mm256_mul_pd(rsq12,rinv12);
1747
1748             /* Calculate table index by multiplying r with table scale and truncate to integer */
1749             rt               = _mm256_mul_pd(r12,vftabscale);
1750             vfitab           = _mm256_cvttpd_epi32(rt);
1751             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1752             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1753
1754             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1755             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1756             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1757             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1758             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1759             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1760             Heps             = _mm256_mul_pd(vfeps,H);
1761             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1762             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1763             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1764
1765             fscal            = felec;
1766
1767             /* Calculate temporary vectorial force */
1768             tx               = _mm256_mul_pd(fscal,dx12);
1769             ty               = _mm256_mul_pd(fscal,dy12);
1770             tz               = _mm256_mul_pd(fscal,dz12);
1771
1772             /* Update vectorial force */
1773             fix1             = _mm256_add_pd(fix1,tx);
1774             fiy1             = _mm256_add_pd(fiy1,ty);
1775             fiz1             = _mm256_add_pd(fiz1,tz);
1776
1777             fjx2             = _mm256_add_pd(fjx2,tx);
1778             fjy2             = _mm256_add_pd(fjy2,ty);
1779             fjz2             = _mm256_add_pd(fjz2,tz);
1780
1781             /**************************
1782              * CALCULATE INTERACTIONS *
1783              **************************/
1784
1785             r13              = _mm256_mul_pd(rsq13,rinv13);
1786
1787             /* Calculate table index by multiplying r with table scale and truncate to integer */
1788             rt               = _mm256_mul_pd(r13,vftabscale);
1789             vfitab           = _mm256_cvttpd_epi32(rt);
1790             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1791             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1792
1793             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1794             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1795             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1796             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1797             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1798             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1799             Heps             = _mm256_mul_pd(vfeps,H);
1800             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1801             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1802             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
1803
1804             fscal            = felec;
1805
1806             /* Calculate temporary vectorial force */
1807             tx               = _mm256_mul_pd(fscal,dx13);
1808             ty               = _mm256_mul_pd(fscal,dy13);
1809             tz               = _mm256_mul_pd(fscal,dz13);
1810
1811             /* Update vectorial force */
1812             fix1             = _mm256_add_pd(fix1,tx);
1813             fiy1             = _mm256_add_pd(fiy1,ty);
1814             fiz1             = _mm256_add_pd(fiz1,tz);
1815
1816             fjx3             = _mm256_add_pd(fjx3,tx);
1817             fjy3             = _mm256_add_pd(fjy3,ty);
1818             fjz3             = _mm256_add_pd(fjz3,tz);
1819
1820             /**************************
1821              * CALCULATE INTERACTIONS *
1822              **************************/
1823
1824             r21              = _mm256_mul_pd(rsq21,rinv21);
1825
1826             /* Calculate table index by multiplying r with table scale and truncate to integer */
1827             rt               = _mm256_mul_pd(r21,vftabscale);
1828             vfitab           = _mm256_cvttpd_epi32(rt);
1829             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1830             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1831
1832             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1833             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1834             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1835             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1836             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1837             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1838             Heps             = _mm256_mul_pd(vfeps,H);
1839             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1840             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1841             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1842
1843             fscal            = felec;
1844
1845             /* Calculate temporary vectorial force */
1846             tx               = _mm256_mul_pd(fscal,dx21);
1847             ty               = _mm256_mul_pd(fscal,dy21);
1848             tz               = _mm256_mul_pd(fscal,dz21);
1849
1850             /* Update vectorial force */
1851             fix2             = _mm256_add_pd(fix2,tx);
1852             fiy2             = _mm256_add_pd(fiy2,ty);
1853             fiz2             = _mm256_add_pd(fiz2,tz);
1854
1855             fjx1             = _mm256_add_pd(fjx1,tx);
1856             fjy1             = _mm256_add_pd(fjy1,ty);
1857             fjz1             = _mm256_add_pd(fjz1,tz);
1858
1859             /**************************
1860              * CALCULATE INTERACTIONS *
1861              **************************/
1862
1863             r22              = _mm256_mul_pd(rsq22,rinv22);
1864
1865             /* Calculate table index by multiplying r with table scale and truncate to integer */
1866             rt               = _mm256_mul_pd(r22,vftabscale);
1867             vfitab           = _mm256_cvttpd_epi32(rt);
1868             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1869             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1870
1871             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1872             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1873             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1874             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1875             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1876             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1877             Heps             = _mm256_mul_pd(vfeps,H);
1878             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1879             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1880             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1881
1882             fscal            = felec;
1883
1884             /* Calculate temporary vectorial force */
1885             tx               = _mm256_mul_pd(fscal,dx22);
1886             ty               = _mm256_mul_pd(fscal,dy22);
1887             tz               = _mm256_mul_pd(fscal,dz22);
1888
1889             /* Update vectorial force */
1890             fix2             = _mm256_add_pd(fix2,tx);
1891             fiy2             = _mm256_add_pd(fiy2,ty);
1892             fiz2             = _mm256_add_pd(fiz2,tz);
1893
1894             fjx2             = _mm256_add_pd(fjx2,tx);
1895             fjy2             = _mm256_add_pd(fjy2,ty);
1896             fjz2             = _mm256_add_pd(fjz2,tz);
1897
1898             /**************************
1899              * CALCULATE INTERACTIONS *
1900              **************************/
1901
1902             r23              = _mm256_mul_pd(rsq23,rinv23);
1903
1904             /* Calculate table index by multiplying r with table scale and truncate to integer */
1905             rt               = _mm256_mul_pd(r23,vftabscale);
1906             vfitab           = _mm256_cvttpd_epi32(rt);
1907             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1908             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1909
1910             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1911             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1912             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1913             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1914             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1915             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1916             Heps             = _mm256_mul_pd(vfeps,H);
1917             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1918             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1919             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
1920
1921             fscal            = felec;
1922
1923             /* Calculate temporary vectorial force */
1924             tx               = _mm256_mul_pd(fscal,dx23);
1925             ty               = _mm256_mul_pd(fscal,dy23);
1926             tz               = _mm256_mul_pd(fscal,dz23);
1927
1928             /* Update vectorial force */
1929             fix2             = _mm256_add_pd(fix2,tx);
1930             fiy2             = _mm256_add_pd(fiy2,ty);
1931             fiz2             = _mm256_add_pd(fiz2,tz);
1932
1933             fjx3             = _mm256_add_pd(fjx3,tx);
1934             fjy3             = _mm256_add_pd(fjy3,ty);
1935             fjz3             = _mm256_add_pd(fjz3,tz);
1936
1937             /**************************
1938              * CALCULATE INTERACTIONS *
1939              **************************/
1940
1941             r31              = _mm256_mul_pd(rsq31,rinv31);
1942
1943             /* Calculate table index by multiplying r with table scale and truncate to integer */
1944             rt               = _mm256_mul_pd(r31,vftabscale);
1945             vfitab           = _mm256_cvttpd_epi32(rt);
1946             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1947             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1948
1949             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1950             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1951             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1952             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1953             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1954             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1955             Heps             = _mm256_mul_pd(vfeps,H);
1956             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1957             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1958             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
1959
1960             fscal            = felec;
1961
1962             /* Calculate temporary vectorial force */
1963             tx               = _mm256_mul_pd(fscal,dx31);
1964             ty               = _mm256_mul_pd(fscal,dy31);
1965             tz               = _mm256_mul_pd(fscal,dz31);
1966
1967             /* Update vectorial force */
1968             fix3             = _mm256_add_pd(fix3,tx);
1969             fiy3             = _mm256_add_pd(fiy3,ty);
1970             fiz3             = _mm256_add_pd(fiz3,tz);
1971
1972             fjx1             = _mm256_add_pd(fjx1,tx);
1973             fjy1             = _mm256_add_pd(fjy1,ty);
1974             fjz1             = _mm256_add_pd(fjz1,tz);
1975
1976             /**************************
1977              * CALCULATE INTERACTIONS *
1978              **************************/
1979
1980             r32              = _mm256_mul_pd(rsq32,rinv32);
1981
1982             /* Calculate table index by multiplying r with table scale and truncate to integer */
1983             rt               = _mm256_mul_pd(r32,vftabscale);
1984             vfitab           = _mm256_cvttpd_epi32(rt);
1985             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1986             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1987
1988             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1989             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1990             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1991             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1992             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1993             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1994             Heps             = _mm256_mul_pd(vfeps,H);
1995             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1996             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1997             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
1998
1999             fscal            = felec;
2000
2001             /* Calculate temporary vectorial force */
2002             tx               = _mm256_mul_pd(fscal,dx32);
2003             ty               = _mm256_mul_pd(fscal,dy32);
2004             tz               = _mm256_mul_pd(fscal,dz32);
2005
2006             /* Update vectorial force */
2007             fix3             = _mm256_add_pd(fix3,tx);
2008             fiy3             = _mm256_add_pd(fiy3,ty);
2009             fiz3             = _mm256_add_pd(fiz3,tz);
2010
2011             fjx2             = _mm256_add_pd(fjx2,tx);
2012             fjy2             = _mm256_add_pd(fjy2,ty);
2013             fjz2             = _mm256_add_pd(fjz2,tz);
2014
2015             /**************************
2016              * CALCULATE INTERACTIONS *
2017              **************************/
2018
2019             r33              = _mm256_mul_pd(rsq33,rinv33);
2020
2021             /* Calculate table index by multiplying r with table scale and truncate to integer */
2022             rt               = _mm256_mul_pd(r33,vftabscale);
2023             vfitab           = _mm256_cvttpd_epi32(rt);
2024             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2025             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2026
2027             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2028             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2029             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2030             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2031             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2032             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2033             Heps             = _mm256_mul_pd(vfeps,H);
2034             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2035             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2036             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
2037
2038             fscal            = felec;
2039
2040             /* Calculate temporary vectorial force */
2041             tx               = _mm256_mul_pd(fscal,dx33);
2042             ty               = _mm256_mul_pd(fscal,dy33);
2043             tz               = _mm256_mul_pd(fscal,dz33);
2044
2045             /* Update vectorial force */
2046             fix3             = _mm256_add_pd(fix3,tx);
2047             fiy3             = _mm256_add_pd(fiy3,ty);
2048             fiz3             = _mm256_add_pd(fiz3,tz);
2049
2050             fjx3             = _mm256_add_pd(fjx3,tx);
2051             fjy3             = _mm256_add_pd(fjy3,ty);
2052             fjz3             = _mm256_add_pd(fjz3,tz);
2053
2054             fjptrA             = f+j_coord_offsetA;
2055             fjptrB             = f+j_coord_offsetB;
2056             fjptrC             = f+j_coord_offsetC;
2057             fjptrD             = f+j_coord_offsetD;
2058
2059             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2060                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2061                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2062
2063             /* Inner loop uses 402 flops */
2064         }
2065
2066         if(jidx<j_index_end)
2067         {
2068
2069             /* Get j neighbor index, and coordinate index */
2070             jnrlistA         = jjnr[jidx];
2071             jnrlistB         = jjnr[jidx+1];
2072             jnrlistC         = jjnr[jidx+2];
2073             jnrlistD         = jjnr[jidx+3];
2074             /* Sign of each element will be negative for non-real atoms.
2075              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2076              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2077              */
2078             tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2079
2080             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2081             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2082             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2083
2084             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2085             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2086             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2087             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2088             j_coord_offsetA  = DIM*jnrA;
2089             j_coord_offsetB  = DIM*jnrB;
2090             j_coord_offsetC  = DIM*jnrC;
2091             j_coord_offsetD  = DIM*jnrD;
2092
2093             /* load j atom coordinates */
2094             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2095                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2096                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2097                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
2098
2099             /* Calculate displacement vector */
2100             dx00             = _mm256_sub_pd(ix0,jx0);
2101             dy00             = _mm256_sub_pd(iy0,jy0);
2102             dz00             = _mm256_sub_pd(iz0,jz0);
2103             dx11             = _mm256_sub_pd(ix1,jx1);
2104             dy11             = _mm256_sub_pd(iy1,jy1);
2105             dz11             = _mm256_sub_pd(iz1,jz1);
2106             dx12             = _mm256_sub_pd(ix1,jx2);
2107             dy12             = _mm256_sub_pd(iy1,jy2);
2108             dz12             = _mm256_sub_pd(iz1,jz2);
2109             dx13             = _mm256_sub_pd(ix1,jx3);
2110             dy13             = _mm256_sub_pd(iy1,jy3);
2111             dz13             = _mm256_sub_pd(iz1,jz3);
2112             dx21             = _mm256_sub_pd(ix2,jx1);
2113             dy21             = _mm256_sub_pd(iy2,jy1);
2114             dz21             = _mm256_sub_pd(iz2,jz1);
2115             dx22             = _mm256_sub_pd(ix2,jx2);
2116             dy22             = _mm256_sub_pd(iy2,jy2);
2117             dz22             = _mm256_sub_pd(iz2,jz2);
2118             dx23             = _mm256_sub_pd(ix2,jx3);
2119             dy23             = _mm256_sub_pd(iy2,jy3);
2120             dz23             = _mm256_sub_pd(iz2,jz3);
2121             dx31             = _mm256_sub_pd(ix3,jx1);
2122             dy31             = _mm256_sub_pd(iy3,jy1);
2123             dz31             = _mm256_sub_pd(iz3,jz1);
2124             dx32             = _mm256_sub_pd(ix3,jx2);
2125             dy32             = _mm256_sub_pd(iy3,jy2);
2126             dz32             = _mm256_sub_pd(iz3,jz2);
2127             dx33             = _mm256_sub_pd(ix3,jx3);
2128             dy33             = _mm256_sub_pd(iy3,jy3);
2129             dz33             = _mm256_sub_pd(iz3,jz3);
2130
2131             /* Calculate squared distance and things based on it */
2132             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2133             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2134             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2135             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2136             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2137             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2138             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2139             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2140             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2141             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2142
2143             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
2144             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
2145             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
2146             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
2147             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
2148             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
2149             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
2150             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
2151             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
2152             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
2153
2154             fjx0             = _mm256_setzero_pd();
2155             fjy0             = _mm256_setzero_pd();
2156             fjz0             = _mm256_setzero_pd();
2157             fjx1             = _mm256_setzero_pd();
2158             fjy1             = _mm256_setzero_pd();
2159             fjz1             = _mm256_setzero_pd();
2160             fjx2             = _mm256_setzero_pd();
2161             fjy2             = _mm256_setzero_pd();
2162             fjz2             = _mm256_setzero_pd();
2163             fjx3             = _mm256_setzero_pd();
2164             fjy3             = _mm256_setzero_pd();
2165             fjz3             = _mm256_setzero_pd();
2166
2167             /**************************
2168              * CALCULATE INTERACTIONS *
2169              **************************/
2170
2171             r00              = _mm256_mul_pd(rsq00,rinv00);
2172             r00              = _mm256_andnot_pd(dummy_mask,r00);
2173
2174             /* Calculate table index by multiplying r with table scale and truncate to integer */
2175             rt               = _mm256_mul_pd(r00,vftabscale);
2176             vfitab           = _mm256_cvttpd_epi32(rt);
2177             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2178             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2179
2180             /* CUBIC SPLINE TABLE DISPERSION */
2181             vfitab           = _mm_add_epi32(vfitab,ifour);
2182             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2183             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2184             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2185             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2186             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2187             Heps             = _mm256_mul_pd(vfeps,H);
2188             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2189             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2190             fvdw6            = _mm256_mul_pd(c6_00,FF);
2191
2192             /* CUBIC SPLINE TABLE REPULSION */
2193             vfitab           = _mm_add_epi32(vfitab,ifour);
2194             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2195             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2196             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2197             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2198             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2199             Heps             = _mm256_mul_pd(vfeps,H);
2200             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2201             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2202             fvdw12           = _mm256_mul_pd(c12_00,FF);
2203             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
2204
2205             fscal            = fvdw;
2206
2207             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2208
2209             /* Calculate temporary vectorial force */
2210             tx               = _mm256_mul_pd(fscal,dx00);
2211             ty               = _mm256_mul_pd(fscal,dy00);
2212             tz               = _mm256_mul_pd(fscal,dz00);
2213
2214             /* Update vectorial force */
2215             fix0             = _mm256_add_pd(fix0,tx);
2216             fiy0             = _mm256_add_pd(fiy0,ty);
2217             fiz0             = _mm256_add_pd(fiz0,tz);
2218
2219             fjx0             = _mm256_add_pd(fjx0,tx);
2220             fjy0             = _mm256_add_pd(fjy0,ty);
2221             fjz0             = _mm256_add_pd(fjz0,tz);
2222
2223             /**************************
2224              * CALCULATE INTERACTIONS *
2225              **************************/
2226
2227             r11              = _mm256_mul_pd(rsq11,rinv11);
2228             r11              = _mm256_andnot_pd(dummy_mask,r11);
2229
2230             /* Calculate table index by multiplying r with table scale and truncate to integer */
2231             rt               = _mm256_mul_pd(r11,vftabscale);
2232             vfitab           = _mm256_cvttpd_epi32(rt);
2233             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2234             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2235
2236             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2237             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2238             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2239             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2240             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2241             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2242             Heps             = _mm256_mul_pd(vfeps,H);
2243             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2244             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2245             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
2246
2247             fscal            = felec;
2248
2249             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2250
2251             /* Calculate temporary vectorial force */
2252             tx               = _mm256_mul_pd(fscal,dx11);
2253             ty               = _mm256_mul_pd(fscal,dy11);
2254             tz               = _mm256_mul_pd(fscal,dz11);
2255
2256             /* Update vectorial force */
2257             fix1             = _mm256_add_pd(fix1,tx);
2258             fiy1             = _mm256_add_pd(fiy1,ty);
2259             fiz1             = _mm256_add_pd(fiz1,tz);
2260
2261             fjx1             = _mm256_add_pd(fjx1,tx);
2262             fjy1             = _mm256_add_pd(fjy1,ty);
2263             fjz1             = _mm256_add_pd(fjz1,tz);
2264
2265             /**************************
2266              * CALCULATE INTERACTIONS *
2267              **************************/
2268
2269             r12              = _mm256_mul_pd(rsq12,rinv12);
2270             r12              = _mm256_andnot_pd(dummy_mask,r12);
2271
2272             /* Calculate table index by multiplying r with table scale and truncate to integer */
2273             rt               = _mm256_mul_pd(r12,vftabscale);
2274             vfitab           = _mm256_cvttpd_epi32(rt);
2275             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2276             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2277
2278             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2279             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2280             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2281             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2282             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2283             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2284             Heps             = _mm256_mul_pd(vfeps,H);
2285             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2286             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2287             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
2288
2289             fscal            = felec;
2290
2291             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2292
2293             /* Calculate temporary vectorial force */
2294             tx               = _mm256_mul_pd(fscal,dx12);
2295             ty               = _mm256_mul_pd(fscal,dy12);
2296             tz               = _mm256_mul_pd(fscal,dz12);
2297
2298             /* Update vectorial force */
2299             fix1             = _mm256_add_pd(fix1,tx);
2300             fiy1             = _mm256_add_pd(fiy1,ty);
2301             fiz1             = _mm256_add_pd(fiz1,tz);
2302
2303             fjx2             = _mm256_add_pd(fjx2,tx);
2304             fjy2             = _mm256_add_pd(fjy2,ty);
2305             fjz2             = _mm256_add_pd(fjz2,tz);
2306
2307             /**************************
2308              * CALCULATE INTERACTIONS *
2309              **************************/
2310
2311             r13              = _mm256_mul_pd(rsq13,rinv13);
2312             r13              = _mm256_andnot_pd(dummy_mask,r13);
2313
2314             /* Calculate table index by multiplying r with table scale and truncate to integer */
2315             rt               = _mm256_mul_pd(r13,vftabscale);
2316             vfitab           = _mm256_cvttpd_epi32(rt);
2317             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2318             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2319
2320             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2321             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2322             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2323             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2324             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2325             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2326             Heps             = _mm256_mul_pd(vfeps,H);
2327             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2328             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2329             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
2330
2331             fscal            = felec;
2332
2333             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2334
2335             /* Calculate temporary vectorial force */
2336             tx               = _mm256_mul_pd(fscal,dx13);
2337             ty               = _mm256_mul_pd(fscal,dy13);
2338             tz               = _mm256_mul_pd(fscal,dz13);
2339
2340             /* Update vectorial force */
2341             fix1             = _mm256_add_pd(fix1,tx);
2342             fiy1             = _mm256_add_pd(fiy1,ty);
2343             fiz1             = _mm256_add_pd(fiz1,tz);
2344
2345             fjx3             = _mm256_add_pd(fjx3,tx);
2346             fjy3             = _mm256_add_pd(fjy3,ty);
2347             fjz3             = _mm256_add_pd(fjz3,tz);
2348
2349             /**************************
2350              * CALCULATE INTERACTIONS *
2351              **************************/
2352
2353             r21              = _mm256_mul_pd(rsq21,rinv21);
2354             r21              = _mm256_andnot_pd(dummy_mask,r21);
2355
2356             /* Calculate table index by multiplying r with table scale and truncate to integer */
2357             rt               = _mm256_mul_pd(r21,vftabscale);
2358             vfitab           = _mm256_cvttpd_epi32(rt);
2359             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2360             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2361
2362             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2363             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2364             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2365             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2366             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2367             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2368             Heps             = _mm256_mul_pd(vfeps,H);
2369             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2370             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2371             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
2372
2373             fscal            = felec;
2374
2375             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2376
2377             /* Calculate temporary vectorial force */
2378             tx               = _mm256_mul_pd(fscal,dx21);
2379             ty               = _mm256_mul_pd(fscal,dy21);
2380             tz               = _mm256_mul_pd(fscal,dz21);
2381
2382             /* Update vectorial force */
2383             fix2             = _mm256_add_pd(fix2,tx);
2384             fiy2             = _mm256_add_pd(fiy2,ty);
2385             fiz2             = _mm256_add_pd(fiz2,tz);
2386
2387             fjx1             = _mm256_add_pd(fjx1,tx);
2388             fjy1             = _mm256_add_pd(fjy1,ty);
2389             fjz1             = _mm256_add_pd(fjz1,tz);
2390
2391             /**************************
2392              * CALCULATE INTERACTIONS *
2393              **************************/
2394
2395             r22              = _mm256_mul_pd(rsq22,rinv22);
2396             r22              = _mm256_andnot_pd(dummy_mask,r22);
2397
2398             /* Calculate table index by multiplying r with table scale and truncate to integer */
2399             rt               = _mm256_mul_pd(r22,vftabscale);
2400             vfitab           = _mm256_cvttpd_epi32(rt);
2401             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2402             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2403
2404             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2405             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2406             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2407             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2408             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2409             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2410             Heps             = _mm256_mul_pd(vfeps,H);
2411             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2412             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2413             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
2414
2415             fscal            = felec;
2416
2417             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2418
2419             /* Calculate temporary vectorial force */
2420             tx               = _mm256_mul_pd(fscal,dx22);
2421             ty               = _mm256_mul_pd(fscal,dy22);
2422             tz               = _mm256_mul_pd(fscal,dz22);
2423
2424             /* Update vectorial force */
2425             fix2             = _mm256_add_pd(fix2,tx);
2426             fiy2             = _mm256_add_pd(fiy2,ty);
2427             fiz2             = _mm256_add_pd(fiz2,tz);
2428
2429             fjx2             = _mm256_add_pd(fjx2,tx);
2430             fjy2             = _mm256_add_pd(fjy2,ty);
2431             fjz2             = _mm256_add_pd(fjz2,tz);
2432
2433             /**************************
2434              * CALCULATE INTERACTIONS *
2435              **************************/
2436
2437             r23              = _mm256_mul_pd(rsq23,rinv23);
2438             r23              = _mm256_andnot_pd(dummy_mask,r23);
2439
2440             /* Calculate table index by multiplying r with table scale and truncate to integer */
2441             rt               = _mm256_mul_pd(r23,vftabscale);
2442             vfitab           = _mm256_cvttpd_epi32(rt);
2443             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2444             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2445
2446             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2447             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2448             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2449             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2450             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2451             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2452             Heps             = _mm256_mul_pd(vfeps,H);
2453             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2454             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2455             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
2456
2457             fscal            = felec;
2458
2459             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2460
2461             /* Calculate temporary vectorial force */
2462             tx               = _mm256_mul_pd(fscal,dx23);
2463             ty               = _mm256_mul_pd(fscal,dy23);
2464             tz               = _mm256_mul_pd(fscal,dz23);
2465
2466             /* Update vectorial force */
2467             fix2             = _mm256_add_pd(fix2,tx);
2468             fiy2             = _mm256_add_pd(fiy2,ty);
2469             fiz2             = _mm256_add_pd(fiz2,tz);
2470
2471             fjx3             = _mm256_add_pd(fjx3,tx);
2472             fjy3             = _mm256_add_pd(fjy3,ty);
2473             fjz3             = _mm256_add_pd(fjz3,tz);
2474
2475             /**************************
2476              * CALCULATE INTERACTIONS *
2477              **************************/
2478
2479             r31              = _mm256_mul_pd(rsq31,rinv31);
2480             r31              = _mm256_andnot_pd(dummy_mask,r31);
2481
2482             /* Calculate table index by multiplying r with table scale and truncate to integer */
2483             rt               = _mm256_mul_pd(r31,vftabscale);
2484             vfitab           = _mm256_cvttpd_epi32(rt);
2485             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2486             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2487
2488             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2489             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2490             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2491             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2492             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2493             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2494             Heps             = _mm256_mul_pd(vfeps,H);
2495             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2496             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2497             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
2498
2499             fscal            = felec;
2500
2501             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2502
2503             /* Calculate temporary vectorial force */
2504             tx               = _mm256_mul_pd(fscal,dx31);
2505             ty               = _mm256_mul_pd(fscal,dy31);
2506             tz               = _mm256_mul_pd(fscal,dz31);
2507
2508             /* Update vectorial force */
2509             fix3             = _mm256_add_pd(fix3,tx);
2510             fiy3             = _mm256_add_pd(fiy3,ty);
2511             fiz3             = _mm256_add_pd(fiz3,tz);
2512
2513             fjx1             = _mm256_add_pd(fjx1,tx);
2514             fjy1             = _mm256_add_pd(fjy1,ty);
2515             fjz1             = _mm256_add_pd(fjz1,tz);
2516
2517             /**************************
2518              * CALCULATE INTERACTIONS *
2519              **************************/
2520
2521             r32              = _mm256_mul_pd(rsq32,rinv32);
2522             r32              = _mm256_andnot_pd(dummy_mask,r32);
2523
2524             /* Calculate table index by multiplying r with table scale and truncate to integer */
2525             rt               = _mm256_mul_pd(r32,vftabscale);
2526             vfitab           = _mm256_cvttpd_epi32(rt);
2527             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2528             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2529
2530             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2531             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2532             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2533             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2534             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2535             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2536             Heps             = _mm256_mul_pd(vfeps,H);
2537             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2538             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2539             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
2540
2541             fscal            = felec;
2542
2543             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2544
2545             /* Calculate temporary vectorial force */
2546             tx               = _mm256_mul_pd(fscal,dx32);
2547             ty               = _mm256_mul_pd(fscal,dy32);
2548             tz               = _mm256_mul_pd(fscal,dz32);
2549
2550             /* Update vectorial force */
2551             fix3             = _mm256_add_pd(fix3,tx);
2552             fiy3             = _mm256_add_pd(fiy3,ty);
2553             fiz3             = _mm256_add_pd(fiz3,tz);
2554
2555             fjx2             = _mm256_add_pd(fjx2,tx);
2556             fjy2             = _mm256_add_pd(fjy2,ty);
2557             fjz2             = _mm256_add_pd(fjz2,tz);
2558
2559             /**************************
2560              * CALCULATE INTERACTIONS *
2561              **************************/
2562
2563             r33              = _mm256_mul_pd(rsq33,rinv33);
2564             r33              = _mm256_andnot_pd(dummy_mask,r33);
2565
2566             /* Calculate table index by multiplying r with table scale and truncate to integer */
2567             rt               = _mm256_mul_pd(r33,vftabscale);
2568             vfitab           = _mm256_cvttpd_epi32(rt);
2569             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2570             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2571
2572             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2573             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2574             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2575             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2576             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2577             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2578             Heps             = _mm256_mul_pd(vfeps,H);
2579             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2580             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2581             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
2582
2583             fscal            = felec;
2584
2585             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2586
2587             /* Calculate temporary vectorial force */
2588             tx               = _mm256_mul_pd(fscal,dx33);
2589             ty               = _mm256_mul_pd(fscal,dy33);
2590             tz               = _mm256_mul_pd(fscal,dz33);
2591
2592             /* Update vectorial force */
2593             fix3             = _mm256_add_pd(fix3,tx);
2594             fiy3             = _mm256_add_pd(fiy3,ty);
2595             fiz3             = _mm256_add_pd(fiz3,tz);
2596
2597             fjx3             = _mm256_add_pd(fjx3,tx);
2598             fjy3             = _mm256_add_pd(fjy3,ty);
2599             fjz3             = _mm256_add_pd(fjz3,tz);
2600
2601             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2602             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2603             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2604             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2605
2606             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2607                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2608                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2609
2610             /* Inner loop uses 412 flops */
2611         }
2612
2613         /* End of innermost loop */
2614
2615         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2616                                                  f+i_coord_offset,fshift+i_shift_offset);
2617
2618         /* Increment number of inner iterations */
2619         inneriter                  += j_index_end - j_index_start;
2620
2621         /* Outer loop uses 24 flops */
2622     }
2623
2624     /* Increment number of outer iterations */
2625     outeriter        += nri;
2626
2627     /* Update outer/inner flops */
2628
2629     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*412);
2630 }