Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_avx_256_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_single
38  * Electrostatics interaction: CubicSplineTable
39  * VdW interaction:            CubicSplineTable
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56      * jnr indices corresponding to data put in the eight positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrE,jnrF,jnrG,jnrH;
62     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
67     real             rcutoff_scalar;
68     real             *shiftvec,*fshift,*x,*f;
69     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
70     real             scratch[4*DIM];
71     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72     real *           vdwioffsetptr0;
73     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74     real *           vdwioffsetptr1;
75     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76     real *           vdwioffsetptr2;
77     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78     real *           vdwioffsetptr3;
79     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
80     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
81     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
82     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
83     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
84     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
85     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
86     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
87     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
88     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
89     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
90     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
91     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
92     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
93     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
94     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
95     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
96     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
97     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
98     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
99     real             *charge;
100     int              nvdwtype;
101     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102     int              *vdwtype;
103     real             *vdwparam;
104     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
105     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
106     __m256i          vfitab;
107     __m128i          vfitab_lo,vfitab_hi;
108     __m128i          ifour       = _mm_set1_epi32(4);
109     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
110     real             *vftab;
111     __m256           dummy_mask,cutoff_mask;
112     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
113     __m256           one     = _mm256_set1_ps(1.0);
114     __m256           two     = _mm256_set1_ps(2.0);
115     x                = xx[0];
116     f                = ff[0];
117
118     nri              = nlist->nri;
119     iinr             = nlist->iinr;
120     jindex           = nlist->jindex;
121     jjnr             = nlist->jjnr;
122     shiftidx         = nlist->shift;
123     gid              = nlist->gid;
124     shiftvec         = fr->shift_vec[0];
125     fshift           = fr->fshift[0];
126     facel            = _mm256_set1_ps(fr->epsfac);
127     charge           = mdatoms->chargeA;
128     nvdwtype         = fr->ntype;
129     vdwparam         = fr->nbfp;
130     vdwtype          = mdatoms->typeA;
131
132     vftab            = kernel_data->table_elec_vdw->data;
133     vftabscale       = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
134
135     /* Setup water-specific parameters */
136     inr              = nlist->iinr[0];
137     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
138     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
139     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
140     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
141
142     jq1              = _mm256_set1_ps(charge[inr+1]);
143     jq2              = _mm256_set1_ps(charge[inr+2]);
144     jq3              = _mm256_set1_ps(charge[inr+3]);
145     vdwjidx0A        = 2*vdwtype[inr+0];
146     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
147     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
148     qq11             = _mm256_mul_ps(iq1,jq1);
149     qq12             = _mm256_mul_ps(iq1,jq2);
150     qq13             = _mm256_mul_ps(iq1,jq3);
151     qq21             = _mm256_mul_ps(iq2,jq1);
152     qq22             = _mm256_mul_ps(iq2,jq2);
153     qq23             = _mm256_mul_ps(iq2,jq3);
154     qq31             = _mm256_mul_ps(iq3,jq1);
155     qq32             = _mm256_mul_ps(iq3,jq2);
156     qq33             = _mm256_mul_ps(iq3,jq3);
157
158     /* Avoid stupid compiler warnings */
159     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
160     j_coord_offsetA = 0;
161     j_coord_offsetB = 0;
162     j_coord_offsetC = 0;
163     j_coord_offsetD = 0;
164     j_coord_offsetE = 0;
165     j_coord_offsetF = 0;
166     j_coord_offsetG = 0;
167     j_coord_offsetH = 0;
168
169     outeriter        = 0;
170     inneriter        = 0;
171
172     for(iidx=0;iidx<4*DIM;iidx++)
173     {
174         scratch[iidx] = 0.0;
175     }
176
177     /* Start outer loop over neighborlists */
178     for(iidx=0; iidx<nri; iidx++)
179     {
180         /* Load shift vector for this list */
181         i_shift_offset   = DIM*shiftidx[iidx];
182
183         /* Load limits for loop over neighbors */
184         j_index_start    = jindex[iidx];
185         j_index_end      = jindex[iidx+1];
186
187         /* Get outer coordinate index */
188         inr              = iinr[iidx];
189         i_coord_offset   = DIM*inr;
190
191         /* Load i particle coords and add shift vector */
192         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
193                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
194
195         fix0             = _mm256_setzero_ps();
196         fiy0             = _mm256_setzero_ps();
197         fiz0             = _mm256_setzero_ps();
198         fix1             = _mm256_setzero_ps();
199         fiy1             = _mm256_setzero_ps();
200         fiz1             = _mm256_setzero_ps();
201         fix2             = _mm256_setzero_ps();
202         fiy2             = _mm256_setzero_ps();
203         fiz2             = _mm256_setzero_ps();
204         fix3             = _mm256_setzero_ps();
205         fiy3             = _mm256_setzero_ps();
206         fiz3             = _mm256_setzero_ps();
207
208         /* Reset potential sums */
209         velecsum         = _mm256_setzero_ps();
210         vvdwsum          = _mm256_setzero_ps();
211
212         /* Start inner kernel loop */
213         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
214         {
215
216             /* Get j neighbor index, and coordinate index */
217             jnrA             = jjnr[jidx];
218             jnrB             = jjnr[jidx+1];
219             jnrC             = jjnr[jidx+2];
220             jnrD             = jjnr[jidx+3];
221             jnrE             = jjnr[jidx+4];
222             jnrF             = jjnr[jidx+5];
223             jnrG             = jjnr[jidx+6];
224             jnrH             = jjnr[jidx+7];
225             j_coord_offsetA  = DIM*jnrA;
226             j_coord_offsetB  = DIM*jnrB;
227             j_coord_offsetC  = DIM*jnrC;
228             j_coord_offsetD  = DIM*jnrD;
229             j_coord_offsetE  = DIM*jnrE;
230             j_coord_offsetF  = DIM*jnrF;
231             j_coord_offsetG  = DIM*jnrG;
232             j_coord_offsetH  = DIM*jnrH;
233
234             /* load j atom coordinates */
235             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
236                                                  x+j_coord_offsetC,x+j_coord_offsetD,
237                                                  x+j_coord_offsetE,x+j_coord_offsetF,
238                                                  x+j_coord_offsetG,x+j_coord_offsetH,
239                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
240                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
241
242             /* Calculate displacement vector */
243             dx00             = _mm256_sub_ps(ix0,jx0);
244             dy00             = _mm256_sub_ps(iy0,jy0);
245             dz00             = _mm256_sub_ps(iz0,jz0);
246             dx11             = _mm256_sub_ps(ix1,jx1);
247             dy11             = _mm256_sub_ps(iy1,jy1);
248             dz11             = _mm256_sub_ps(iz1,jz1);
249             dx12             = _mm256_sub_ps(ix1,jx2);
250             dy12             = _mm256_sub_ps(iy1,jy2);
251             dz12             = _mm256_sub_ps(iz1,jz2);
252             dx13             = _mm256_sub_ps(ix1,jx3);
253             dy13             = _mm256_sub_ps(iy1,jy3);
254             dz13             = _mm256_sub_ps(iz1,jz3);
255             dx21             = _mm256_sub_ps(ix2,jx1);
256             dy21             = _mm256_sub_ps(iy2,jy1);
257             dz21             = _mm256_sub_ps(iz2,jz1);
258             dx22             = _mm256_sub_ps(ix2,jx2);
259             dy22             = _mm256_sub_ps(iy2,jy2);
260             dz22             = _mm256_sub_ps(iz2,jz2);
261             dx23             = _mm256_sub_ps(ix2,jx3);
262             dy23             = _mm256_sub_ps(iy2,jy3);
263             dz23             = _mm256_sub_ps(iz2,jz3);
264             dx31             = _mm256_sub_ps(ix3,jx1);
265             dy31             = _mm256_sub_ps(iy3,jy1);
266             dz31             = _mm256_sub_ps(iz3,jz1);
267             dx32             = _mm256_sub_ps(ix3,jx2);
268             dy32             = _mm256_sub_ps(iy3,jy2);
269             dz32             = _mm256_sub_ps(iz3,jz2);
270             dx33             = _mm256_sub_ps(ix3,jx3);
271             dy33             = _mm256_sub_ps(iy3,jy3);
272             dz33             = _mm256_sub_ps(iz3,jz3);
273
274             /* Calculate squared distance and things based on it */
275             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
276             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
277             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
278             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
279             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
280             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
281             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
282             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
283             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
284             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
285
286             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
287             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
288             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
289             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
290             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
291             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
292             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
293             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
294             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
295             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
296
297             fjx0             = _mm256_setzero_ps();
298             fjy0             = _mm256_setzero_ps();
299             fjz0             = _mm256_setzero_ps();
300             fjx1             = _mm256_setzero_ps();
301             fjy1             = _mm256_setzero_ps();
302             fjz1             = _mm256_setzero_ps();
303             fjx2             = _mm256_setzero_ps();
304             fjy2             = _mm256_setzero_ps();
305             fjz2             = _mm256_setzero_ps();
306             fjx3             = _mm256_setzero_ps();
307             fjy3             = _mm256_setzero_ps();
308             fjz3             = _mm256_setzero_ps();
309
310             /**************************
311              * CALCULATE INTERACTIONS *
312              **************************/
313
314             r00              = _mm256_mul_ps(rsq00,rinv00);
315
316             /* Calculate table index by multiplying r with table scale and truncate to integer */
317             rt               = _mm256_mul_ps(r00,vftabscale);
318             vfitab           = _mm256_cvttps_epi32(rt);
319             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
320             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
321             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
322             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
323             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
324             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
325
326             /* CUBIC SPLINE TABLE DISPERSION */
327             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
328             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
329             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
330                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
331             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
332                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
333             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
334                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
335             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
336                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
337             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
338             Heps             = _mm256_mul_ps(vfeps,H);
339             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
340             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
341             vvdw6            = _mm256_mul_ps(c6_00,VV);
342             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
343             fvdw6            = _mm256_mul_ps(c6_00,FF);
344
345             /* CUBIC SPLINE TABLE REPULSION */
346             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
347             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
348             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
349                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
350             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
351                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
352             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
353                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
354             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
355                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
356             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
357             Heps             = _mm256_mul_ps(vfeps,H);
358             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
359             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
360             vvdw12           = _mm256_mul_ps(c12_00,VV);
361             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
362             fvdw12           = _mm256_mul_ps(c12_00,FF);
363             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
364             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
365
366             /* Update potential sum for this i atom from the interaction with this j atom. */
367             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
368
369             fscal            = fvdw;
370
371             /* Calculate temporary vectorial force */
372             tx               = _mm256_mul_ps(fscal,dx00);
373             ty               = _mm256_mul_ps(fscal,dy00);
374             tz               = _mm256_mul_ps(fscal,dz00);
375
376             /* Update vectorial force */
377             fix0             = _mm256_add_ps(fix0,tx);
378             fiy0             = _mm256_add_ps(fiy0,ty);
379             fiz0             = _mm256_add_ps(fiz0,tz);
380
381             fjx0             = _mm256_add_ps(fjx0,tx);
382             fjy0             = _mm256_add_ps(fjy0,ty);
383             fjz0             = _mm256_add_ps(fjz0,tz);
384
385             /**************************
386              * CALCULATE INTERACTIONS *
387              **************************/
388
389             r11              = _mm256_mul_ps(rsq11,rinv11);
390
391             /* Calculate table index by multiplying r with table scale and truncate to integer */
392             rt               = _mm256_mul_ps(r11,vftabscale);
393             vfitab           = _mm256_cvttps_epi32(rt);
394             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
395             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
396             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
397             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
398             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
399             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
400
401             /* CUBIC SPLINE TABLE ELECTROSTATICS */
402             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
403                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
404             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
405                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
406             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
407                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
408             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
409                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
410             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
411             Heps             = _mm256_mul_ps(vfeps,H);
412             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
413             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
414             velec            = _mm256_mul_ps(qq11,VV);
415             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
416             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
417
418             /* Update potential sum for this i atom from the interaction with this j atom. */
419             velecsum         = _mm256_add_ps(velecsum,velec);
420
421             fscal            = felec;
422
423             /* Calculate temporary vectorial force */
424             tx               = _mm256_mul_ps(fscal,dx11);
425             ty               = _mm256_mul_ps(fscal,dy11);
426             tz               = _mm256_mul_ps(fscal,dz11);
427
428             /* Update vectorial force */
429             fix1             = _mm256_add_ps(fix1,tx);
430             fiy1             = _mm256_add_ps(fiy1,ty);
431             fiz1             = _mm256_add_ps(fiz1,tz);
432
433             fjx1             = _mm256_add_ps(fjx1,tx);
434             fjy1             = _mm256_add_ps(fjy1,ty);
435             fjz1             = _mm256_add_ps(fjz1,tz);
436
437             /**************************
438              * CALCULATE INTERACTIONS *
439              **************************/
440
441             r12              = _mm256_mul_ps(rsq12,rinv12);
442
443             /* Calculate table index by multiplying r with table scale and truncate to integer */
444             rt               = _mm256_mul_ps(r12,vftabscale);
445             vfitab           = _mm256_cvttps_epi32(rt);
446             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
447             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
448             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
449             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
450             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
451             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
452
453             /* CUBIC SPLINE TABLE ELECTROSTATICS */
454             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
455                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
456             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
457                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
458             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
459                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
460             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
461                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
462             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
463             Heps             = _mm256_mul_ps(vfeps,H);
464             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
465             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
466             velec            = _mm256_mul_ps(qq12,VV);
467             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
468             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
469
470             /* Update potential sum for this i atom from the interaction with this j atom. */
471             velecsum         = _mm256_add_ps(velecsum,velec);
472
473             fscal            = felec;
474
475             /* Calculate temporary vectorial force */
476             tx               = _mm256_mul_ps(fscal,dx12);
477             ty               = _mm256_mul_ps(fscal,dy12);
478             tz               = _mm256_mul_ps(fscal,dz12);
479
480             /* Update vectorial force */
481             fix1             = _mm256_add_ps(fix1,tx);
482             fiy1             = _mm256_add_ps(fiy1,ty);
483             fiz1             = _mm256_add_ps(fiz1,tz);
484
485             fjx2             = _mm256_add_ps(fjx2,tx);
486             fjy2             = _mm256_add_ps(fjy2,ty);
487             fjz2             = _mm256_add_ps(fjz2,tz);
488
489             /**************************
490              * CALCULATE INTERACTIONS *
491              **************************/
492
493             r13              = _mm256_mul_ps(rsq13,rinv13);
494
495             /* Calculate table index by multiplying r with table scale and truncate to integer */
496             rt               = _mm256_mul_ps(r13,vftabscale);
497             vfitab           = _mm256_cvttps_epi32(rt);
498             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
499             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
500             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
501             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
502             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
503             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
504
505             /* CUBIC SPLINE TABLE ELECTROSTATICS */
506             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
507                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
508             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
509                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
510             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
511                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
512             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
513                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
514             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
515             Heps             = _mm256_mul_ps(vfeps,H);
516             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
517             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
518             velec            = _mm256_mul_ps(qq13,VV);
519             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
520             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
521
522             /* Update potential sum for this i atom from the interaction with this j atom. */
523             velecsum         = _mm256_add_ps(velecsum,velec);
524
525             fscal            = felec;
526
527             /* Calculate temporary vectorial force */
528             tx               = _mm256_mul_ps(fscal,dx13);
529             ty               = _mm256_mul_ps(fscal,dy13);
530             tz               = _mm256_mul_ps(fscal,dz13);
531
532             /* Update vectorial force */
533             fix1             = _mm256_add_ps(fix1,tx);
534             fiy1             = _mm256_add_ps(fiy1,ty);
535             fiz1             = _mm256_add_ps(fiz1,tz);
536
537             fjx3             = _mm256_add_ps(fjx3,tx);
538             fjy3             = _mm256_add_ps(fjy3,ty);
539             fjz3             = _mm256_add_ps(fjz3,tz);
540
541             /**************************
542              * CALCULATE INTERACTIONS *
543              **************************/
544
545             r21              = _mm256_mul_ps(rsq21,rinv21);
546
547             /* Calculate table index by multiplying r with table scale and truncate to integer */
548             rt               = _mm256_mul_ps(r21,vftabscale);
549             vfitab           = _mm256_cvttps_epi32(rt);
550             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
551             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
552             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
553             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
554             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
555             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
556
557             /* CUBIC SPLINE TABLE ELECTROSTATICS */
558             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
559                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
560             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
561                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
562             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
563                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
564             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
565                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
566             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
567             Heps             = _mm256_mul_ps(vfeps,H);
568             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
569             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
570             velec            = _mm256_mul_ps(qq21,VV);
571             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
572             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
573
574             /* Update potential sum for this i atom from the interaction with this j atom. */
575             velecsum         = _mm256_add_ps(velecsum,velec);
576
577             fscal            = felec;
578
579             /* Calculate temporary vectorial force */
580             tx               = _mm256_mul_ps(fscal,dx21);
581             ty               = _mm256_mul_ps(fscal,dy21);
582             tz               = _mm256_mul_ps(fscal,dz21);
583
584             /* Update vectorial force */
585             fix2             = _mm256_add_ps(fix2,tx);
586             fiy2             = _mm256_add_ps(fiy2,ty);
587             fiz2             = _mm256_add_ps(fiz2,tz);
588
589             fjx1             = _mm256_add_ps(fjx1,tx);
590             fjy1             = _mm256_add_ps(fjy1,ty);
591             fjz1             = _mm256_add_ps(fjz1,tz);
592
593             /**************************
594              * CALCULATE INTERACTIONS *
595              **************************/
596
597             r22              = _mm256_mul_ps(rsq22,rinv22);
598
599             /* Calculate table index by multiplying r with table scale and truncate to integer */
600             rt               = _mm256_mul_ps(r22,vftabscale);
601             vfitab           = _mm256_cvttps_epi32(rt);
602             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
603             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
604             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
605             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
606             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
607             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
608
609             /* CUBIC SPLINE TABLE ELECTROSTATICS */
610             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
611                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
612             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
613                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
614             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
615                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
616             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
617                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
618             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
619             Heps             = _mm256_mul_ps(vfeps,H);
620             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
621             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
622             velec            = _mm256_mul_ps(qq22,VV);
623             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
624             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
625
626             /* Update potential sum for this i atom from the interaction with this j atom. */
627             velecsum         = _mm256_add_ps(velecsum,velec);
628
629             fscal            = felec;
630
631             /* Calculate temporary vectorial force */
632             tx               = _mm256_mul_ps(fscal,dx22);
633             ty               = _mm256_mul_ps(fscal,dy22);
634             tz               = _mm256_mul_ps(fscal,dz22);
635
636             /* Update vectorial force */
637             fix2             = _mm256_add_ps(fix2,tx);
638             fiy2             = _mm256_add_ps(fiy2,ty);
639             fiz2             = _mm256_add_ps(fiz2,tz);
640
641             fjx2             = _mm256_add_ps(fjx2,tx);
642             fjy2             = _mm256_add_ps(fjy2,ty);
643             fjz2             = _mm256_add_ps(fjz2,tz);
644
645             /**************************
646              * CALCULATE INTERACTIONS *
647              **************************/
648
649             r23              = _mm256_mul_ps(rsq23,rinv23);
650
651             /* Calculate table index by multiplying r with table scale and truncate to integer */
652             rt               = _mm256_mul_ps(r23,vftabscale);
653             vfitab           = _mm256_cvttps_epi32(rt);
654             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
655             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
656             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
657             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
658             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
659             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
660
661             /* CUBIC SPLINE TABLE ELECTROSTATICS */
662             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
663                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
664             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
665                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
666             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
667                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
668             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
669                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
670             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
671             Heps             = _mm256_mul_ps(vfeps,H);
672             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
673             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
674             velec            = _mm256_mul_ps(qq23,VV);
675             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
676             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
677
678             /* Update potential sum for this i atom from the interaction with this j atom. */
679             velecsum         = _mm256_add_ps(velecsum,velec);
680
681             fscal            = felec;
682
683             /* Calculate temporary vectorial force */
684             tx               = _mm256_mul_ps(fscal,dx23);
685             ty               = _mm256_mul_ps(fscal,dy23);
686             tz               = _mm256_mul_ps(fscal,dz23);
687
688             /* Update vectorial force */
689             fix2             = _mm256_add_ps(fix2,tx);
690             fiy2             = _mm256_add_ps(fiy2,ty);
691             fiz2             = _mm256_add_ps(fiz2,tz);
692
693             fjx3             = _mm256_add_ps(fjx3,tx);
694             fjy3             = _mm256_add_ps(fjy3,ty);
695             fjz3             = _mm256_add_ps(fjz3,tz);
696
697             /**************************
698              * CALCULATE INTERACTIONS *
699              **************************/
700
701             r31              = _mm256_mul_ps(rsq31,rinv31);
702
703             /* Calculate table index by multiplying r with table scale and truncate to integer */
704             rt               = _mm256_mul_ps(r31,vftabscale);
705             vfitab           = _mm256_cvttps_epi32(rt);
706             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
707             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
708             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
709             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
710             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
711             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
712
713             /* CUBIC SPLINE TABLE ELECTROSTATICS */
714             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
715                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
716             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
717                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
718             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
719                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
720             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
721                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
722             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
723             Heps             = _mm256_mul_ps(vfeps,H);
724             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
725             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
726             velec            = _mm256_mul_ps(qq31,VV);
727             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
728             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
729
730             /* Update potential sum for this i atom from the interaction with this j atom. */
731             velecsum         = _mm256_add_ps(velecsum,velec);
732
733             fscal            = felec;
734
735             /* Calculate temporary vectorial force */
736             tx               = _mm256_mul_ps(fscal,dx31);
737             ty               = _mm256_mul_ps(fscal,dy31);
738             tz               = _mm256_mul_ps(fscal,dz31);
739
740             /* Update vectorial force */
741             fix3             = _mm256_add_ps(fix3,tx);
742             fiy3             = _mm256_add_ps(fiy3,ty);
743             fiz3             = _mm256_add_ps(fiz3,tz);
744
745             fjx1             = _mm256_add_ps(fjx1,tx);
746             fjy1             = _mm256_add_ps(fjy1,ty);
747             fjz1             = _mm256_add_ps(fjz1,tz);
748
749             /**************************
750              * CALCULATE INTERACTIONS *
751              **************************/
752
753             r32              = _mm256_mul_ps(rsq32,rinv32);
754
755             /* Calculate table index by multiplying r with table scale and truncate to integer */
756             rt               = _mm256_mul_ps(r32,vftabscale);
757             vfitab           = _mm256_cvttps_epi32(rt);
758             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
759             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
760             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
761             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
762             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
763             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
764
765             /* CUBIC SPLINE TABLE ELECTROSTATICS */
766             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
767                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
768             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
769                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
770             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
771                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
772             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
773                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
774             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
775             Heps             = _mm256_mul_ps(vfeps,H);
776             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
777             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
778             velec            = _mm256_mul_ps(qq32,VV);
779             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
780             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
781
782             /* Update potential sum for this i atom from the interaction with this j atom. */
783             velecsum         = _mm256_add_ps(velecsum,velec);
784
785             fscal            = felec;
786
787             /* Calculate temporary vectorial force */
788             tx               = _mm256_mul_ps(fscal,dx32);
789             ty               = _mm256_mul_ps(fscal,dy32);
790             tz               = _mm256_mul_ps(fscal,dz32);
791
792             /* Update vectorial force */
793             fix3             = _mm256_add_ps(fix3,tx);
794             fiy3             = _mm256_add_ps(fiy3,ty);
795             fiz3             = _mm256_add_ps(fiz3,tz);
796
797             fjx2             = _mm256_add_ps(fjx2,tx);
798             fjy2             = _mm256_add_ps(fjy2,ty);
799             fjz2             = _mm256_add_ps(fjz2,tz);
800
801             /**************************
802              * CALCULATE INTERACTIONS *
803              **************************/
804
805             r33              = _mm256_mul_ps(rsq33,rinv33);
806
807             /* Calculate table index by multiplying r with table scale and truncate to integer */
808             rt               = _mm256_mul_ps(r33,vftabscale);
809             vfitab           = _mm256_cvttps_epi32(rt);
810             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
811             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
812             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
813             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
814             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
815             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
816
817             /* CUBIC SPLINE TABLE ELECTROSTATICS */
818             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
819                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
820             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
821                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
822             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
823                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
824             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
825                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
826             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
827             Heps             = _mm256_mul_ps(vfeps,H);
828             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
829             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
830             velec            = _mm256_mul_ps(qq33,VV);
831             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
832             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
833
834             /* Update potential sum for this i atom from the interaction with this j atom. */
835             velecsum         = _mm256_add_ps(velecsum,velec);
836
837             fscal            = felec;
838
839             /* Calculate temporary vectorial force */
840             tx               = _mm256_mul_ps(fscal,dx33);
841             ty               = _mm256_mul_ps(fscal,dy33);
842             tz               = _mm256_mul_ps(fscal,dz33);
843
844             /* Update vectorial force */
845             fix3             = _mm256_add_ps(fix3,tx);
846             fiy3             = _mm256_add_ps(fiy3,ty);
847             fiz3             = _mm256_add_ps(fiz3,tz);
848
849             fjx3             = _mm256_add_ps(fjx3,tx);
850             fjy3             = _mm256_add_ps(fjy3,ty);
851             fjz3             = _mm256_add_ps(fjz3,tz);
852
853             fjptrA             = f+j_coord_offsetA;
854             fjptrB             = f+j_coord_offsetB;
855             fjptrC             = f+j_coord_offsetC;
856             fjptrD             = f+j_coord_offsetD;
857             fjptrE             = f+j_coord_offsetE;
858             fjptrF             = f+j_coord_offsetF;
859             fjptrG             = f+j_coord_offsetG;
860             fjptrH             = f+j_coord_offsetH;
861
862             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
863                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
864                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
865
866             /* Inner loop uses 446 flops */
867         }
868
869         if(jidx<j_index_end)
870         {
871
872             /* Get j neighbor index, and coordinate index */
873             jnrlistA         = jjnr[jidx];
874             jnrlistB         = jjnr[jidx+1];
875             jnrlistC         = jjnr[jidx+2];
876             jnrlistD         = jjnr[jidx+3];
877             jnrlistE         = jjnr[jidx+4];
878             jnrlistF         = jjnr[jidx+5];
879             jnrlistG         = jjnr[jidx+6];
880             jnrlistH         = jjnr[jidx+7];
881             /* Each jjnr entry will be negative for non-real (dummy) atoms.
882              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
883              * so use it as val = _mm256_andnot_ps(dummy_mask,val) to clear dummy entries.
884              */
885             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
886                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
887                                             
888             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
889             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
890             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
891             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
892             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
893             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
894             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
895             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
896             j_coord_offsetA  = DIM*jnrA;
897             j_coord_offsetB  = DIM*jnrB;
898             j_coord_offsetC  = DIM*jnrC;
899             j_coord_offsetD  = DIM*jnrD;
900             j_coord_offsetE  = DIM*jnrE;
901             j_coord_offsetF  = DIM*jnrF;
902             j_coord_offsetG  = DIM*jnrG;
903             j_coord_offsetH  = DIM*jnrH;
904
905             /* load j atom coordinates */
906             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
907                                                  x+j_coord_offsetC,x+j_coord_offsetD,
908                                                  x+j_coord_offsetE,x+j_coord_offsetF,
909                                                  x+j_coord_offsetG,x+j_coord_offsetH,
910                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
911                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
912
913             /* Calculate displacement vector */
914             dx00             = _mm256_sub_ps(ix0,jx0);
915             dy00             = _mm256_sub_ps(iy0,jy0);
916             dz00             = _mm256_sub_ps(iz0,jz0);
917             dx11             = _mm256_sub_ps(ix1,jx1);
918             dy11             = _mm256_sub_ps(iy1,jy1);
919             dz11             = _mm256_sub_ps(iz1,jz1);
920             dx12             = _mm256_sub_ps(ix1,jx2);
921             dy12             = _mm256_sub_ps(iy1,jy2);
922             dz12             = _mm256_sub_ps(iz1,jz2);
923             dx13             = _mm256_sub_ps(ix1,jx3);
924             dy13             = _mm256_sub_ps(iy1,jy3);
925             dz13             = _mm256_sub_ps(iz1,jz3);
926             dx21             = _mm256_sub_ps(ix2,jx1);
927             dy21             = _mm256_sub_ps(iy2,jy1);
928             dz21             = _mm256_sub_ps(iz2,jz1);
929             dx22             = _mm256_sub_ps(ix2,jx2);
930             dy22             = _mm256_sub_ps(iy2,jy2);
931             dz22             = _mm256_sub_ps(iz2,jz2);
932             dx23             = _mm256_sub_ps(ix2,jx3);
933             dy23             = _mm256_sub_ps(iy2,jy3);
934             dz23             = _mm256_sub_ps(iz2,jz3);
935             dx31             = _mm256_sub_ps(ix3,jx1);
936             dy31             = _mm256_sub_ps(iy3,jy1);
937             dz31             = _mm256_sub_ps(iz3,jz1);
938             dx32             = _mm256_sub_ps(ix3,jx2);
939             dy32             = _mm256_sub_ps(iy3,jy2);
940             dz32             = _mm256_sub_ps(iz3,jz2);
941             dx33             = _mm256_sub_ps(ix3,jx3);
942             dy33             = _mm256_sub_ps(iy3,jy3);
943             dz33             = _mm256_sub_ps(iz3,jz3);
944
945             /* Calculate squared distance and things based on it */
946             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
947             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
948             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
949             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
950             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
951             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
952             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
953             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
954             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
955             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
956
957             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
958             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
959             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
960             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
961             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
962             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
963             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
964             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
965             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
966             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
967
968             fjx0             = _mm256_setzero_ps();
969             fjy0             = _mm256_setzero_ps();
970             fjz0             = _mm256_setzero_ps();
971             fjx1             = _mm256_setzero_ps();
972             fjy1             = _mm256_setzero_ps();
973             fjz1             = _mm256_setzero_ps();
974             fjx2             = _mm256_setzero_ps();
975             fjy2             = _mm256_setzero_ps();
976             fjz2             = _mm256_setzero_ps();
977             fjx3             = _mm256_setzero_ps();
978             fjy3             = _mm256_setzero_ps();
979             fjz3             = _mm256_setzero_ps();
980
981             /**************************
982              * CALCULATE INTERACTIONS *
983              **************************/
984
985             r00              = _mm256_mul_ps(rsq00,rinv00);
986             r00              = _mm256_andnot_ps(dummy_mask,r00);
987
988             /* Calculate table index by multiplying r with table scale and truncate to integer */
989             rt               = _mm256_mul_ps(r00,vftabscale);
990             vfitab           = _mm256_cvttps_epi32(rt);
991             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
992             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
993             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
994             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
995             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
996             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
997
998             /* CUBIC SPLINE TABLE DISPERSION */
999             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1000             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1001             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1002                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1003             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1004                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1005             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1006                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1007             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1008                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1009             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1010             Heps             = _mm256_mul_ps(vfeps,H);
1011             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1012             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1013             vvdw6            = _mm256_mul_ps(c6_00,VV);
1014             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1015             fvdw6            = _mm256_mul_ps(c6_00,FF);
1016
1017             /* CUBIC SPLINE TABLE REPULSION */
1018             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1019             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1020             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1021                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1022             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1023                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1024             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1025                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1026             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1027                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1028             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1029             Heps             = _mm256_mul_ps(vfeps,H);
1030             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1031             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1032             vvdw12           = _mm256_mul_ps(c12_00,VV);
1033             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1034             fvdw12           = _mm256_mul_ps(c12_00,FF);
1035             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
1036             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1037
1038             /* Update potential sum for this i atom from the interaction with this j atom. */
1039             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
1040             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
1041
1042             fscal            = fvdw;
1043
1044             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1045
1046             /* Calculate temporary vectorial force */
1047             tx               = _mm256_mul_ps(fscal,dx00);
1048             ty               = _mm256_mul_ps(fscal,dy00);
1049             tz               = _mm256_mul_ps(fscal,dz00);
1050
1051             /* Update vectorial force */
1052             fix0             = _mm256_add_ps(fix0,tx);
1053             fiy0             = _mm256_add_ps(fiy0,ty);
1054             fiz0             = _mm256_add_ps(fiz0,tz);
1055
1056             fjx0             = _mm256_add_ps(fjx0,tx);
1057             fjy0             = _mm256_add_ps(fjy0,ty);
1058             fjz0             = _mm256_add_ps(fjz0,tz);
1059
1060             /**************************
1061              * CALCULATE INTERACTIONS *
1062              **************************/
1063
1064             r11              = _mm256_mul_ps(rsq11,rinv11);
1065             r11              = _mm256_andnot_ps(dummy_mask,r11);
1066
1067             /* Calculate table index by multiplying r with table scale and truncate to integer */
1068             rt               = _mm256_mul_ps(r11,vftabscale);
1069             vfitab           = _mm256_cvttps_epi32(rt);
1070             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1071             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1072             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1073             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1074             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1075             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1076
1077             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1078             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1079                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1080             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1081                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1082             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1083                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1084             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1085                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1086             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1087             Heps             = _mm256_mul_ps(vfeps,H);
1088             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1089             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1090             velec            = _mm256_mul_ps(qq11,VV);
1091             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1092             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1093
1094             /* Update potential sum for this i atom from the interaction with this j atom. */
1095             velec            = _mm256_andnot_ps(dummy_mask,velec);
1096             velecsum         = _mm256_add_ps(velecsum,velec);
1097
1098             fscal            = felec;
1099
1100             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1101
1102             /* Calculate temporary vectorial force */
1103             tx               = _mm256_mul_ps(fscal,dx11);
1104             ty               = _mm256_mul_ps(fscal,dy11);
1105             tz               = _mm256_mul_ps(fscal,dz11);
1106
1107             /* Update vectorial force */
1108             fix1             = _mm256_add_ps(fix1,tx);
1109             fiy1             = _mm256_add_ps(fiy1,ty);
1110             fiz1             = _mm256_add_ps(fiz1,tz);
1111
1112             fjx1             = _mm256_add_ps(fjx1,tx);
1113             fjy1             = _mm256_add_ps(fjy1,ty);
1114             fjz1             = _mm256_add_ps(fjz1,tz);
1115
1116             /**************************
1117              * CALCULATE INTERACTIONS *
1118              **************************/
1119
1120             r12              = _mm256_mul_ps(rsq12,rinv12);
1121             r12              = _mm256_andnot_ps(dummy_mask,r12);
1122
1123             /* Calculate table index by multiplying r with table scale and truncate to integer */
1124             rt               = _mm256_mul_ps(r12,vftabscale);
1125             vfitab           = _mm256_cvttps_epi32(rt);
1126             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1127             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1128             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1129             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1130             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1131             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1132
1133             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1134             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1135                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1136             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1137                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1138             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1139                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1140             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1141                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1142             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1143             Heps             = _mm256_mul_ps(vfeps,H);
1144             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1145             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1146             velec            = _mm256_mul_ps(qq12,VV);
1147             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1148             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1149
1150             /* Update potential sum for this i atom from the interaction with this j atom. */
1151             velec            = _mm256_andnot_ps(dummy_mask,velec);
1152             velecsum         = _mm256_add_ps(velecsum,velec);
1153
1154             fscal            = felec;
1155
1156             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1157
1158             /* Calculate temporary vectorial force */
1159             tx               = _mm256_mul_ps(fscal,dx12);
1160             ty               = _mm256_mul_ps(fscal,dy12);
1161             tz               = _mm256_mul_ps(fscal,dz12);
1162
1163             /* Update vectorial force */
1164             fix1             = _mm256_add_ps(fix1,tx);
1165             fiy1             = _mm256_add_ps(fiy1,ty);
1166             fiz1             = _mm256_add_ps(fiz1,tz);
1167
1168             fjx2             = _mm256_add_ps(fjx2,tx);
1169             fjy2             = _mm256_add_ps(fjy2,ty);
1170             fjz2             = _mm256_add_ps(fjz2,tz);
1171
1172             /**************************
1173              * CALCULATE INTERACTIONS *
1174              **************************/
1175
1176             r13              = _mm256_mul_ps(rsq13,rinv13);
1177             r13              = _mm256_andnot_ps(dummy_mask,r13);
1178
1179             /* Calculate table index by multiplying r with table scale and truncate to integer */
1180             rt               = _mm256_mul_ps(r13,vftabscale);
1181             vfitab           = _mm256_cvttps_epi32(rt);
1182             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1183             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1184             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1185             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1186             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1187             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1188
1189             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1190             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1191                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1192             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1193                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1194             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1195                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1196             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1197                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1198             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1199             Heps             = _mm256_mul_ps(vfeps,H);
1200             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1201             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1202             velec            = _mm256_mul_ps(qq13,VV);
1203             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1204             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
1205
1206             /* Update potential sum for this i atom from the interaction with this j atom. */
1207             velec            = _mm256_andnot_ps(dummy_mask,velec);
1208             velecsum         = _mm256_add_ps(velecsum,velec);
1209
1210             fscal            = felec;
1211
1212             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1213
1214             /* Calculate temporary vectorial force */
1215             tx               = _mm256_mul_ps(fscal,dx13);
1216             ty               = _mm256_mul_ps(fscal,dy13);
1217             tz               = _mm256_mul_ps(fscal,dz13);
1218
1219             /* Update vectorial force */
1220             fix1             = _mm256_add_ps(fix1,tx);
1221             fiy1             = _mm256_add_ps(fiy1,ty);
1222             fiz1             = _mm256_add_ps(fiz1,tz);
1223
1224             fjx3             = _mm256_add_ps(fjx3,tx);
1225             fjy3             = _mm256_add_ps(fjy3,ty);
1226             fjz3             = _mm256_add_ps(fjz3,tz);
1227
1228             /**************************
1229              * CALCULATE INTERACTIONS *
1230              **************************/
1231
1232             r21              = _mm256_mul_ps(rsq21,rinv21);
1233             r21              = _mm256_andnot_ps(dummy_mask,r21);
1234
1235             /* Calculate table index by multiplying r with table scale and truncate to integer */
1236             rt               = _mm256_mul_ps(r21,vftabscale);
1237             vfitab           = _mm256_cvttps_epi32(rt);
1238             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1239             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1240             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1241             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1242             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1243             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1244
1245             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1246             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1247                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1248             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1249                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1250             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1251                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1252             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1253                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1254             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1255             Heps             = _mm256_mul_ps(vfeps,H);
1256             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1257             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1258             velec            = _mm256_mul_ps(qq21,VV);
1259             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1260             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1261
1262             /* Update potential sum for this i atom from the interaction with this j atom. */
1263             velec            = _mm256_andnot_ps(dummy_mask,velec);
1264             velecsum         = _mm256_add_ps(velecsum,velec);
1265
1266             fscal            = felec;
1267
1268             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1269
1270             /* Calculate temporary vectorial force */
1271             tx               = _mm256_mul_ps(fscal,dx21);
1272             ty               = _mm256_mul_ps(fscal,dy21);
1273             tz               = _mm256_mul_ps(fscal,dz21);
1274
1275             /* Update vectorial force */
1276             fix2             = _mm256_add_ps(fix2,tx);
1277             fiy2             = _mm256_add_ps(fiy2,ty);
1278             fiz2             = _mm256_add_ps(fiz2,tz);
1279
1280             fjx1             = _mm256_add_ps(fjx1,tx);
1281             fjy1             = _mm256_add_ps(fjy1,ty);
1282             fjz1             = _mm256_add_ps(fjz1,tz);
1283
1284             /**************************
1285              * CALCULATE INTERACTIONS *
1286              **************************/
1287
1288             r22              = _mm256_mul_ps(rsq22,rinv22);
1289             r22              = _mm256_andnot_ps(dummy_mask,r22);
1290
1291             /* Calculate table index by multiplying r with table scale and truncate to integer */
1292             rt               = _mm256_mul_ps(r22,vftabscale);
1293             vfitab           = _mm256_cvttps_epi32(rt);
1294             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1295             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1296             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1297             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1298             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1299             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1300
1301             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1302             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1303                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1304             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1305                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1306             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1307                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1308             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1309                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1310             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1311             Heps             = _mm256_mul_ps(vfeps,H);
1312             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1313             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1314             velec            = _mm256_mul_ps(qq22,VV);
1315             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1316             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1317
1318             /* Update potential sum for this i atom from the interaction with this j atom. */
1319             velec            = _mm256_andnot_ps(dummy_mask,velec);
1320             velecsum         = _mm256_add_ps(velecsum,velec);
1321
1322             fscal            = felec;
1323
1324             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1325
1326             /* Calculate temporary vectorial force */
1327             tx               = _mm256_mul_ps(fscal,dx22);
1328             ty               = _mm256_mul_ps(fscal,dy22);
1329             tz               = _mm256_mul_ps(fscal,dz22);
1330
1331             /* Update vectorial force */
1332             fix2             = _mm256_add_ps(fix2,tx);
1333             fiy2             = _mm256_add_ps(fiy2,ty);
1334             fiz2             = _mm256_add_ps(fiz2,tz);
1335
1336             fjx2             = _mm256_add_ps(fjx2,tx);
1337             fjy2             = _mm256_add_ps(fjy2,ty);
1338             fjz2             = _mm256_add_ps(fjz2,tz);
1339
1340             /**************************
1341              * CALCULATE INTERACTIONS *
1342              **************************/
1343
1344             r23              = _mm256_mul_ps(rsq23,rinv23);
1345             r23              = _mm256_andnot_ps(dummy_mask,r23);
1346
1347             /* Calculate table index by multiplying r with table scale and truncate to integer */
1348             rt               = _mm256_mul_ps(r23,vftabscale);
1349             vfitab           = _mm256_cvttps_epi32(rt);
1350             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1351             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1352             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1353             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1354             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1355             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1356
1357             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1358             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1359                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1360             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1361                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1362             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1363                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1364             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1365                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1366             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1367             Heps             = _mm256_mul_ps(vfeps,H);
1368             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1369             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1370             velec            = _mm256_mul_ps(qq23,VV);
1371             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1372             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
1373
1374             /* Update potential sum for this i atom from the interaction with this j atom. */
1375             velec            = _mm256_andnot_ps(dummy_mask,velec);
1376             velecsum         = _mm256_add_ps(velecsum,velec);
1377
1378             fscal            = felec;
1379
1380             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1381
1382             /* Calculate temporary vectorial force */
1383             tx               = _mm256_mul_ps(fscal,dx23);
1384             ty               = _mm256_mul_ps(fscal,dy23);
1385             tz               = _mm256_mul_ps(fscal,dz23);
1386
1387             /* Update vectorial force */
1388             fix2             = _mm256_add_ps(fix2,tx);
1389             fiy2             = _mm256_add_ps(fiy2,ty);
1390             fiz2             = _mm256_add_ps(fiz2,tz);
1391
1392             fjx3             = _mm256_add_ps(fjx3,tx);
1393             fjy3             = _mm256_add_ps(fjy3,ty);
1394             fjz3             = _mm256_add_ps(fjz3,tz);
1395
1396             /**************************
1397              * CALCULATE INTERACTIONS *
1398              **************************/
1399
1400             r31              = _mm256_mul_ps(rsq31,rinv31);
1401             r31              = _mm256_andnot_ps(dummy_mask,r31);
1402
1403             /* Calculate table index by multiplying r with table scale and truncate to integer */
1404             rt               = _mm256_mul_ps(r31,vftabscale);
1405             vfitab           = _mm256_cvttps_epi32(rt);
1406             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1407             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1408             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1409             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1410             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1411             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1412
1413             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1414             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1415                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1416             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1417                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1418             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1419                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1420             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1421                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1422             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1423             Heps             = _mm256_mul_ps(vfeps,H);
1424             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1425             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1426             velec            = _mm256_mul_ps(qq31,VV);
1427             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1428             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
1429
1430             /* Update potential sum for this i atom from the interaction with this j atom. */
1431             velec            = _mm256_andnot_ps(dummy_mask,velec);
1432             velecsum         = _mm256_add_ps(velecsum,velec);
1433
1434             fscal            = felec;
1435
1436             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1437
1438             /* Calculate temporary vectorial force */
1439             tx               = _mm256_mul_ps(fscal,dx31);
1440             ty               = _mm256_mul_ps(fscal,dy31);
1441             tz               = _mm256_mul_ps(fscal,dz31);
1442
1443             /* Update vectorial force */
1444             fix3             = _mm256_add_ps(fix3,tx);
1445             fiy3             = _mm256_add_ps(fiy3,ty);
1446             fiz3             = _mm256_add_ps(fiz3,tz);
1447
1448             fjx1             = _mm256_add_ps(fjx1,tx);
1449             fjy1             = _mm256_add_ps(fjy1,ty);
1450             fjz1             = _mm256_add_ps(fjz1,tz);
1451
1452             /**************************
1453              * CALCULATE INTERACTIONS *
1454              **************************/
1455
1456             r32              = _mm256_mul_ps(rsq32,rinv32);
1457             r32              = _mm256_andnot_ps(dummy_mask,r32);
1458
1459             /* Calculate table index by multiplying r with table scale and truncate to integer */
1460             rt               = _mm256_mul_ps(r32,vftabscale);
1461             vfitab           = _mm256_cvttps_epi32(rt);
1462             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1463             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1464             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1465             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1466             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1467             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1468
1469             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1470             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1471                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1472             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1473                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1474             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1475                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1476             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1477                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1478             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1479             Heps             = _mm256_mul_ps(vfeps,H);
1480             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1481             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1482             velec            = _mm256_mul_ps(qq32,VV);
1483             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1484             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
1485
1486             /* Update potential sum for this i atom from the interaction with this j atom. */
1487             velec            = _mm256_andnot_ps(dummy_mask,velec);
1488             velecsum         = _mm256_add_ps(velecsum,velec);
1489
1490             fscal            = felec;
1491
1492             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1493
1494             /* Calculate temporary vectorial force */
1495             tx               = _mm256_mul_ps(fscal,dx32);
1496             ty               = _mm256_mul_ps(fscal,dy32);
1497             tz               = _mm256_mul_ps(fscal,dz32);
1498
1499             /* Update vectorial force */
1500             fix3             = _mm256_add_ps(fix3,tx);
1501             fiy3             = _mm256_add_ps(fiy3,ty);
1502             fiz3             = _mm256_add_ps(fiz3,tz);
1503
1504             fjx2             = _mm256_add_ps(fjx2,tx);
1505             fjy2             = _mm256_add_ps(fjy2,ty);
1506             fjz2             = _mm256_add_ps(fjz2,tz);
1507
1508             /**************************
1509              * CALCULATE INTERACTIONS *
1510              **************************/
1511
1512             r33              = _mm256_mul_ps(rsq33,rinv33);
1513             r33              = _mm256_andnot_ps(dummy_mask,r33);
1514
1515             /* Calculate table index by multiplying r with table scale and truncate to integer */
1516             rt               = _mm256_mul_ps(r33,vftabscale);
1517             vfitab           = _mm256_cvttps_epi32(rt);
1518             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1519             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1520             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1521             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1522             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1523             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1524
1525             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1526             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1527                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1528             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1529                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1530             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1531                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1532             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1533                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1534             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1535             Heps             = _mm256_mul_ps(vfeps,H);
1536             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1537             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1538             velec            = _mm256_mul_ps(qq33,VV);
1539             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1540             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
1541
1542             /* Update potential sum for this i atom from the interaction with this j atom. */
1543             velec            = _mm256_andnot_ps(dummy_mask,velec);
1544             velecsum         = _mm256_add_ps(velecsum,velec);
1545
1546             fscal            = felec;
1547
1548             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1549
1550             /* Calculate temporary vectorial force */
1551             tx               = _mm256_mul_ps(fscal,dx33);
1552             ty               = _mm256_mul_ps(fscal,dy33);
1553             tz               = _mm256_mul_ps(fscal,dz33);
1554
1555             /* Update vectorial force */
1556             fix3             = _mm256_add_ps(fix3,tx);
1557             fiy3             = _mm256_add_ps(fiy3,ty);
1558             fiz3             = _mm256_add_ps(fiz3,tz);
1559
1560             fjx3             = _mm256_add_ps(fjx3,tx);
1561             fjy3             = _mm256_add_ps(fjy3,ty);
1562             fjz3             = _mm256_add_ps(fjz3,tz);
1563
1564             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1565             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1566             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1567             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1568             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1569             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1570             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1571             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1572
1573             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1574                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1575                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1576
1577             /* Inner loop uses 456 flops */
1578         }
1579
1580         /* End of innermost loop */
1581
1582         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1583                                                  f+i_coord_offset,fshift+i_shift_offset);
1584
1585         ggid                        = gid[iidx];
1586         /* Update potential energies */
1587         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1588         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1589
1590         /* Increment number of inner iterations */
1591         inneriter                  += j_index_end - j_index_start;
1592
1593         /* Outer loop uses 26 flops */
1594     }
1595
1596     /* Increment number of outer iterations */
1597     outeriter        += nri;
1598
1599     /* Update outer/inner flops */
1600
1601     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*456);
1602 }
1603 /*
1604  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_single
1605  * Electrostatics interaction: CubicSplineTable
1606  * VdW interaction:            CubicSplineTable
1607  * Geometry:                   Water4-Water4
1608  * Calculate force/pot:        Force
1609  */
1610 void
1611 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_single
1612                     (t_nblist * gmx_restrict                nlist,
1613                      rvec * gmx_restrict                    xx,
1614                      rvec * gmx_restrict                    ff,
1615                      t_forcerec * gmx_restrict              fr,
1616                      t_mdatoms * gmx_restrict               mdatoms,
1617                      nb_kernel_data_t * gmx_restrict        kernel_data,
1618                      t_nrnb * gmx_restrict                  nrnb)
1619 {
1620     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1621      * just 0 for non-waters.
1622      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1623      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1624      */
1625     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1626     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1627     int              jnrA,jnrB,jnrC,jnrD;
1628     int              jnrE,jnrF,jnrG,jnrH;
1629     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1630     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1631     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1632     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1633     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1634     real             rcutoff_scalar;
1635     real             *shiftvec,*fshift,*x,*f;
1636     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1637     real             scratch[4*DIM];
1638     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1639     real *           vdwioffsetptr0;
1640     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1641     real *           vdwioffsetptr1;
1642     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1643     real *           vdwioffsetptr2;
1644     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1645     real *           vdwioffsetptr3;
1646     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1647     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1648     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1649     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1650     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1651     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1652     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1653     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1654     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1655     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1656     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1657     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1658     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1659     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1660     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1661     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1662     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1663     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1664     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1665     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1666     real             *charge;
1667     int              nvdwtype;
1668     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1669     int              *vdwtype;
1670     real             *vdwparam;
1671     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1672     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1673     __m256i          vfitab;
1674     __m128i          vfitab_lo,vfitab_hi;
1675     __m128i          ifour       = _mm_set1_epi32(4);
1676     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1677     real             *vftab;
1678     __m256           dummy_mask,cutoff_mask;
1679     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1680     __m256           one     = _mm256_set1_ps(1.0);
1681     __m256           two     = _mm256_set1_ps(2.0);
1682     x                = xx[0];
1683     f                = ff[0];
1684
1685     nri              = nlist->nri;
1686     iinr             = nlist->iinr;
1687     jindex           = nlist->jindex;
1688     jjnr             = nlist->jjnr;
1689     shiftidx         = nlist->shift;
1690     gid              = nlist->gid;
1691     shiftvec         = fr->shift_vec[0];
1692     fshift           = fr->fshift[0];
1693     facel            = _mm256_set1_ps(fr->epsfac);
1694     charge           = mdatoms->chargeA;
1695     nvdwtype         = fr->ntype;
1696     vdwparam         = fr->nbfp;
1697     vdwtype          = mdatoms->typeA;
1698
1699     vftab            = kernel_data->table_elec_vdw->data;
1700     vftabscale       = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
1701
1702     /* Setup water-specific parameters */
1703     inr              = nlist->iinr[0];
1704     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1705     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1706     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1707     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1708
1709     jq1              = _mm256_set1_ps(charge[inr+1]);
1710     jq2              = _mm256_set1_ps(charge[inr+2]);
1711     jq3              = _mm256_set1_ps(charge[inr+3]);
1712     vdwjidx0A        = 2*vdwtype[inr+0];
1713     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1714     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1715     qq11             = _mm256_mul_ps(iq1,jq1);
1716     qq12             = _mm256_mul_ps(iq1,jq2);
1717     qq13             = _mm256_mul_ps(iq1,jq3);
1718     qq21             = _mm256_mul_ps(iq2,jq1);
1719     qq22             = _mm256_mul_ps(iq2,jq2);
1720     qq23             = _mm256_mul_ps(iq2,jq3);
1721     qq31             = _mm256_mul_ps(iq3,jq1);
1722     qq32             = _mm256_mul_ps(iq3,jq2);
1723     qq33             = _mm256_mul_ps(iq3,jq3);
1724
1725     /* Avoid stupid compiler warnings */
1726     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1727     j_coord_offsetA = 0;
1728     j_coord_offsetB = 0;
1729     j_coord_offsetC = 0;
1730     j_coord_offsetD = 0;
1731     j_coord_offsetE = 0;
1732     j_coord_offsetF = 0;
1733     j_coord_offsetG = 0;
1734     j_coord_offsetH = 0;
1735
1736     outeriter        = 0;
1737     inneriter        = 0;
1738
1739     for(iidx=0;iidx<4*DIM;iidx++)
1740     {
1741         scratch[iidx] = 0.0;
1742     }
1743
1744     /* Start outer loop over neighborlists */
1745     for(iidx=0; iidx<nri; iidx++)
1746     {
1747         /* Load shift vector for this list */
1748         i_shift_offset   = DIM*shiftidx[iidx];
1749
1750         /* Load limits for loop over neighbors */
1751         j_index_start    = jindex[iidx];
1752         j_index_end      = jindex[iidx+1];
1753
1754         /* Get outer coordinate index */
1755         inr              = iinr[iidx];
1756         i_coord_offset   = DIM*inr;
1757
1758         /* Load i particle coords and add shift vector */
1759         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1760                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1761
1762         fix0             = _mm256_setzero_ps();
1763         fiy0             = _mm256_setzero_ps();
1764         fiz0             = _mm256_setzero_ps();
1765         fix1             = _mm256_setzero_ps();
1766         fiy1             = _mm256_setzero_ps();
1767         fiz1             = _mm256_setzero_ps();
1768         fix2             = _mm256_setzero_ps();
1769         fiy2             = _mm256_setzero_ps();
1770         fiz2             = _mm256_setzero_ps();
1771         fix3             = _mm256_setzero_ps();
1772         fiy3             = _mm256_setzero_ps();
1773         fiz3             = _mm256_setzero_ps();
1774
1775         /* Start inner kernel loop */
1776         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1777         {
1778
1779             /* Get j neighbor index, and coordinate index */
1780             jnrA             = jjnr[jidx];
1781             jnrB             = jjnr[jidx+1];
1782             jnrC             = jjnr[jidx+2];
1783             jnrD             = jjnr[jidx+3];
1784             jnrE             = jjnr[jidx+4];
1785             jnrF             = jjnr[jidx+5];
1786             jnrG             = jjnr[jidx+6];
1787             jnrH             = jjnr[jidx+7];
1788             j_coord_offsetA  = DIM*jnrA;
1789             j_coord_offsetB  = DIM*jnrB;
1790             j_coord_offsetC  = DIM*jnrC;
1791             j_coord_offsetD  = DIM*jnrD;
1792             j_coord_offsetE  = DIM*jnrE;
1793             j_coord_offsetF  = DIM*jnrF;
1794             j_coord_offsetG  = DIM*jnrG;
1795             j_coord_offsetH  = DIM*jnrH;
1796
1797             /* load j atom coordinates */
1798             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1799                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1800                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1801                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1802                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1803                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1804
1805             /* Calculate displacement vector */
1806             dx00             = _mm256_sub_ps(ix0,jx0);
1807             dy00             = _mm256_sub_ps(iy0,jy0);
1808             dz00             = _mm256_sub_ps(iz0,jz0);
1809             dx11             = _mm256_sub_ps(ix1,jx1);
1810             dy11             = _mm256_sub_ps(iy1,jy1);
1811             dz11             = _mm256_sub_ps(iz1,jz1);
1812             dx12             = _mm256_sub_ps(ix1,jx2);
1813             dy12             = _mm256_sub_ps(iy1,jy2);
1814             dz12             = _mm256_sub_ps(iz1,jz2);
1815             dx13             = _mm256_sub_ps(ix1,jx3);
1816             dy13             = _mm256_sub_ps(iy1,jy3);
1817             dz13             = _mm256_sub_ps(iz1,jz3);
1818             dx21             = _mm256_sub_ps(ix2,jx1);
1819             dy21             = _mm256_sub_ps(iy2,jy1);
1820             dz21             = _mm256_sub_ps(iz2,jz1);
1821             dx22             = _mm256_sub_ps(ix2,jx2);
1822             dy22             = _mm256_sub_ps(iy2,jy2);
1823             dz22             = _mm256_sub_ps(iz2,jz2);
1824             dx23             = _mm256_sub_ps(ix2,jx3);
1825             dy23             = _mm256_sub_ps(iy2,jy3);
1826             dz23             = _mm256_sub_ps(iz2,jz3);
1827             dx31             = _mm256_sub_ps(ix3,jx1);
1828             dy31             = _mm256_sub_ps(iy3,jy1);
1829             dz31             = _mm256_sub_ps(iz3,jz1);
1830             dx32             = _mm256_sub_ps(ix3,jx2);
1831             dy32             = _mm256_sub_ps(iy3,jy2);
1832             dz32             = _mm256_sub_ps(iz3,jz2);
1833             dx33             = _mm256_sub_ps(ix3,jx3);
1834             dy33             = _mm256_sub_ps(iy3,jy3);
1835             dz33             = _mm256_sub_ps(iz3,jz3);
1836
1837             /* Calculate squared distance and things based on it */
1838             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1839             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1840             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1841             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1842             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1843             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1844             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1845             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1846             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1847             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1848
1849             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1850             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1851             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1852             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1853             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1854             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1855             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1856             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1857             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1858             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1859
1860             fjx0             = _mm256_setzero_ps();
1861             fjy0             = _mm256_setzero_ps();
1862             fjz0             = _mm256_setzero_ps();
1863             fjx1             = _mm256_setzero_ps();
1864             fjy1             = _mm256_setzero_ps();
1865             fjz1             = _mm256_setzero_ps();
1866             fjx2             = _mm256_setzero_ps();
1867             fjy2             = _mm256_setzero_ps();
1868             fjz2             = _mm256_setzero_ps();
1869             fjx3             = _mm256_setzero_ps();
1870             fjy3             = _mm256_setzero_ps();
1871             fjz3             = _mm256_setzero_ps();
1872
1873             /**************************
1874              * CALCULATE INTERACTIONS *
1875              **************************/
1876
1877             r00              = _mm256_mul_ps(rsq00,rinv00);
1878
1879             /* Calculate table index by multiplying r with table scale and truncate to integer */
1880             rt               = _mm256_mul_ps(r00,vftabscale);
1881             vfitab           = _mm256_cvttps_epi32(rt);
1882             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1883             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1884             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1885             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1886             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1887             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1888
1889             /* CUBIC SPLINE TABLE DISPERSION */
1890             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1891             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1892             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1893                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1894             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1895                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1896             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1897                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1898             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1899                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1900             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1901             Heps             = _mm256_mul_ps(vfeps,H);
1902             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1903             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1904             fvdw6            = _mm256_mul_ps(c6_00,FF);
1905
1906             /* CUBIC SPLINE TABLE REPULSION */
1907             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1908             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1909             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1910                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1911             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1912                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1913             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1914                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1915             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1916                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1917             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1918             Heps             = _mm256_mul_ps(vfeps,H);
1919             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1920             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1921             fvdw12           = _mm256_mul_ps(c12_00,FF);
1922             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1923
1924             fscal            = fvdw;
1925
1926             /* Calculate temporary vectorial force */
1927             tx               = _mm256_mul_ps(fscal,dx00);
1928             ty               = _mm256_mul_ps(fscal,dy00);
1929             tz               = _mm256_mul_ps(fscal,dz00);
1930
1931             /* Update vectorial force */
1932             fix0             = _mm256_add_ps(fix0,tx);
1933             fiy0             = _mm256_add_ps(fiy0,ty);
1934             fiz0             = _mm256_add_ps(fiz0,tz);
1935
1936             fjx0             = _mm256_add_ps(fjx0,tx);
1937             fjy0             = _mm256_add_ps(fjy0,ty);
1938             fjz0             = _mm256_add_ps(fjz0,tz);
1939
1940             /**************************
1941              * CALCULATE INTERACTIONS *
1942              **************************/
1943
1944             r11              = _mm256_mul_ps(rsq11,rinv11);
1945
1946             /* Calculate table index by multiplying r with table scale and truncate to integer */
1947             rt               = _mm256_mul_ps(r11,vftabscale);
1948             vfitab           = _mm256_cvttps_epi32(rt);
1949             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1950             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1951             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1952             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1953             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1954             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1955
1956             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1957             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1958                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1959             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1960                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1961             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1962                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1963             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1964                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1965             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1966             Heps             = _mm256_mul_ps(vfeps,H);
1967             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1968             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1969             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1970
1971             fscal            = felec;
1972
1973             /* Calculate temporary vectorial force */
1974             tx               = _mm256_mul_ps(fscal,dx11);
1975             ty               = _mm256_mul_ps(fscal,dy11);
1976             tz               = _mm256_mul_ps(fscal,dz11);
1977
1978             /* Update vectorial force */
1979             fix1             = _mm256_add_ps(fix1,tx);
1980             fiy1             = _mm256_add_ps(fiy1,ty);
1981             fiz1             = _mm256_add_ps(fiz1,tz);
1982
1983             fjx1             = _mm256_add_ps(fjx1,tx);
1984             fjy1             = _mm256_add_ps(fjy1,ty);
1985             fjz1             = _mm256_add_ps(fjz1,tz);
1986
1987             /**************************
1988              * CALCULATE INTERACTIONS *
1989              **************************/
1990
1991             r12              = _mm256_mul_ps(rsq12,rinv12);
1992
1993             /* Calculate table index by multiplying r with table scale and truncate to integer */
1994             rt               = _mm256_mul_ps(r12,vftabscale);
1995             vfitab           = _mm256_cvttps_epi32(rt);
1996             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1997             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1998             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1999             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2000             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2001             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2002
2003             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2004             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2005                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2006             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2007                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2008             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2009                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2010             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2011                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2012             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2013             Heps             = _mm256_mul_ps(vfeps,H);
2014             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2015             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2016             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2017
2018             fscal            = felec;
2019
2020             /* Calculate temporary vectorial force */
2021             tx               = _mm256_mul_ps(fscal,dx12);
2022             ty               = _mm256_mul_ps(fscal,dy12);
2023             tz               = _mm256_mul_ps(fscal,dz12);
2024
2025             /* Update vectorial force */
2026             fix1             = _mm256_add_ps(fix1,tx);
2027             fiy1             = _mm256_add_ps(fiy1,ty);
2028             fiz1             = _mm256_add_ps(fiz1,tz);
2029
2030             fjx2             = _mm256_add_ps(fjx2,tx);
2031             fjy2             = _mm256_add_ps(fjy2,ty);
2032             fjz2             = _mm256_add_ps(fjz2,tz);
2033
2034             /**************************
2035              * CALCULATE INTERACTIONS *
2036              **************************/
2037
2038             r13              = _mm256_mul_ps(rsq13,rinv13);
2039
2040             /* Calculate table index by multiplying r with table scale and truncate to integer */
2041             rt               = _mm256_mul_ps(r13,vftabscale);
2042             vfitab           = _mm256_cvttps_epi32(rt);
2043             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2044             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2045             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2046             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2047             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2048             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2049
2050             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2051             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2052                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2053             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2054                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2055             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2056                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2057             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2058                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2059             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2060             Heps             = _mm256_mul_ps(vfeps,H);
2061             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2062             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2063             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
2064
2065             fscal            = felec;
2066
2067             /* Calculate temporary vectorial force */
2068             tx               = _mm256_mul_ps(fscal,dx13);
2069             ty               = _mm256_mul_ps(fscal,dy13);
2070             tz               = _mm256_mul_ps(fscal,dz13);
2071
2072             /* Update vectorial force */
2073             fix1             = _mm256_add_ps(fix1,tx);
2074             fiy1             = _mm256_add_ps(fiy1,ty);
2075             fiz1             = _mm256_add_ps(fiz1,tz);
2076
2077             fjx3             = _mm256_add_ps(fjx3,tx);
2078             fjy3             = _mm256_add_ps(fjy3,ty);
2079             fjz3             = _mm256_add_ps(fjz3,tz);
2080
2081             /**************************
2082              * CALCULATE INTERACTIONS *
2083              **************************/
2084
2085             r21              = _mm256_mul_ps(rsq21,rinv21);
2086
2087             /* Calculate table index by multiplying r with table scale and truncate to integer */
2088             rt               = _mm256_mul_ps(r21,vftabscale);
2089             vfitab           = _mm256_cvttps_epi32(rt);
2090             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2091             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2092             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2093             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2094             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2095             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2096
2097             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2098             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2099                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2100             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2101                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2102             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2103                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2104             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2105                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2106             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2107             Heps             = _mm256_mul_ps(vfeps,H);
2108             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2109             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2110             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2111
2112             fscal            = felec;
2113
2114             /* Calculate temporary vectorial force */
2115             tx               = _mm256_mul_ps(fscal,dx21);
2116             ty               = _mm256_mul_ps(fscal,dy21);
2117             tz               = _mm256_mul_ps(fscal,dz21);
2118
2119             /* Update vectorial force */
2120             fix2             = _mm256_add_ps(fix2,tx);
2121             fiy2             = _mm256_add_ps(fiy2,ty);
2122             fiz2             = _mm256_add_ps(fiz2,tz);
2123
2124             fjx1             = _mm256_add_ps(fjx1,tx);
2125             fjy1             = _mm256_add_ps(fjy1,ty);
2126             fjz1             = _mm256_add_ps(fjz1,tz);
2127
2128             /**************************
2129              * CALCULATE INTERACTIONS *
2130              **************************/
2131
2132             r22              = _mm256_mul_ps(rsq22,rinv22);
2133
2134             /* Calculate table index by multiplying r with table scale and truncate to integer */
2135             rt               = _mm256_mul_ps(r22,vftabscale);
2136             vfitab           = _mm256_cvttps_epi32(rt);
2137             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2138             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2139             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2140             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2141             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2142             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2143
2144             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2145             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2146                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2147             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2148                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2149             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2150                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2151             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2152                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2153             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2154             Heps             = _mm256_mul_ps(vfeps,H);
2155             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2156             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2157             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2158
2159             fscal            = felec;
2160
2161             /* Calculate temporary vectorial force */
2162             tx               = _mm256_mul_ps(fscal,dx22);
2163             ty               = _mm256_mul_ps(fscal,dy22);
2164             tz               = _mm256_mul_ps(fscal,dz22);
2165
2166             /* Update vectorial force */
2167             fix2             = _mm256_add_ps(fix2,tx);
2168             fiy2             = _mm256_add_ps(fiy2,ty);
2169             fiz2             = _mm256_add_ps(fiz2,tz);
2170
2171             fjx2             = _mm256_add_ps(fjx2,tx);
2172             fjy2             = _mm256_add_ps(fjy2,ty);
2173             fjz2             = _mm256_add_ps(fjz2,tz);
2174
2175             /**************************
2176              * CALCULATE INTERACTIONS *
2177              **************************/
2178
2179             r23              = _mm256_mul_ps(rsq23,rinv23);
2180
2181             /* Calculate table index by multiplying r with table scale and truncate to integer */
2182             rt               = _mm256_mul_ps(r23,vftabscale);
2183             vfitab           = _mm256_cvttps_epi32(rt);
2184             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2185             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2186             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2187             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2188             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2189             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2190
2191             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2192             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2193                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2194             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2195                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2196             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2197                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2198             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2199                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2200             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2201             Heps             = _mm256_mul_ps(vfeps,H);
2202             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2203             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2204             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
2205
2206             fscal            = felec;
2207
2208             /* Calculate temporary vectorial force */
2209             tx               = _mm256_mul_ps(fscal,dx23);
2210             ty               = _mm256_mul_ps(fscal,dy23);
2211             tz               = _mm256_mul_ps(fscal,dz23);
2212
2213             /* Update vectorial force */
2214             fix2             = _mm256_add_ps(fix2,tx);
2215             fiy2             = _mm256_add_ps(fiy2,ty);
2216             fiz2             = _mm256_add_ps(fiz2,tz);
2217
2218             fjx3             = _mm256_add_ps(fjx3,tx);
2219             fjy3             = _mm256_add_ps(fjy3,ty);
2220             fjz3             = _mm256_add_ps(fjz3,tz);
2221
2222             /**************************
2223              * CALCULATE INTERACTIONS *
2224              **************************/
2225
2226             r31              = _mm256_mul_ps(rsq31,rinv31);
2227
2228             /* Calculate table index by multiplying r with table scale and truncate to integer */
2229             rt               = _mm256_mul_ps(r31,vftabscale);
2230             vfitab           = _mm256_cvttps_epi32(rt);
2231             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2232             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2233             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2234             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2235             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2236             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2237
2238             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2239             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2240                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2241             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2242                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2243             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2244                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2245             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2246                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2247             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2248             Heps             = _mm256_mul_ps(vfeps,H);
2249             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2250             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2251             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
2252
2253             fscal            = felec;
2254
2255             /* Calculate temporary vectorial force */
2256             tx               = _mm256_mul_ps(fscal,dx31);
2257             ty               = _mm256_mul_ps(fscal,dy31);
2258             tz               = _mm256_mul_ps(fscal,dz31);
2259
2260             /* Update vectorial force */
2261             fix3             = _mm256_add_ps(fix3,tx);
2262             fiy3             = _mm256_add_ps(fiy3,ty);
2263             fiz3             = _mm256_add_ps(fiz3,tz);
2264
2265             fjx1             = _mm256_add_ps(fjx1,tx);
2266             fjy1             = _mm256_add_ps(fjy1,ty);
2267             fjz1             = _mm256_add_ps(fjz1,tz);
2268
2269             /**************************
2270              * CALCULATE INTERACTIONS *
2271              **************************/
2272
2273             r32              = _mm256_mul_ps(rsq32,rinv32);
2274
2275             /* Calculate table index by multiplying r with table scale and truncate to integer */
2276             rt               = _mm256_mul_ps(r32,vftabscale);
2277             vfitab           = _mm256_cvttps_epi32(rt);
2278             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2279             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2280             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2281             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2282             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2283             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2284
2285             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2286             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2287                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2288             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2289                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2290             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2291                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2292             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2293                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2294             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2295             Heps             = _mm256_mul_ps(vfeps,H);
2296             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2297             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2298             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
2299
2300             fscal            = felec;
2301
2302             /* Calculate temporary vectorial force */
2303             tx               = _mm256_mul_ps(fscal,dx32);
2304             ty               = _mm256_mul_ps(fscal,dy32);
2305             tz               = _mm256_mul_ps(fscal,dz32);
2306
2307             /* Update vectorial force */
2308             fix3             = _mm256_add_ps(fix3,tx);
2309             fiy3             = _mm256_add_ps(fiy3,ty);
2310             fiz3             = _mm256_add_ps(fiz3,tz);
2311
2312             fjx2             = _mm256_add_ps(fjx2,tx);
2313             fjy2             = _mm256_add_ps(fjy2,ty);
2314             fjz2             = _mm256_add_ps(fjz2,tz);
2315
2316             /**************************
2317              * CALCULATE INTERACTIONS *
2318              **************************/
2319
2320             r33              = _mm256_mul_ps(rsq33,rinv33);
2321
2322             /* Calculate table index by multiplying r with table scale and truncate to integer */
2323             rt               = _mm256_mul_ps(r33,vftabscale);
2324             vfitab           = _mm256_cvttps_epi32(rt);
2325             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2326             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2327             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2328             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2329             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2330             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2331
2332             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2333             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2334                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2335             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2336                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2337             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2338                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2339             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2340                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2341             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2342             Heps             = _mm256_mul_ps(vfeps,H);
2343             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2344             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2345             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
2346
2347             fscal            = felec;
2348
2349             /* Calculate temporary vectorial force */
2350             tx               = _mm256_mul_ps(fscal,dx33);
2351             ty               = _mm256_mul_ps(fscal,dy33);
2352             tz               = _mm256_mul_ps(fscal,dz33);
2353
2354             /* Update vectorial force */
2355             fix3             = _mm256_add_ps(fix3,tx);
2356             fiy3             = _mm256_add_ps(fiy3,ty);
2357             fiz3             = _mm256_add_ps(fiz3,tz);
2358
2359             fjx3             = _mm256_add_ps(fjx3,tx);
2360             fjy3             = _mm256_add_ps(fjy3,ty);
2361             fjz3             = _mm256_add_ps(fjz3,tz);
2362
2363             fjptrA             = f+j_coord_offsetA;
2364             fjptrB             = f+j_coord_offsetB;
2365             fjptrC             = f+j_coord_offsetC;
2366             fjptrD             = f+j_coord_offsetD;
2367             fjptrE             = f+j_coord_offsetE;
2368             fjptrF             = f+j_coord_offsetF;
2369             fjptrG             = f+j_coord_offsetG;
2370             fjptrH             = f+j_coord_offsetH;
2371
2372             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2373                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2374                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2375
2376             /* Inner loop uses 402 flops */
2377         }
2378
2379         if(jidx<j_index_end)
2380         {
2381
2382             /* Get j neighbor index, and coordinate index */
2383             jnrlistA         = jjnr[jidx];
2384             jnrlistB         = jjnr[jidx+1];
2385             jnrlistC         = jjnr[jidx+2];
2386             jnrlistD         = jjnr[jidx+3];
2387             jnrlistE         = jjnr[jidx+4];
2388             jnrlistF         = jjnr[jidx+5];
2389             jnrlistG         = jjnr[jidx+6];
2390             jnrlistH         = jjnr[jidx+7];
2391             /* The sign of each jjnr element will be negative for non-real atoms.
2392              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2393              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2394              */
2395             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2396                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2397                                             
2398             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2399             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2400             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2401             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2402             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
2403             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
2404             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
2405             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
2406             j_coord_offsetA  = DIM*jnrA;
2407             j_coord_offsetB  = DIM*jnrB;
2408             j_coord_offsetC  = DIM*jnrC;
2409             j_coord_offsetD  = DIM*jnrD;
2410             j_coord_offsetE  = DIM*jnrE;
2411             j_coord_offsetF  = DIM*jnrF;
2412             j_coord_offsetG  = DIM*jnrG;
2413             j_coord_offsetH  = DIM*jnrH;
2414
2415             /* load j atom coordinates */
2416             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2417                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2418                                                  x+j_coord_offsetE,x+j_coord_offsetF,
2419                                                  x+j_coord_offsetG,x+j_coord_offsetH,
2420                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2421                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
2422
2423             /* Calculate displacement vector */
2424             dx00             = _mm256_sub_ps(ix0,jx0);
2425             dy00             = _mm256_sub_ps(iy0,jy0);
2426             dz00             = _mm256_sub_ps(iz0,jz0);
2427             dx11             = _mm256_sub_ps(ix1,jx1);
2428             dy11             = _mm256_sub_ps(iy1,jy1);
2429             dz11             = _mm256_sub_ps(iz1,jz1);
2430             dx12             = _mm256_sub_ps(ix1,jx2);
2431             dy12             = _mm256_sub_ps(iy1,jy2);
2432             dz12             = _mm256_sub_ps(iz1,jz2);
2433             dx13             = _mm256_sub_ps(ix1,jx3);
2434             dy13             = _mm256_sub_ps(iy1,jy3);
2435             dz13             = _mm256_sub_ps(iz1,jz3);
2436             dx21             = _mm256_sub_ps(ix2,jx1);
2437             dy21             = _mm256_sub_ps(iy2,jy1);
2438             dz21             = _mm256_sub_ps(iz2,jz1);
2439             dx22             = _mm256_sub_ps(ix2,jx2);
2440             dy22             = _mm256_sub_ps(iy2,jy2);
2441             dz22             = _mm256_sub_ps(iz2,jz2);
2442             dx23             = _mm256_sub_ps(ix2,jx3);
2443             dy23             = _mm256_sub_ps(iy2,jy3);
2444             dz23             = _mm256_sub_ps(iz2,jz3);
2445             dx31             = _mm256_sub_ps(ix3,jx1);
2446             dy31             = _mm256_sub_ps(iy3,jy1);
2447             dz31             = _mm256_sub_ps(iz3,jz1);
2448             dx32             = _mm256_sub_ps(ix3,jx2);
2449             dy32             = _mm256_sub_ps(iy3,jy2);
2450             dz32             = _mm256_sub_ps(iz3,jz2);
2451             dx33             = _mm256_sub_ps(ix3,jx3);
2452             dy33             = _mm256_sub_ps(iy3,jy3);
2453             dz33             = _mm256_sub_ps(iz3,jz3);
2454
2455             /* Calculate squared distance and things based on it */
2456             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2457             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2458             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2459             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2460             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2461             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2462             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2463             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2464             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2465             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2466
2467             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
2468             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
2469             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
2470             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
2471             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
2472             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
2473             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
2474             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
2475             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
2476             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
2477
2478             fjx0             = _mm256_setzero_ps();
2479             fjy0             = _mm256_setzero_ps();
2480             fjz0             = _mm256_setzero_ps();
2481             fjx1             = _mm256_setzero_ps();
2482             fjy1             = _mm256_setzero_ps();
2483             fjz1             = _mm256_setzero_ps();
2484             fjx2             = _mm256_setzero_ps();
2485             fjy2             = _mm256_setzero_ps();
2486             fjz2             = _mm256_setzero_ps();
2487             fjx3             = _mm256_setzero_ps();
2488             fjy3             = _mm256_setzero_ps();
2489             fjz3             = _mm256_setzero_ps();
2490
2491             /**************************
2492              * CALCULATE INTERACTIONS *
2493              **************************/
2494
2495             r00              = _mm256_mul_ps(rsq00,rinv00);
2496             r00              = _mm256_andnot_ps(dummy_mask,r00);
2497
2498             /* Calculate table index by multiplying r with table scale and truncate to integer */
2499             rt               = _mm256_mul_ps(r00,vftabscale);
2500             vfitab           = _mm256_cvttps_epi32(rt);
2501             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2502             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2503             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2504             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2505             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2506             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2507
2508             /* CUBIC SPLINE TABLE DISPERSION */
2509             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
2510             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
2511             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2512                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2513             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2514                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2515             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2516                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2517             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2518                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2519             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2520             Heps             = _mm256_mul_ps(vfeps,H);
2521             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2522             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2523             fvdw6            = _mm256_mul_ps(c6_00,FF);
2524
2525             /* CUBIC SPLINE TABLE REPULSION */
2526             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
2527             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
2528             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2529                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2530             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2531                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2532             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2533                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2534             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2535                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2536             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2537             Heps             = _mm256_mul_ps(vfeps,H);
2538             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2539             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2540             fvdw12           = _mm256_mul_ps(c12_00,FF);
2541             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
2542
2543             fscal            = fvdw;
2544
2545             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2546
2547             /* Calculate temporary vectorial force */
2548             tx               = _mm256_mul_ps(fscal,dx00);
2549             ty               = _mm256_mul_ps(fscal,dy00);
2550             tz               = _mm256_mul_ps(fscal,dz00);
2551
2552             /* Update vectorial force */
2553             fix0             = _mm256_add_ps(fix0,tx);
2554             fiy0             = _mm256_add_ps(fiy0,ty);
2555             fiz0             = _mm256_add_ps(fiz0,tz);
2556
2557             fjx0             = _mm256_add_ps(fjx0,tx);
2558             fjy0             = _mm256_add_ps(fjy0,ty);
2559             fjz0             = _mm256_add_ps(fjz0,tz);
2560
2561             /**************************
2562              * CALCULATE INTERACTIONS *
2563              **************************/
2564
2565             r11              = _mm256_mul_ps(rsq11,rinv11);
2566             r11              = _mm256_andnot_ps(dummy_mask,r11);
2567
2568             /* Calculate table index by multiplying r with table scale and truncate to integer */
2569             rt               = _mm256_mul_ps(r11,vftabscale);
2570             vfitab           = _mm256_cvttps_epi32(rt);
2571             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2572             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2573             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2574             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2575             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2576             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2577
2578             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2579             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2580                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2581             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2582                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2583             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2584                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2585             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2586                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2587             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2588             Heps             = _mm256_mul_ps(vfeps,H);
2589             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2590             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2591             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2592
2593             fscal            = felec;
2594
2595             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2596
2597             /* Calculate temporary vectorial force */
2598             tx               = _mm256_mul_ps(fscal,dx11);
2599             ty               = _mm256_mul_ps(fscal,dy11);
2600             tz               = _mm256_mul_ps(fscal,dz11);
2601
2602             /* Update vectorial force */
2603             fix1             = _mm256_add_ps(fix1,tx);
2604             fiy1             = _mm256_add_ps(fiy1,ty);
2605             fiz1             = _mm256_add_ps(fiz1,tz);
2606
2607             fjx1             = _mm256_add_ps(fjx1,tx);
2608             fjy1             = _mm256_add_ps(fjy1,ty);
2609             fjz1             = _mm256_add_ps(fjz1,tz);
2610
2611             /**************************
2612              * CALCULATE INTERACTIONS *
2613              **************************/
2614
2615             r12              = _mm256_mul_ps(rsq12,rinv12);
2616             r12              = _mm256_andnot_ps(dummy_mask,r12);
2617
2618             /* Calculate table index by multiplying r with table scale and truncate to integer */
2619             rt               = _mm256_mul_ps(r12,vftabscale);
2620             vfitab           = _mm256_cvttps_epi32(rt);
2621             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2622             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2623             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2624             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2625             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2626             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2627
2628             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2629             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2630                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2631             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2632                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2633             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2634                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2635             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2636                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2637             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2638             Heps             = _mm256_mul_ps(vfeps,H);
2639             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2640             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2641             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2642
2643             fscal            = felec;
2644
2645             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2646
2647             /* Calculate temporary vectorial force */
2648             tx               = _mm256_mul_ps(fscal,dx12);
2649             ty               = _mm256_mul_ps(fscal,dy12);
2650             tz               = _mm256_mul_ps(fscal,dz12);
2651
2652             /* Update vectorial force */
2653             fix1             = _mm256_add_ps(fix1,tx);
2654             fiy1             = _mm256_add_ps(fiy1,ty);
2655             fiz1             = _mm256_add_ps(fiz1,tz);
2656
2657             fjx2             = _mm256_add_ps(fjx2,tx);
2658             fjy2             = _mm256_add_ps(fjy2,ty);
2659             fjz2             = _mm256_add_ps(fjz2,tz);
2660
2661             /**************************
2662              * CALCULATE INTERACTIONS *
2663              **************************/
2664
2665             r13              = _mm256_mul_ps(rsq13,rinv13);
2666             r13              = _mm256_andnot_ps(dummy_mask,r13);
2667
2668             /* Calculate table index by multiplying r with table scale and truncate to integer */
2669             rt               = _mm256_mul_ps(r13,vftabscale);
2670             vfitab           = _mm256_cvttps_epi32(rt);
2671             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2672             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2673             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2674             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2675             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2676             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2677
2678             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2679             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2680                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2681             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2682                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2683             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2684                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2685             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2686                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2687             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2688             Heps             = _mm256_mul_ps(vfeps,H);
2689             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2690             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2691             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
2692
2693             fscal            = felec;
2694
2695             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2696
2697             /* Calculate temporary vectorial force */
2698             tx               = _mm256_mul_ps(fscal,dx13);
2699             ty               = _mm256_mul_ps(fscal,dy13);
2700             tz               = _mm256_mul_ps(fscal,dz13);
2701
2702             /* Update vectorial force */
2703             fix1             = _mm256_add_ps(fix1,tx);
2704             fiy1             = _mm256_add_ps(fiy1,ty);
2705             fiz1             = _mm256_add_ps(fiz1,tz);
2706
2707             fjx3             = _mm256_add_ps(fjx3,tx);
2708             fjy3             = _mm256_add_ps(fjy3,ty);
2709             fjz3             = _mm256_add_ps(fjz3,tz);
2710
2711             /**************************
2712              * CALCULATE INTERACTIONS *
2713              **************************/
2714
2715             r21              = _mm256_mul_ps(rsq21,rinv21);
2716             r21              = _mm256_andnot_ps(dummy_mask,r21);
2717
2718             /* Calculate table index by multiplying r with table scale and truncate to integer */
2719             rt               = _mm256_mul_ps(r21,vftabscale);
2720             vfitab           = _mm256_cvttps_epi32(rt);
2721             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2722             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2723             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2724             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2725             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2726             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2727
2728             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2729             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2730                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2731             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2732                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2733             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2734                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2735             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2736                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2737             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2738             Heps             = _mm256_mul_ps(vfeps,H);
2739             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2740             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2741             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2742
2743             fscal            = felec;
2744
2745             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2746
2747             /* Calculate temporary vectorial force */
2748             tx               = _mm256_mul_ps(fscal,dx21);
2749             ty               = _mm256_mul_ps(fscal,dy21);
2750             tz               = _mm256_mul_ps(fscal,dz21);
2751
2752             /* Update vectorial force */
2753             fix2             = _mm256_add_ps(fix2,tx);
2754             fiy2             = _mm256_add_ps(fiy2,ty);
2755             fiz2             = _mm256_add_ps(fiz2,tz);
2756
2757             fjx1             = _mm256_add_ps(fjx1,tx);
2758             fjy1             = _mm256_add_ps(fjy1,ty);
2759             fjz1             = _mm256_add_ps(fjz1,tz);
2760
2761             /**************************
2762              * CALCULATE INTERACTIONS *
2763              **************************/
2764
2765             r22              = _mm256_mul_ps(rsq22,rinv22);
2766             r22              = _mm256_andnot_ps(dummy_mask,r22);
2767
2768             /* Calculate table index by multiplying r with table scale and truncate to integer */
2769             rt               = _mm256_mul_ps(r22,vftabscale);
2770             vfitab           = _mm256_cvttps_epi32(rt);
2771             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2772             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2773             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2774             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2775             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2776             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2777
2778             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2779             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2780                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2781             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2782                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2783             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2784                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2785             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2786                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2787             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2788             Heps             = _mm256_mul_ps(vfeps,H);
2789             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2790             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2791             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2792
2793             fscal            = felec;
2794
2795             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2796
2797             /* Calculate temporary vectorial force */
2798             tx               = _mm256_mul_ps(fscal,dx22);
2799             ty               = _mm256_mul_ps(fscal,dy22);
2800             tz               = _mm256_mul_ps(fscal,dz22);
2801
2802             /* Update vectorial force */
2803             fix2             = _mm256_add_ps(fix2,tx);
2804             fiy2             = _mm256_add_ps(fiy2,ty);
2805             fiz2             = _mm256_add_ps(fiz2,tz);
2806
2807             fjx2             = _mm256_add_ps(fjx2,tx);
2808             fjy2             = _mm256_add_ps(fjy2,ty);
2809             fjz2             = _mm256_add_ps(fjz2,tz);
2810
2811             /**************************
2812              * CALCULATE INTERACTIONS *
2813              **************************/
2814
2815             r23              = _mm256_mul_ps(rsq23,rinv23);
2816             r23              = _mm256_andnot_ps(dummy_mask,r23);
2817
2818             /* Calculate table index by multiplying r with table scale and truncate to integer */
2819             rt               = _mm256_mul_ps(r23,vftabscale);
2820             vfitab           = _mm256_cvttps_epi32(rt);
2821             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2822             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2823             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2824             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2825             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2826             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2827
2828             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2829             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2830                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2831             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2832                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2833             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2834                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2835             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2836                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2837             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2838             Heps             = _mm256_mul_ps(vfeps,H);
2839             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2840             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2841             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
2842
2843             fscal            = felec;
2844
2845             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2846
2847             /* Calculate temporary vectorial force */
2848             tx               = _mm256_mul_ps(fscal,dx23);
2849             ty               = _mm256_mul_ps(fscal,dy23);
2850             tz               = _mm256_mul_ps(fscal,dz23);
2851
2852             /* Update vectorial force */
2853             fix2             = _mm256_add_ps(fix2,tx);
2854             fiy2             = _mm256_add_ps(fiy2,ty);
2855             fiz2             = _mm256_add_ps(fiz2,tz);
2856
2857             fjx3             = _mm256_add_ps(fjx3,tx);
2858             fjy3             = _mm256_add_ps(fjy3,ty);
2859             fjz3             = _mm256_add_ps(fjz3,tz);
2860
2861             /**************************
2862              * CALCULATE INTERACTIONS *
2863              **************************/
2864
2865             r31              = _mm256_mul_ps(rsq31,rinv31);
2866             r31              = _mm256_andnot_ps(dummy_mask,r31);
2867
2868             /* Calculate table index by multiplying r with table scale and truncate to integer */
2869             rt               = _mm256_mul_ps(r31,vftabscale);
2870             vfitab           = _mm256_cvttps_epi32(rt);
2871             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2872             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2873             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2874             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2875             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2876             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2877
2878             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2879             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2880                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2881             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2882                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2883             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2884                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2885             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2886                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2887             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2888             Heps             = _mm256_mul_ps(vfeps,H);
2889             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2890             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2891             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
2892
2893             fscal            = felec;
2894
2895             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2896
2897             /* Calculate temporary vectorial force */
2898             tx               = _mm256_mul_ps(fscal,dx31);
2899             ty               = _mm256_mul_ps(fscal,dy31);
2900             tz               = _mm256_mul_ps(fscal,dz31);
2901
2902             /* Update vectorial force */
2903             fix3             = _mm256_add_ps(fix3,tx);
2904             fiy3             = _mm256_add_ps(fiy3,ty);
2905             fiz3             = _mm256_add_ps(fiz3,tz);
2906
2907             fjx1             = _mm256_add_ps(fjx1,tx);
2908             fjy1             = _mm256_add_ps(fjy1,ty);
2909             fjz1             = _mm256_add_ps(fjz1,tz);
2910
2911             /**************************
2912              * CALCULATE INTERACTIONS *
2913              **************************/
2914
2915             r32              = _mm256_mul_ps(rsq32,rinv32);
2916             r32              = _mm256_andnot_ps(dummy_mask,r32);
2917
2918             /* Calculate table index by multiplying r with table scale and truncate to integer */
2919             rt               = _mm256_mul_ps(r32,vftabscale);
2920             vfitab           = _mm256_cvttps_epi32(rt);
2921             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2922             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2923             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2924             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2925             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2926             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2927
2928             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2929             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2930                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2931             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2932                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2933             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2934                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2935             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2936                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2937             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2938             Heps             = _mm256_mul_ps(vfeps,H);
2939             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2940             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2941             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
2942
2943             fscal            = felec;
2944
2945             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2946
2947             /* Calculate temporary vectorial force */
2948             tx               = _mm256_mul_ps(fscal,dx32);
2949             ty               = _mm256_mul_ps(fscal,dy32);
2950             tz               = _mm256_mul_ps(fscal,dz32);
2951
2952             /* Update vectorial force */
2953             fix3             = _mm256_add_ps(fix3,tx);
2954             fiy3             = _mm256_add_ps(fiy3,ty);
2955             fiz3             = _mm256_add_ps(fiz3,tz);
2956
2957             fjx2             = _mm256_add_ps(fjx2,tx);
2958             fjy2             = _mm256_add_ps(fjy2,ty);
2959             fjz2             = _mm256_add_ps(fjz2,tz);
2960
2961             /**************************
2962              * CALCULATE INTERACTIONS *
2963              **************************/
2964
2965             r33              = _mm256_mul_ps(rsq33,rinv33);
2966             r33              = _mm256_andnot_ps(dummy_mask,r33);
2967
2968             /* Calculate table index by multiplying r with table scale and truncate to integer */
2969             rt               = _mm256_mul_ps(r33,vftabscale);
2970             vfitab           = _mm256_cvttps_epi32(rt);
2971             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2972             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2973             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2974             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2975             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2976             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2977
2978             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2979             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2980                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2981             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2982                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2983             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2984                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2985             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2986                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2987             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2988             Heps             = _mm256_mul_ps(vfeps,H);
2989             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2990             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2991             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
2992
2993             fscal            = felec;
2994
2995             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2996
2997             /* Calculate temporary vectorial force */
2998             tx               = _mm256_mul_ps(fscal,dx33);
2999             ty               = _mm256_mul_ps(fscal,dy33);
3000             tz               = _mm256_mul_ps(fscal,dz33);
3001
3002             /* Update vectorial force */
3003             fix3             = _mm256_add_ps(fix3,tx);
3004             fiy3             = _mm256_add_ps(fiy3,ty);
3005             fiz3             = _mm256_add_ps(fiz3,tz);
3006
3007             fjx3             = _mm256_add_ps(fjx3,tx);
3008             fjy3             = _mm256_add_ps(fjy3,ty);
3009             fjz3             = _mm256_add_ps(fjz3,tz);
3010
3011             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3012             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3013             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3014             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3015             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
3016             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
3017             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
3018             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
3019
3020             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
3021                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3022                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3023
3024             /* Inner loop uses 412 flops */
3025         }
3026
3027         /* End of innermost loop */
3028
3029         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3030                                                  f+i_coord_offset,fshift+i_shift_offset);
3031
3032         /* Increment number of inner iterations */
3033         inneriter                  += j_index_end - j_index_start;
3034
3035         /* Outer loop uses 24 flops */
3036     }
3037
3038     /* Increment number of outer iterations */
3039     outeriter        += nri;
3040
3041     /* Update outer/inner flops */
3042
3043     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*412);
3044 }