Merge release-5-0 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_single
54  * Electrostatics interaction: CubicSplineTable
55  * VdW interaction:            CubicSplineTable
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. the eight different
72      * jnr indices corresponding to data put in the eight positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrE,jnrF,jnrG,jnrH;
78     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
83     real             rcutoff_scalar;
84     real             *shiftvec,*fshift,*x,*f;
85     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
86     real             scratch[4*DIM];
87     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88     real *           vdwioffsetptr0;
89     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90     real *           vdwioffsetptr1;
91     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92     real *           vdwioffsetptr2;
93     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
95     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
97     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
99     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
101     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
102     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
103     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
104     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
105     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
106     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
107     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
108     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
109     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
110     real             *charge;
111     int              nvdwtype;
112     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113     int              *vdwtype;
114     real             *vdwparam;
115     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
116     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
117     __m256i          vfitab;
118     __m128i          vfitab_lo,vfitab_hi;
119     __m128i          ifour       = _mm_set1_epi32(4);
120     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
121     real             *vftab;
122     __m256           dummy_mask,cutoff_mask;
123     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
124     __m256           one     = _mm256_set1_ps(1.0);
125     __m256           two     = _mm256_set1_ps(2.0);
126     x                = xx[0];
127     f                = ff[0];
128
129     nri              = nlist->nri;
130     iinr             = nlist->iinr;
131     jindex           = nlist->jindex;
132     jjnr             = nlist->jjnr;
133     shiftidx         = nlist->shift;
134     gid              = nlist->gid;
135     shiftvec         = fr->shift_vec[0];
136     fshift           = fr->fshift[0];
137     facel            = _mm256_set1_ps(fr->epsfac);
138     charge           = mdatoms->chargeA;
139     nvdwtype         = fr->ntype;
140     vdwparam         = fr->nbfp;
141     vdwtype          = mdatoms->typeA;
142
143     vftab            = kernel_data->table_elec_vdw->data;
144     vftabscale       = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
145
146     /* Setup water-specific parameters */
147     inr              = nlist->iinr[0];
148     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
149     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
150     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
151     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
152
153     jq0              = _mm256_set1_ps(charge[inr+0]);
154     jq1              = _mm256_set1_ps(charge[inr+1]);
155     jq2              = _mm256_set1_ps(charge[inr+2]);
156     vdwjidx0A        = 2*vdwtype[inr+0];
157     qq00             = _mm256_mul_ps(iq0,jq0);
158     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
159     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
160     qq01             = _mm256_mul_ps(iq0,jq1);
161     qq02             = _mm256_mul_ps(iq0,jq2);
162     qq10             = _mm256_mul_ps(iq1,jq0);
163     qq11             = _mm256_mul_ps(iq1,jq1);
164     qq12             = _mm256_mul_ps(iq1,jq2);
165     qq20             = _mm256_mul_ps(iq2,jq0);
166     qq21             = _mm256_mul_ps(iq2,jq1);
167     qq22             = _mm256_mul_ps(iq2,jq2);
168
169     /* Avoid stupid compiler warnings */
170     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
171     j_coord_offsetA = 0;
172     j_coord_offsetB = 0;
173     j_coord_offsetC = 0;
174     j_coord_offsetD = 0;
175     j_coord_offsetE = 0;
176     j_coord_offsetF = 0;
177     j_coord_offsetG = 0;
178     j_coord_offsetH = 0;
179
180     outeriter        = 0;
181     inneriter        = 0;
182
183     for(iidx=0;iidx<4*DIM;iidx++)
184     {
185         scratch[iidx] = 0.0;
186     }
187
188     /* Start outer loop over neighborlists */
189     for(iidx=0; iidx<nri; iidx++)
190     {
191         /* Load shift vector for this list */
192         i_shift_offset   = DIM*shiftidx[iidx];
193
194         /* Load limits for loop over neighbors */
195         j_index_start    = jindex[iidx];
196         j_index_end      = jindex[iidx+1];
197
198         /* Get outer coordinate index */
199         inr              = iinr[iidx];
200         i_coord_offset   = DIM*inr;
201
202         /* Load i particle coords and add shift vector */
203         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
204                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
205
206         fix0             = _mm256_setzero_ps();
207         fiy0             = _mm256_setzero_ps();
208         fiz0             = _mm256_setzero_ps();
209         fix1             = _mm256_setzero_ps();
210         fiy1             = _mm256_setzero_ps();
211         fiz1             = _mm256_setzero_ps();
212         fix2             = _mm256_setzero_ps();
213         fiy2             = _mm256_setzero_ps();
214         fiz2             = _mm256_setzero_ps();
215
216         /* Reset potential sums */
217         velecsum         = _mm256_setzero_ps();
218         vvdwsum          = _mm256_setzero_ps();
219
220         /* Start inner kernel loop */
221         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
222         {
223
224             /* Get j neighbor index, and coordinate index */
225             jnrA             = jjnr[jidx];
226             jnrB             = jjnr[jidx+1];
227             jnrC             = jjnr[jidx+2];
228             jnrD             = jjnr[jidx+3];
229             jnrE             = jjnr[jidx+4];
230             jnrF             = jjnr[jidx+5];
231             jnrG             = jjnr[jidx+6];
232             jnrH             = jjnr[jidx+7];
233             j_coord_offsetA  = DIM*jnrA;
234             j_coord_offsetB  = DIM*jnrB;
235             j_coord_offsetC  = DIM*jnrC;
236             j_coord_offsetD  = DIM*jnrD;
237             j_coord_offsetE  = DIM*jnrE;
238             j_coord_offsetF  = DIM*jnrF;
239             j_coord_offsetG  = DIM*jnrG;
240             j_coord_offsetH  = DIM*jnrH;
241
242             /* load j atom coordinates */
243             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
244                                                  x+j_coord_offsetC,x+j_coord_offsetD,
245                                                  x+j_coord_offsetE,x+j_coord_offsetF,
246                                                  x+j_coord_offsetG,x+j_coord_offsetH,
247                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
248
249             /* Calculate displacement vector */
250             dx00             = _mm256_sub_ps(ix0,jx0);
251             dy00             = _mm256_sub_ps(iy0,jy0);
252             dz00             = _mm256_sub_ps(iz0,jz0);
253             dx01             = _mm256_sub_ps(ix0,jx1);
254             dy01             = _mm256_sub_ps(iy0,jy1);
255             dz01             = _mm256_sub_ps(iz0,jz1);
256             dx02             = _mm256_sub_ps(ix0,jx2);
257             dy02             = _mm256_sub_ps(iy0,jy2);
258             dz02             = _mm256_sub_ps(iz0,jz2);
259             dx10             = _mm256_sub_ps(ix1,jx0);
260             dy10             = _mm256_sub_ps(iy1,jy0);
261             dz10             = _mm256_sub_ps(iz1,jz0);
262             dx11             = _mm256_sub_ps(ix1,jx1);
263             dy11             = _mm256_sub_ps(iy1,jy1);
264             dz11             = _mm256_sub_ps(iz1,jz1);
265             dx12             = _mm256_sub_ps(ix1,jx2);
266             dy12             = _mm256_sub_ps(iy1,jy2);
267             dz12             = _mm256_sub_ps(iz1,jz2);
268             dx20             = _mm256_sub_ps(ix2,jx0);
269             dy20             = _mm256_sub_ps(iy2,jy0);
270             dz20             = _mm256_sub_ps(iz2,jz0);
271             dx21             = _mm256_sub_ps(ix2,jx1);
272             dy21             = _mm256_sub_ps(iy2,jy1);
273             dz21             = _mm256_sub_ps(iz2,jz1);
274             dx22             = _mm256_sub_ps(ix2,jx2);
275             dy22             = _mm256_sub_ps(iy2,jy2);
276             dz22             = _mm256_sub_ps(iz2,jz2);
277
278             /* Calculate squared distance and things based on it */
279             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
280             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
281             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
282             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
283             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
284             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
285             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
286             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
287             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
288
289             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
290             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
291             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
292             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
293             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
294             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
295             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
296             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
297             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
298
299             fjx0             = _mm256_setzero_ps();
300             fjy0             = _mm256_setzero_ps();
301             fjz0             = _mm256_setzero_ps();
302             fjx1             = _mm256_setzero_ps();
303             fjy1             = _mm256_setzero_ps();
304             fjz1             = _mm256_setzero_ps();
305             fjx2             = _mm256_setzero_ps();
306             fjy2             = _mm256_setzero_ps();
307             fjz2             = _mm256_setzero_ps();
308
309             /**************************
310              * CALCULATE INTERACTIONS *
311              **************************/
312
313             r00              = _mm256_mul_ps(rsq00,rinv00);
314
315             /* Calculate table index by multiplying r with table scale and truncate to integer */
316             rt               = _mm256_mul_ps(r00,vftabscale);
317             vfitab           = _mm256_cvttps_epi32(rt);
318             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
319             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
320             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
321             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
322             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
323             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
324
325             /* CUBIC SPLINE TABLE ELECTROSTATICS */
326             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
327                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
328             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
329                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
330             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
331                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
332             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
333                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
334             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
335             Heps             = _mm256_mul_ps(vfeps,H);
336             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
337             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
338             velec            = _mm256_mul_ps(qq00,VV);
339             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
340             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
341
342             /* CUBIC SPLINE TABLE DISPERSION */
343             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
344             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
345             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
346                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
347             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
348                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
349             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
350                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
351             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
352                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
353             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
354             Heps             = _mm256_mul_ps(vfeps,H);
355             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
356             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
357             vvdw6            = _mm256_mul_ps(c6_00,VV);
358             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
359             fvdw6            = _mm256_mul_ps(c6_00,FF);
360
361             /* CUBIC SPLINE TABLE REPULSION */
362             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
363             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
364             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
365                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
366             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
367                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
368             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
369                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
370             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
371                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
372             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
373             Heps             = _mm256_mul_ps(vfeps,H);
374             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
375             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
376             vvdw12           = _mm256_mul_ps(c12_00,VV);
377             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
378             fvdw12           = _mm256_mul_ps(c12_00,FF);
379             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
380             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
381
382             /* Update potential sum for this i atom from the interaction with this j atom. */
383             velecsum         = _mm256_add_ps(velecsum,velec);
384             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
385
386             fscal            = _mm256_add_ps(felec,fvdw);
387
388             /* Calculate temporary vectorial force */
389             tx               = _mm256_mul_ps(fscal,dx00);
390             ty               = _mm256_mul_ps(fscal,dy00);
391             tz               = _mm256_mul_ps(fscal,dz00);
392
393             /* Update vectorial force */
394             fix0             = _mm256_add_ps(fix0,tx);
395             fiy0             = _mm256_add_ps(fiy0,ty);
396             fiz0             = _mm256_add_ps(fiz0,tz);
397
398             fjx0             = _mm256_add_ps(fjx0,tx);
399             fjy0             = _mm256_add_ps(fjy0,ty);
400             fjz0             = _mm256_add_ps(fjz0,tz);
401
402             /**************************
403              * CALCULATE INTERACTIONS *
404              **************************/
405
406             r01              = _mm256_mul_ps(rsq01,rinv01);
407
408             /* Calculate table index by multiplying r with table scale and truncate to integer */
409             rt               = _mm256_mul_ps(r01,vftabscale);
410             vfitab           = _mm256_cvttps_epi32(rt);
411             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
412             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
413             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
414             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
415             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
416             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
417
418             /* CUBIC SPLINE TABLE ELECTROSTATICS */
419             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
420                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
421             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
422                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
423             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
424                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
425             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
426                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
427             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
428             Heps             = _mm256_mul_ps(vfeps,H);
429             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
430             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
431             velec            = _mm256_mul_ps(qq01,VV);
432             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
433             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
434
435             /* Update potential sum for this i atom from the interaction with this j atom. */
436             velecsum         = _mm256_add_ps(velecsum,velec);
437
438             fscal            = felec;
439
440             /* Calculate temporary vectorial force */
441             tx               = _mm256_mul_ps(fscal,dx01);
442             ty               = _mm256_mul_ps(fscal,dy01);
443             tz               = _mm256_mul_ps(fscal,dz01);
444
445             /* Update vectorial force */
446             fix0             = _mm256_add_ps(fix0,tx);
447             fiy0             = _mm256_add_ps(fiy0,ty);
448             fiz0             = _mm256_add_ps(fiz0,tz);
449
450             fjx1             = _mm256_add_ps(fjx1,tx);
451             fjy1             = _mm256_add_ps(fjy1,ty);
452             fjz1             = _mm256_add_ps(fjz1,tz);
453
454             /**************************
455              * CALCULATE INTERACTIONS *
456              **************************/
457
458             r02              = _mm256_mul_ps(rsq02,rinv02);
459
460             /* Calculate table index by multiplying r with table scale and truncate to integer */
461             rt               = _mm256_mul_ps(r02,vftabscale);
462             vfitab           = _mm256_cvttps_epi32(rt);
463             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
464             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
465             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
466             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
467             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
468             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
469
470             /* CUBIC SPLINE TABLE ELECTROSTATICS */
471             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
472                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
473             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
474                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
475             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
476                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
477             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
478                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
479             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
480             Heps             = _mm256_mul_ps(vfeps,H);
481             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
482             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
483             velec            = _mm256_mul_ps(qq02,VV);
484             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
485             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
486
487             /* Update potential sum for this i atom from the interaction with this j atom. */
488             velecsum         = _mm256_add_ps(velecsum,velec);
489
490             fscal            = felec;
491
492             /* Calculate temporary vectorial force */
493             tx               = _mm256_mul_ps(fscal,dx02);
494             ty               = _mm256_mul_ps(fscal,dy02);
495             tz               = _mm256_mul_ps(fscal,dz02);
496
497             /* Update vectorial force */
498             fix0             = _mm256_add_ps(fix0,tx);
499             fiy0             = _mm256_add_ps(fiy0,ty);
500             fiz0             = _mm256_add_ps(fiz0,tz);
501
502             fjx2             = _mm256_add_ps(fjx2,tx);
503             fjy2             = _mm256_add_ps(fjy2,ty);
504             fjz2             = _mm256_add_ps(fjz2,tz);
505
506             /**************************
507              * CALCULATE INTERACTIONS *
508              **************************/
509
510             r10              = _mm256_mul_ps(rsq10,rinv10);
511
512             /* Calculate table index by multiplying r with table scale and truncate to integer */
513             rt               = _mm256_mul_ps(r10,vftabscale);
514             vfitab           = _mm256_cvttps_epi32(rt);
515             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
516             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
517             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
518             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
519             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
520             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
521
522             /* CUBIC SPLINE TABLE ELECTROSTATICS */
523             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
524                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
525             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
526                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
527             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
528                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
529             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
530                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
531             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
532             Heps             = _mm256_mul_ps(vfeps,H);
533             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
534             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
535             velec            = _mm256_mul_ps(qq10,VV);
536             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
537             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
538
539             /* Update potential sum for this i atom from the interaction with this j atom. */
540             velecsum         = _mm256_add_ps(velecsum,velec);
541
542             fscal            = felec;
543
544             /* Calculate temporary vectorial force */
545             tx               = _mm256_mul_ps(fscal,dx10);
546             ty               = _mm256_mul_ps(fscal,dy10);
547             tz               = _mm256_mul_ps(fscal,dz10);
548
549             /* Update vectorial force */
550             fix1             = _mm256_add_ps(fix1,tx);
551             fiy1             = _mm256_add_ps(fiy1,ty);
552             fiz1             = _mm256_add_ps(fiz1,tz);
553
554             fjx0             = _mm256_add_ps(fjx0,tx);
555             fjy0             = _mm256_add_ps(fjy0,ty);
556             fjz0             = _mm256_add_ps(fjz0,tz);
557
558             /**************************
559              * CALCULATE INTERACTIONS *
560              **************************/
561
562             r11              = _mm256_mul_ps(rsq11,rinv11);
563
564             /* Calculate table index by multiplying r with table scale and truncate to integer */
565             rt               = _mm256_mul_ps(r11,vftabscale);
566             vfitab           = _mm256_cvttps_epi32(rt);
567             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
568             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
569             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
570             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
571             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
572             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
573
574             /* CUBIC SPLINE TABLE ELECTROSTATICS */
575             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
576                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
577             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
578                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
579             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
580                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
581             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
582                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
583             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
584             Heps             = _mm256_mul_ps(vfeps,H);
585             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
586             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
587             velec            = _mm256_mul_ps(qq11,VV);
588             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
589             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
590
591             /* Update potential sum for this i atom from the interaction with this j atom. */
592             velecsum         = _mm256_add_ps(velecsum,velec);
593
594             fscal            = felec;
595
596             /* Calculate temporary vectorial force */
597             tx               = _mm256_mul_ps(fscal,dx11);
598             ty               = _mm256_mul_ps(fscal,dy11);
599             tz               = _mm256_mul_ps(fscal,dz11);
600
601             /* Update vectorial force */
602             fix1             = _mm256_add_ps(fix1,tx);
603             fiy1             = _mm256_add_ps(fiy1,ty);
604             fiz1             = _mm256_add_ps(fiz1,tz);
605
606             fjx1             = _mm256_add_ps(fjx1,tx);
607             fjy1             = _mm256_add_ps(fjy1,ty);
608             fjz1             = _mm256_add_ps(fjz1,tz);
609
610             /**************************
611              * CALCULATE INTERACTIONS *
612              **************************/
613
614             r12              = _mm256_mul_ps(rsq12,rinv12);
615
616             /* Calculate table index by multiplying r with table scale and truncate to integer */
617             rt               = _mm256_mul_ps(r12,vftabscale);
618             vfitab           = _mm256_cvttps_epi32(rt);
619             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
620             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
621             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
622             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
623             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
624             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
625
626             /* CUBIC SPLINE TABLE ELECTROSTATICS */
627             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
628                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
629             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
630                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
631             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
632                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
633             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
634                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
635             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
636             Heps             = _mm256_mul_ps(vfeps,H);
637             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
638             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
639             velec            = _mm256_mul_ps(qq12,VV);
640             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
641             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
642
643             /* Update potential sum for this i atom from the interaction with this j atom. */
644             velecsum         = _mm256_add_ps(velecsum,velec);
645
646             fscal            = felec;
647
648             /* Calculate temporary vectorial force */
649             tx               = _mm256_mul_ps(fscal,dx12);
650             ty               = _mm256_mul_ps(fscal,dy12);
651             tz               = _mm256_mul_ps(fscal,dz12);
652
653             /* Update vectorial force */
654             fix1             = _mm256_add_ps(fix1,tx);
655             fiy1             = _mm256_add_ps(fiy1,ty);
656             fiz1             = _mm256_add_ps(fiz1,tz);
657
658             fjx2             = _mm256_add_ps(fjx2,tx);
659             fjy2             = _mm256_add_ps(fjy2,ty);
660             fjz2             = _mm256_add_ps(fjz2,tz);
661
662             /**************************
663              * CALCULATE INTERACTIONS *
664              **************************/
665
666             r20              = _mm256_mul_ps(rsq20,rinv20);
667
668             /* Calculate table index by multiplying r with table scale and truncate to integer */
669             rt               = _mm256_mul_ps(r20,vftabscale);
670             vfitab           = _mm256_cvttps_epi32(rt);
671             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
672             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
673             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
674             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
675             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
676             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
677
678             /* CUBIC SPLINE TABLE ELECTROSTATICS */
679             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
680                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
681             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
682                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
683             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
684                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
685             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
686                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
687             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
688             Heps             = _mm256_mul_ps(vfeps,H);
689             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
690             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
691             velec            = _mm256_mul_ps(qq20,VV);
692             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
693             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
694
695             /* Update potential sum for this i atom from the interaction with this j atom. */
696             velecsum         = _mm256_add_ps(velecsum,velec);
697
698             fscal            = felec;
699
700             /* Calculate temporary vectorial force */
701             tx               = _mm256_mul_ps(fscal,dx20);
702             ty               = _mm256_mul_ps(fscal,dy20);
703             tz               = _mm256_mul_ps(fscal,dz20);
704
705             /* Update vectorial force */
706             fix2             = _mm256_add_ps(fix2,tx);
707             fiy2             = _mm256_add_ps(fiy2,ty);
708             fiz2             = _mm256_add_ps(fiz2,tz);
709
710             fjx0             = _mm256_add_ps(fjx0,tx);
711             fjy0             = _mm256_add_ps(fjy0,ty);
712             fjz0             = _mm256_add_ps(fjz0,tz);
713
714             /**************************
715              * CALCULATE INTERACTIONS *
716              **************************/
717
718             r21              = _mm256_mul_ps(rsq21,rinv21);
719
720             /* Calculate table index by multiplying r with table scale and truncate to integer */
721             rt               = _mm256_mul_ps(r21,vftabscale);
722             vfitab           = _mm256_cvttps_epi32(rt);
723             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
724             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
725             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
726             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
727             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
728             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
729
730             /* CUBIC SPLINE TABLE ELECTROSTATICS */
731             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
732                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
733             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
734                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
735             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
736                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
737             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
738                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
739             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
740             Heps             = _mm256_mul_ps(vfeps,H);
741             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
742             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
743             velec            = _mm256_mul_ps(qq21,VV);
744             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
745             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
746
747             /* Update potential sum for this i atom from the interaction with this j atom. */
748             velecsum         = _mm256_add_ps(velecsum,velec);
749
750             fscal            = felec;
751
752             /* Calculate temporary vectorial force */
753             tx               = _mm256_mul_ps(fscal,dx21);
754             ty               = _mm256_mul_ps(fscal,dy21);
755             tz               = _mm256_mul_ps(fscal,dz21);
756
757             /* Update vectorial force */
758             fix2             = _mm256_add_ps(fix2,tx);
759             fiy2             = _mm256_add_ps(fiy2,ty);
760             fiz2             = _mm256_add_ps(fiz2,tz);
761
762             fjx1             = _mm256_add_ps(fjx1,tx);
763             fjy1             = _mm256_add_ps(fjy1,ty);
764             fjz1             = _mm256_add_ps(fjz1,tz);
765
766             /**************************
767              * CALCULATE INTERACTIONS *
768              **************************/
769
770             r22              = _mm256_mul_ps(rsq22,rinv22);
771
772             /* Calculate table index by multiplying r with table scale and truncate to integer */
773             rt               = _mm256_mul_ps(r22,vftabscale);
774             vfitab           = _mm256_cvttps_epi32(rt);
775             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
776             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
777             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
778             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
779             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
780             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
781
782             /* CUBIC SPLINE TABLE ELECTROSTATICS */
783             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
784                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
785             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
786                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
787             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
788                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
789             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
790                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
791             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
792             Heps             = _mm256_mul_ps(vfeps,H);
793             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
794             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
795             velec            = _mm256_mul_ps(qq22,VV);
796             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
797             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
798
799             /* Update potential sum for this i atom from the interaction with this j atom. */
800             velecsum         = _mm256_add_ps(velecsum,velec);
801
802             fscal            = felec;
803
804             /* Calculate temporary vectorial force */
805             tx               = _mm256_mul_ps(fscal,dx22);
806             ty               = _mm256_mul_ps(fscal,dy22);
807             tz               = _mm256_mul_ps(fscal,dz22);
808
809             /* Update vectorial force */
810             fix2             = _mm256_add_ps(fix2,tx);
811             fiy2             = _mm256_add_ps(fiy2,ty);
812             fiz2             = _mm256_add_ps(fiz2,tz);
813
814             fjx2             = _mm256_add_ps(fjx2,tx);
815             fjy2             = _mm256_add_ps(fjy2,ty);
816             fjz2             = _mm256_add_ps(fjz2,tz);
817
818             fjptrA             = f+j_coord_offsetA;
819             fjptrB             = f+j_coord_offsetB;
820             fjptrC             = f+j_coord_offsetC;
821             fjptrD             = f+j_coord_offsetD;
822             fjptrE             = f+j_coord_offsetE;
823             fjptrF             = f+j_coord_offsetF;
824             fjptrG             = f+j_coord_offsetG;
825             fjptrH             = f+j_coord_offsetH;
826
827             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
828                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
829
830             /* Inner loop uses 417 flops */
831         }
832
833         if(jidx<j_index_end)
834         {
835
836             /* Get j neighbor index, and coordinate index */
837             jnrlistA         = jjnr[jidx];
838             jnrlistB         = jjnr[jidx+1];
839             jnrlistC         = jjnr[jidx+2];
840             jnrlistD         = jjnr[jidx+3];
841             jnrlistE         = jjnr[jidx+4];
842             jnrlistF         = jjnr[jidx+5];
843             jnrlistG         = jjnr[jidx+6];
844             jnrlistH         = jjnr[jidx+7];
845             /* Sign of each element will be negative for non-real atoms.
846              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
847              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
848              */
849             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
850                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
851                                             
852             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
853             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
854             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
855             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
856             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
857             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
858             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
859             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
860             j_coord_offsetA  = DIM*jnrA;
861             j_coord_offsetB  = DIM*jnrB;
862             j_coord_offsetC  = DIM*jnrC;
863             j_coord_offsetD  = DIM*jnrD;
864             j_coord_offsetE  = DIM*jnrE;
865             j_coord_offsetF  = DIM*jnrF;
866             j_coord_offsetG  = DIM*jnrG;
867             j_coord_offsetH  = DIM*jnrH;
868
869             /* load j atom coordinates */
870             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
871                                                  x+j_coord_offsetC,x+j_coord_offsetD,
872                                                  x+j_coord_offsetE,x+j_coord_offsetF,
873                                                  x+j_coord_offsetG,x+j_coord_offsetH,
874                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
875
876             /* Calculate displacement vector */
877             dx00             = _mm256_sub_ps(ix0,jx0);
878             dy00             = _mm256_sub_ps(iy0,jy0);
879             dz00             = _mm256_sub_ps(iz0,jz0);
880             dx01             = _mm256_sub_ps(ix0,jx1);
881             dy01             = _mm256_sub_ps(iy0,jy1);
882             dz01             = _mm256_sub_ps(iz0,jz1);
883             dx02             = _mm256_sub_ps(ix0,jx2);
884             dy02             = _mm256_sub_ps(iy0,jy2);
885             dz02             = _mm256_sub_ps(iz0,jz2);
886             dx10             = _mm256_sub_ps(ix1,jx0);
887             dy10             = _mm256_sub_ps(iy1,jy0);
888             dz10             = _mm256_sub_ps(iz1,jz0);
889             dx11             = _mm256_sub_ps(ix1,jx1);
890             dy11             = _mm256_sub_ps(iy1,jy1);
891             dz11             = _mm256_sub_ps(iz1,jz1);
892             dx12             = _mm256_sub_ps(ix1,jx2);
893             dy12             = _mm256_sub_ps(iy1,jy2);
894             dz12             = _mm256_sub_ps(iz1,jz2);
895             dx20             = _mm256_sub_ps(ix2,jx0);
896             dy20             = _mm256_sub_ps(iy2,jy0);
897             dz20             = _mm256_sub_ps(iz2,jz0);
898             dx21             = _mm256_sub_ps(ix2,jx1);
899             dy21             = _mm256_sub_ps(iy2,jy1);
900             dz21             = _mm256_sub_ps(iz2,jz1);
901             dx22             = _mm256_sub_ps(ix2,jx2);
902             dy22             = _mm256_sub_ps(iy2,jy2);
903             dz22             = _mm256_sub_ps(iz2,jz2);
904
905             /* Calculate squared distance and things based on it */
906             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
907             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
908             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
909             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
910             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
911             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
912             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
913             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
914             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
915
916             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
917             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
918             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
919             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
920             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
921             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
922             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
923             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
924             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
925
926             fjx0             = _mm256_setzero_ps();
927             fjy0             = _mm256_setzero_ps();
928             fjz0             = _mm256_setzero_ps();
929             fjx1             = _mm256_setzero_ps();
930             fjy1             = _mm256_setzero_ps();
931             fjz1             = _mm256_setzero_ps();
932             fjx2             = _mm256_setzero_ps();
933             fjy2             = _mm256_setzero_ps();
934             fjz2             = _mm256_setzero_ps();
935
936             /**************************
937              * CALCULATE INTERACTIONS *
938              **************************/
939
940             r00              = _mm256_mul_ps(rsq00,rinv00);
941             r00              = _mm256_andnot_ps(dummy_mask,r00);
942
943             /* Calculate table index by multiplying r with table scale and truncate to integer */
944             rt               = _mm256_mul_ps(r00,vftabscale);
945             vfitab           = _mm256_cvttps_epi32(rt);
946             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
947             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
948             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
949             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
950             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
951             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
952
953             /* CUBIC SPLINE TABLE ELECTROSTATICS */
954             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
955                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
956             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
957                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
958             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
959                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
960             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
961                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
962             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
963             Heps             = _mm256_mul_ps(vfeps,H);
964             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
965             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
966             velec            = _mm256_mul_ps(qq00,VV);
967             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
968             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
969
970             /* CUBIC SPLINE TABLE DISPERSION */
971             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
972             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
973             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
974                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
975             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
976                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
977             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
978                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
979             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
980                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
981             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
982             Heps             = _mm256_mul_ps(vfeps,H);
983             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
984             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
985             vvdw6            = _mm256_mul_ps(c6_00,VV);
986             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
987             fvdw6            = _mm256_mul_ps(c6_00,FF);
988
989             /* CUBIC SPLINE TABLE REPULSION */
990             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
991             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
992             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
993                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
994             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
995                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
996             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
997                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
998             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
999                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1000             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1001             Heps             = _mm256_mul_ps(vfeps,H);
1002             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1003             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1004             vvdw12           = _mm256_mul_ps(c12_00,VV);
1005             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1006             fvdw12           = _mm256_mul_ps(c12_00,FF);
1007             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
1008             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1009
1010             /* Update potential sum for this i atom from the interaction with this j atom. */
1011             velec            = _mm256_andnot_ps(dummy_mask,velec);
1012             velecsum         = _mm256_add_ps(velecsum,velec);
1013             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
1014             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
1015
1016             fscal            = _mm256_add_ps(felec,fvdw);
1017
1018             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1019
1020             /* Calculate temporary vectorial force */
1021             tx               = _mm256_mul_ps(fscal,dx00);
1022             ty               = _mm256_mul_ps(fscal,dy00);
1023             tz               = _mm256_mul_ps(fscal,dz00);
1024
1025             /* Update vectorial force */
1026             fix0             = _mm256_add_ps(fix0,tx);
1027             fiy0             = _mm256_add_ps(fiy0,ty);
1028             fiz0             = _mm256_add_ps(fiz0,tz);
1029
1030             fjx0             = _mm256_add_ps(fjx0,tx);
1031             fjy0             = _mm256_add_ps(fjy0,ty);
1032             fjz0             = _mm256_add_ps(fjz0,tz);
1033
1034             /**************************
1035              * CALCULATE INTERACTIONS *
1036              **************************/
1037
1038             r01              = _mm256_mul_ps(rsq01,rinv01);
1039             r01              = _mm256_andnot_ps(dummy_mask,r01);
1040
1041             /* Calculate table index by multiplying r with table scale and truncate to integer */
1042             rt               = _mm256_mul_ps(r01,vftabscale);
1043             vfitab           = _mm256_cvttps_epi32(rt);
1044             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1045             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1046             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1047             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1048             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1049             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1050
1051             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1052             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1053                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1054             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1055                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1056             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1057                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1058             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1059                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1060             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1061             Heps             = _mm256_mul_ps(vfeps,H);
1062             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1063             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1064             velec            = _mm256_mul_ps(qq01,VV);
1065             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1066             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1067
1068             /* Update potential sum for this i atom from the interaction with this j atom. */
1069             velec            = _mm256_andnot_ps(dummy_mask,velec);
1070             velecsum         = _mm256_add_ps(velecsum,velec);
1071
1072             fscal            = felec;
1073
1074             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1075
1076             /* Calculate temporary vectorial force */
1077             tx               = _mm256_mul_ps(fscal,dx01);
1078             ty               = _mm256_mul_ps(fscal,dy01);
1079             tz               = _mm256_mul_ps(fscal,dz01);
1080
1081             /* Update vectorial force */
1082             fix0             = _mm256_add_ps(fix0,tx);
1083             fiy0             = _mm256_add_ps(fiy0,ty);
1084             fiz0             = _mm256_add_ps(fiz0,tz);
1085
1086             fjx1             = _mm256_add_ps(fjx1,tx);
1087             fjy1             = _mm256_add_ps(fjy1,ty);
1088             fjz1             = _mm256_add_ps(fjz1,tz);
1089
1090             /**************************
1091              * CALCULATE INTERACTIONS *
1092              **************************/
1093
1094             r02              = _mm256_mul_ps(rsq02,rinv02);
1095             r02              = _mm256_andnot_ps(dummy_mask,r02);
1096
1097             /* Calculate table index by multiplying r with table scale and truncate to integer */
1098             rt               = _mm256_mul_ps(r02,vftabscale);
1099             vfitab           = _mm256_cvttps_epi32(rt);
1100             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1101             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1102             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1103             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1104             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1105             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1106
1107             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1108             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1109                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1110             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1111                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1112             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1113                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1114             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1115                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1116             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1117             Heps             = _mm256_mul_ps(vfeps,H);
1118             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1119             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1120             velec            = _mm256_mul_ps(qq02,VV);
1121             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1122             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1123
1124             /* Update potential sum for this i atom from the interaction with this j atom. */
1125             velec            = _mm256_andnot_ps(dummy_mask,velec);
1126             velecsum         = _mm256_add_ps(velecsum,velec);
1127
1128             fscal            = felec;
1129
1130             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1131
1132             /* Calculate temporary vectorial force */
1133             tx               = _mm256_mul_ps(fscal,dx02);
1134             ty               = _mm256_mul_ps(fscal,dy02);
1135             tz               = _mm256_mul_ps(fscal,dz02);
1136
1137             /* Update vectorial force */
1138             fix0             = _mm256_add_ps(fix0,tx);
1139             fiy0             = _mm256_add_ps(fiy0,ty);
1140             fiz0             = _mm256_add_ps(fiz0,tz);
1141
1142             fjx2             = _mm256_add_ps(fjx2,tx);
1143             fjy2             = _mm256_add_ps(fjy2,ty);
1144             fjz2             = _mm256_add_ps(fjz2,tz);
1145
1146             /**************************
1147              * CALCULATE INTERACTIONS *
1148              **************************/
1149
1150             r10              = _mm256_mul_ps(rsq10,rinv10);
1151             r10              = _mm256_andnot_ps(dummy_mask,r10);
1152
1153             /* Calculate table index by multiplying r with table scale and truncate to integer */
1154             rt               = _mm256_mul_ps(r10,vftabscale);
1155             vfitab           = _mm256_cvttps_epi32(rt);
1156             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1157             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1158             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1159             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1160             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1161             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1162
1163             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1164             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1165                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1166             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1167                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1168             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1169                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1170             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1171                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1172             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1173             Heps             = _mm256_mul_ps(vfeps,H);
1174             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1175             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1176             velec            = _mm256_mul_ps(qq10,VV);
1177             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1178             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1179
1180             /* Update potential sum for this i atom from the interaction with this j atom. */
1181             velec            = _mm256_andnot_ps(dummy_mask,velec);
1182             velecsum         = _mm256_add_ps(velecsum,velec);
1183
1184             fscal            = felec;
1185
1186             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1187
1188             /* Calculate temporary vectorial force */
1189             tx               = _mm256_mul_ps(fscal,dx10);
1190             ty               = _mm256_mul_ps(fscal,dy10);
1191             tz               = _mm256_mul_ps(fscal,dz10);
1192
1193             /* Update vectorial force */
1194             fix1             = _mm256_add_ps(fix1,tx);
1195             fiy1             = _mm256_add_ps(fiy1,ty);
1196             fiz1             = _mm256_add_ps(fiz1,tz);
1197
1198             fjx0             = _mm256_add_ps(fjx0,tx);
1199             fjy0             = _mm256_add_ps(fjy0,ty);
1200             fjz0             = _mm256_add_ps(fjz0,tz);
1201
1202             /**************************
1203              * CALCULATE INTERACTIONS *
1204              **************************/
1205
1206             r11              = _mm256_mul_ps(rsq11,rinv11);
1207             r11              = _mm256_andnot_ps(dummy_mask,r11);
1208
1209             /* Calculate table index by multiplying r with table scale and truncate to integer */
1210             rt               = _mm256_mul_ps(r11,vftabscale);
1211             vfitab           = _mm256_cvttps_epi32(rt);
1212             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1213             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1214             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1215             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1216             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1217             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1218
1219             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1220             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1221                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1222             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1223                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1224             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1225                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1226             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1227                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1228             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1229             Heps             = _mm256_mul_ps(vfeps,H);
1230             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1231             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1232             velec            = _mm256_mul_ps(qq11,VV);
1233             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1234             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1235
1236             /* Update potential sum for this i atom from the interaction with this j atom. */
1237             velec            = _mm256_andnot_ps(dummy_mask,velec);
1238             velecsum         = _mm256_add_ps(velecsum,velec);
1239
1240             fscal            = felec;
1241
1242             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1243
1244             /* Calculate temporary vectorial force */
1245             tx               = _mm256_mul_ps(fscal,dx11);
1246             ty               = _mm256_mul_ps(fscal,dy11);
1247             tz               = _mm256_mul_ps(fscal,dz11);
1248
1249             /* Update vectorial force */
1250             fix1             = _mm256_add_ps(fix1,tx);
1251             fiy1             = _mm256_add_ps(fiy1,ty);
1252             fiz1             = _mm256_add_ps(fiz1,tz);
1253
1254             fjx1             = _mm256_add_ps(fjx1,tx);
1255             fjy1             = _mm256_add_ps(fjy1,ty);
1256             fjz1             = _mm256_add_ps(fjz1,tz);
1257
1258             /**************************
1259              * CALCULATE INTERACTIONS *
1260              **************************/
1261
1262             r12              = _mm256_mul_ps(rsq12,rinv12);
1263             r12              = _mm256_andnot_ps(dummy_mask,r12);
1264
1265             /* Calculate table index by multiplying r with table scale and truncate to integer */
1266             rt               = _mm256_mul_ps(r12,vftabscale);
1267             vfitab           = _mm256_cvttps_epi32(rt);
1268             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1269             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1270             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1271             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1272             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1273             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1274
1275             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1276             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1277                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1278             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1279                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1280             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1281                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1282             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1283                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1284             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1285             Heps             = _mm256_mul_ps(vfeps,H);
1286             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1287             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1288             velec            = _mm256_mul_ps(qq12,VV);
1289             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1290             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1291
1292             /* Update potential sum for this i atom from the interaction with this j atom. */
1293             velec            = _mm256_andnot_ps(dummy_mask,velec);
1294             velecsum         = _mm256_add_ps(velecsum,velec);
1295
1296             fscal            = felec;
1297
1298             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1299
1300             /* Calculate temporary vectorial force */
1301             tx               = _mm256_mul_ps(fscal,dx12);
1302             ty               = _mm256_mul_ps(fscal,dy12);
1303             tz               = _mm256_mul_ps(fscal,dz12);
1304
1305             /* Update vectorial force */
1306             fix1             = _mm256_add_ps(fix1,tx);
1307             fiy1             = _mm256_add_ps(fiy1,ty);
1308             fiz1             = _mm256_add_ps(fiz1,tz);
1309
1310             fjx2             = _mm256_add_ps(fjx2,tx);
1311             fjy2             = _mm256_add_ps(fjy2,ty);
1312             fjz2             = _mm256_add_ps(fjz2,tz);
1313
1314             /**************************
1315              * CALCULATE INTERACTIONS *
1316              **************************/
1317
1318             r20              = _mm256_mul_ps(rsq20,rinv20);
1319             r20              = _mm256_andnot_ps(dummy_mask,r20);
1320
1321             /* Calculate table index by multiplying r with table scale and truncate to integer */
1322             rt               = _mm256_mul_ps(r20,vftabscale);
1323             vfitab           = _mm256_cvttps_epi32(rt);
1324             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1325             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1326             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1327             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1328             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1329             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1330
1331             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1332             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1333                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1334             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1335                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1336             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1337                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1338             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1339                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1340             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1341             Heps             = _mm256_mul_ps(vfeps,H);
1342             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1343             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1344             velec            = _mm256_mul_ps(qq20,VV);
1345             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1346             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1347
1348             /* Update potential sum for this i atom from the interaction with this j atom. */
1349             velec            = _mm256_andnot_ps(dummy_mask,velec);
1350             velecsum         = _mm256_add_ps(velecsum,velec);
1351
1352             fscal            = felec;
1353
1354             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1355
1356             /* Calculate temporary vectorial force */
1357             tx               = _mm256_mul_ps(fscal,dx20);
1358             ty               = _mm256_mul_ps(fscal,dy20);
1359             tz               = _mm256_mul_ps(fscal,dz20);
1360
1361             /* Update vectorial force */
1362             fix2             = _mm256_add_ps(fix2,tx);
1363             fiy2             = _mm256_add_ps(fiy2,ty);
1364             fiz2             = _mm256_add_ps(fiz2,tz);
1365
1366             fjx0             = _mm256_add_ps(fjx0,tx);
1367             fjy0             = _mm256_add_ps(fjy0,ty);
1368             fjz0             = _mm256_add_ps(fjz0,tz);
1369
1370             /**************************
1371              * CALCULATE INTERACTIONS *
1372              **************************/
1373
1374             r21              = _mm256_mul_ps(rsq21,rinv21);
1375             r21              = _mm256_andnot_ps(dummy_mask,r21);
1376
1377             /* Calculate table index by multiplying r with table scale and truncate to integer */
1378             rt               = _mm256_mul_ps(r21,vftabscale);
1379             vfitab           = _mm256_cvttps_epi32(rt);
1380             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1381             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1382             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1383             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1384             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1385             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1386
1387             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1388             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1389                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1390             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1391                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1392             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1393                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1394             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1395                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1396             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1397             Heps             = _mm256_mul_ps(vfeps,H);
1398             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1399             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1400             velec            = _mm256_mul_ps(qq21,VV);
1401             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1402             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1403
1404             /* Update potential sum for this i atom from the interaction with this j atom. */
1405             velec            = _mm256_andnot_ps(dummy_mask,velec);
1406             velecsum         = _mm256_add_ps(velecsum,velec);
1407
1408             fscal            = felec;
1409
1410             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1411
1412             /* Calculate temporary vectorial force */
1413             tx               = _mm256_mul_ps(fscal,dx21);
1414             ty               = _mm256_mul_ps(fscal,dy21);
1415             tz               = _mm256_mul_ps(fscal,dz21);
1416
1417             /* Update vectorial force */
1418             fix2             = _mm256_add_ps(fix2,tx);
1419             fiy2             = _mm256_add_ps(fiy2,ty);
1420             fiz2             = _mm256_add_ps(fiz2,tz);
1421
1422             fjx1             = _mm256_add_ps(fjx1,tx);
1423             fjy1             = _mm256_add_ps(fjy1,ty);
1424             fjz1             = _mm256_add_ps(fjz1,tz);
1425
1426             /**************************
1427              * CALCULATE INTERACTIONS *
1428              **************************/
1429
1430             r22              = _mm256_mul_ps(rsq22,rinv22);
1431             r22              = _mm256_andnot_ps(dummy_mask,r22);
1432
1433             /* Calculate table index by multiplying r with table scale and truncate to integer */
1434             rt               = _mm256_mul_ps(r22,vftabscale);
1435             vfitab           = _mm256_cvttps_epi32(rt);
1436             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1437             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1438             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1439             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1440             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1441             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1442
1443             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1444             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1445                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1446             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1447                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1448             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1449                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1450             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1451                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1452             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1453             Heps             = _mm256_mul_ps(vfeps,H);
1454             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1455             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1456             velec            = _mm256_mul_ps(qq22,VV);
1457             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1458             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1459
1460             /* Update potential sum for this i atom from the interaction with this j atom. */
1461             velec            = _mm256_andnot_ps(dummy_mask,velec);
1462             velecsum         = _mm256_add_ps(velecsum,velec);
1463
1464             fscal            = felec;
1465
1466             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1467
1468             /* Calculate temporary vectorial force */
1469             tx               = _mm256_mul_ps(fscal,dx22);
1470             ty               = _mm256_mul_ps(fscal,dy22);
1471             tz               = _mm256_mul_ps(fscal,dz22);
1472
1473             /* Update vectorial force */
1474             fix2             = _mm256_add_ps(fix2,tx);
1475             fiy2             = _mm256_add_ps(fiy2,ty);
1476             fiz2             = _mm256_add_ps(fiz2,tz);
1477
1478             fjx2             = _mm256_add_ps(fjx2,tx);
1479             fjy2             = _mm256_add_ps(fjy2,ty);
1480             fjz2             = _mm256_add_ps(fjz2,tz);
1481
1482             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1483             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1484             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1485             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1486             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1487             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1488             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1489             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1490
1491             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1492                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1493
1494             /* Inner loop uses 426 flops */
1495         }
1496
1497         /* End of innermost loop */
1498
1499         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1500                                                  f+i_coord_offset,fshift+i_shift_offset);
1501
1502         ggid                        = gid[iidx];
1503         /* Update potential energies */
1504         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1505         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1506
1507         /* Increment number of inner iterations */
1508         inneriter                  += j_index_end - j_index_start;
1509
1510         /* Outer loop uses 20 flops */
1511     }
1512
1513     /* Increment number of outer iterations */
1514     outeriter        += nri;
1515
1516     /* Update outer/inner flops */
1517
1518     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*426);
1519 }
1520 /*
1521  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_single
1522  * Electrostatics interaction: CubicSplineTable
1523  * VdW interaction:            CubicSplineTable
1524  * Geometry:                   Water3-Water3
1525  * Calculate force/pot:        Force
1526  */
1527 void
1528 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_single
1529                     (t_nblist                    * gmx_restrict       nlist,
1530                      rvec                        * gmx_restrict          xx,
1531                      rvec                        * gmx_restrict          ff,
1532                      t_forcerec                  * gmx_restrict          fr,
1533                      t_mdatoms                   * gmx_restrict     mdatoms,
1534                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1535                      t_nrnb                      * gmx_restrict        nrnb)
1536 {
1537     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1538      * just 0 for non-waters.
1539      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1540      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1541      */
1542     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1543     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1544     int              jnrA,jnrB,jnrC,jnrD;
1545     int              jnrE,jnrF,jnrG,jnrH;
1546     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1547     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1548     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1549     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1550     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1551     real             rcutoff_scalar;
1552     real             *shiftvec,*fshift,*x,*f;
1553     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1554     real             scratch[4*DIM];
1555     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1556     real *           vdwioffsetptr0;
1557     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1558     real *           vdwioffsetptr1;
1559     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1560     real *           vdwioffsetptr2;
1561     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1562     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1563     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1564     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1565     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1566     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1567     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1568     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1569     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1570     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1571     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1572     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1573     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1574     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1575     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1576     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1577     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1578     real             *charge;
1579     int              nvdwtype;
1580     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1581     int              *vdwtype;
1582     real             *vdwparam;
1583     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1584     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1585     __m256i          vfitab;
1586     __m128i          vfitab_lo,vfitab_hi;
1587     __m128i          ifour       = _mm_set1_epi32(4);
1588     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1589     real             *vftab;
1590     __m256           dummy_mask,cutoff_mask;
1591     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1592     __m256           one     = _mm256_set1_ps(1.0);
1593     __m256           two     = _mm256_set1_ps(2.0);
1594     x                = xx[0];
1595     f                = ff[0];
1596
1597     nri              = nlist->nri;
1598     iinr             = nlist->iinr;
1599     jindex           = nlist->jindex;
1600     jjnr             = nlist->jjnr;
1601     shiftidx         = nlist->shift;
1602     gid              = nlist->gid;
1603     shiftvec         = fr->shift_vec[0];
1604     fshift           = fr->fshift[0];
1605     facel            = _mm256_set1_ps(fr->epsfac);
1606     charge           = mdatoms->chargeA;
1607     nvdwtype         = fr->ntype;
1608     vdwparam         = fr->nbfp;
1609     vdwtype          = mdatoms->typeA;
1610
1611     vftab            = kernel_data->table_elec_vdw->data;
1612     vftabscale       = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
1613
1614     /* Setup water-specific parameters */
1615     inr              = nlist->iinr[0];
1616     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1617     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1618     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1619     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1620
1621     jq0              = _mm256_set1_ps(charge[inr+0]);
1622     jq1              = _mm256_set1_ps(charge[inr+1]);
1623     jq2              = _mm256_set1_ps(charge[inr+2]);
1624     vdwjidx0A        = 2*vdwtype[inr+0];
1625     qq00             = _mm256_mul_ps(iq0,jq0);
1626     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1627     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1628     qq01             = _mm256_mul_ps(iq0,jq1);
1629     qq02             = _mm256_mul_ps(iq0,jq2);
1630     qq10             = _mm256_mul_ps(iq1,jq0);
1631     qq11             = _mm256_mul_ps(iq1,jq1);
1632     qq12             = _mm256_mul_ps(iq1,jq2);
1633     qq20             = _mm256_mul_ps(iq2,jq0);
1634     qq21             = _mm256_mul_ps(iq2,jq1);
1635     qq22             = _mm256_mul_ps(iq2,jq2);
1636
1637     /* Avoid stupid compiler warnings */
1638     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1639     j_coord_offsetA = 0;
1640     j_coord_offsetB = 0;
1641     j_coord_offsetC = 0;
1642     j_coord_offsetD = 0;
1643     j_coord_offsetE = 0;
1644     j_coord_offsetF = 0;
1645     j_coord_offsetG = 0;
1646     j_coord_offsetH = 0;
1647
1648     outeriter        = 0;
1649     inneriter        = 0;
1650
1651     for(iidx=0;iidx<4*DIM;iidx++)
1652     {
1653         scratch[iidx] = 0.0;
1654     }
1655
1656     /* Start outer loop over neighborlists */
1657     for(iidx=0; iidx<nri; iidx++)
1658     {
1659         /* Load shift vector for this list */
1660         i_shift_offset   = DIM*shiftidx[iidx];
1661
1662         /* Load limits for loop over neighbors */
1663         j_index_start    = jindex[iidx];
1664         j_index_end      = jindex[iidx+1];
1665
1666         /* Get outer coordinate index */
1667         inr              = iinr[iidx];
1668         i_coord_offset   = DIM*inr;
1669
1670         /* Load i particle coords and add shift vector */
1671         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1672                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1673
1674         fix0             = _mm256_setzero_ps();
1675         fiy0             = _mm256_setzero_ps();
1676         fiz0             = _mm256_setzero_ps();
1677         fix1             = _mm256_setzero_ps();
1678         fiy1             = _mm256_setzero_ps();
1679         fiz1             = _mm256_setzero_ps();
1680         fix2             = _mm256_setzero_ps();
1681         fiy2             = _mm256_setzero_ps();
1682         fiz2             = _mm256_setzero_ps();
1683
1684         /* Start inner kernel loop */
1685         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1686         {
1687
1688             /* Get j neighbor index, and coordinate index */
1689             jnrA             = jjnr[jidx];
1690             jnrB             = jjnr[jidx+1];
1691             jnrC             = jjnr[jidx+2];
1692             jnrD             = jjnr[jidx+3];
1693             jnrE             = jjnr[jidx+4];
1694             jnrF             = jjnr[jidx+5];
1695             jnrG             = jjnr[jidx+6];
1696             jnrH             = jjnr[jidx+7];
1697             j_coord_offsetA  = DIM*jnrA;
1698             j_coord_offsetB  = DIM*jnrB;
1699             j_coord_offsetC  = DIM*jnrC;
1700             j_coord_offsetD  = DIM*jnrD;
1701             j_coord_offsetE  = DIM*jnrE;
1702             j_coord_offsetF  = DIM*jnrF;
1703             j_coord_offsetG  = DIM*jnrG;
1704             j_coord_offsetH  = DIM*jnrH;
1705
1706             /* load j atom coordinates */
1707             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1708                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1709                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1710                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1711                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1712
1713             /* Calculate displacement vector */
1714             dx00             = _mm256_sub_ps(ix0,jx0);
1715             dy00             = _mm256_sub_ps(iy0,jy0);
1716             dz00             = _mm256_sub_ps(iz0,jz0);
1717             dx01             = _mm256_sub_ps(ix0,jx1);
1718             dy01             = _mm256_sub_ps(iy0,jy1);
1719             dz01             = _mm256_sub_ps(iz0,jz1);
1720             dx02             = _mm256_sub_ps(ix0,jx2);
1721             dy02             = _mm256_sub_ps(iy0,jy2);
1722             dz02             = _mm256_sub_ps(iz0,jz2);
1723             dx10             = _mm256_sub_ps(ix1,jx0);
1724             dy10             = _mm256_sub_ps(iy1,jy0);
1725             dz10             = _mm256_sub_ps(iz1,jz0);
1726             dx11             = _mm256_sub_ps(ix1,jx1);
1727             dy11             = _mm256_sub_ps(iy1,jy1);
1728             dz11             = _mm256_sub_ps(iz1,jz1);
1729             dx12             = _mm256_sub_ps(ix1,jx2);
1730             dy12             = _mm256_sub_ps(iy1,jy2);
1731             dz12             = _mm256_sub_ps(iz1,jz2);
1732             dx20             = _mm256_sub_ps(ix2,jx0);
1733             dy20             = _mm256_sub_ps(iy2,jy0);
1734             dz20             = _mm256_sub_ps(iz2,jz0);
1735             dx21             = _mm256_sub_ps(ix2,jx1);
1736             dy21             = _mm256_sub_ps(iy2,jy1);
1737             dz21             = _mm256_sub_ps(iz2,jz1);
1738             dx22             = _mm256_sub_ps(ix2,jx2);
1739             dy22             = _mm256_sub_ps(iy2,jy2);
1740             dz22             = _mm256_sub_ps(iz2,jz2);
1741
1742             /* Calculate squared distance and things based on it */
1743             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1744             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1745             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1746             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1747             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1748             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1749             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1750             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1751             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1752
1753             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1754             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1755             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1756             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1757             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1758             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1759             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1760             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1761             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1762
1763             fjx0             = _mm256_setzero_ps();
1764             fjy0             = _mm256_setzero_ps();
1765             fjz0             = _mm256_setzero_ps();
1766             fjx1             = _mm256_setzero_ps();
1767             fjy1             = _mm256_setzero_ps();
1768             fjz1             = _mm256_setzero_ps();
1769             fjx2             = _mm256_setzero_ps();
1770             fjy2             = _mm256_setzero_ps();
1771             fjz2             = _mm256_setzero_ps();
1772
1773             /**************************
1774              * CALCULATE INTERACTIONS *
1775              **************************/
1776
1777             r00              = _mm256_mul_ps(rsq00,rinv00);
1778
1779             /* Calculate table index by multiplying r with table scale and truncate to integer */
1780             rt               = _mm256_mul_ps(r00,vftabscale);
1781             vfitab           = _mm256_cvttps_epi32(rt);
1782             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1783             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1784             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1785             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1786             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1787             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1788
1789             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1790             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1791                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1792             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1793                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1794             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1795                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1796             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1797                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1798             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1799             Heps             = _mm256_mul_ps(vfeps,H);
1800             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1801             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1802             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
1803
1804             /* CUBIC SPLINE TABLE DISPERSION */
1805             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1806             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1807             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1808                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1809             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1810                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1811             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1812                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1813             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1814                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1815             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1816             Heps             = _mm256_mul_ps(vfeps,H);
1817             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1818             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1819             fvdw6            = _mm256_mul_ps(c6_00,FF);
1820
1821             /* CUBIC SPLINE TABLE REPULSION */
1822             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1823             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1824             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1825                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1826             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1827                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1828             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1829                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1830             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1831                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1832             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1833             Heps             = _mm256_mul_ps(vfeps,H);
1834             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1835             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1836             fvdw12           = _mm256_mul_ps(c12_00,FF);
1837             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1838
1839             fscal            = _mm256_add_ps(felec,fvdw);
1840
1841             /* Calculate temporary vectorial force */
1842             tx               = _mm256_mul_ps(fscal,dx00);
1843             ty               = _mm256_mul_ps(fscal,dy00);
1844             tz               = _mm256_mul_ps(fscal,dz00);
1845
1846             /* Update vectorial force */
1847             fix0             = _mm256_add_ps(fix0,tx);
1848             fiy0             = _mm256_add_ps(fiy0,ty);
1849             fiz0             = _mm256_add_ps(fiz0,tz);
1850
1851             fjx0             = _mm256_add_ps(fjx0,tx);
1852             fjy0             = _mm256_add_ps(fjy0,ty);
1853             fjz0             = _mm256_add_ps(fjz0,tz);
1854
1855             /**************************
1856              * CALCULATE INTERACTIONS *
1857              **************************/
1858
1859             r01              = _mm256_mul_ps(rsq01,rinv01);
1860
1861             /* Calculate table index by multiplying r with table scale and truncate to integer */
1862             rt               = _mm256_mul_ps(r01,vftabscale);
1863             vfitab           = _mm256_cvttps_epi32(rt);
1864             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1865             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1866             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1867             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1868             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1869             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1870
1871             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1872             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1873                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1874             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1875                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1876             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1877                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1878             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1879                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1880             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1881             Heps             = _mm256_mul_ps(vfeps,H);
1882             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1883             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1884             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1885
1886             fscal            = felec;
1887
1888             /* Calculate temporary vectorial force */
1889             tx               = _mm256_mul_ps(fscal,dx01);
1890             ty               = _mm256_mul_ps(fscal,dy01);
1891             tz               = _mm256_mul_ps(fscal,dz01);
1892
1893             /* Update vectorial force */
1894             fix0             = _mm256_add_ps(fix0,tx);
1895             fiy0             = _mm256_add_ps(fiy0,ty);
1896             fiz0             = _mm256_add_ps(fiz0,tz);
1897
1898             fjx1             = _mm256_add_ps(fjx1,tx);
1899             fjy1             = _mm256_add_ps(fjy1,ty);
1900             fjz1             = _mm256_add_ps(fjz1,tz);
1901
1902             /**************************
1903              * CALCULATE INTERACTIONS *
1904              **************************/
1905
1906             r02              = _mm256_mul_ps(rsq02,rinv02);
1907
1908             /* Calculate table index by multiplying r with table scale and truncate to integer */
1909             rt               = _mm256_mul_ps(r02,vftabscale);
1910             vfitab           = _mm256_cvttps_epi32(rt);
1911             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1912             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1913             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1914             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1915             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1916             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1917
1918             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1919             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1920                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1921             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1922                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1923             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1924                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1925             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1926                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1927             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1928             Heps             = _mm256_mul_ps(vfeps,H);
1929             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1930             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1931             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1932
1933             fscal            = felec;
1934
1935             /* Calculate temporary vectorial force */
1936             tx               = _mm256_mul_ps(fscal,dx02);
1937             ty               = _mm256_mul_ps(fscal,dy02);
1938             tz               = _mm256_mul_ps(fscal,dz02);
1939
1940             /* Update vectorial force */
1941             fix0             = _mm256_add_ps(fix0,tx);
1942             fiy0             = _mm256_add_ps(fiy0,ty);
1943             fiz0             = _mm256_add_ps(fiz0,tz);
1944
1945             fjx2             = _mm256_add_ps(fjx2,tx);
1946             fjy2             = _mm256_add_ps(fjy2,ty);
1947             fjz2             = _mm256_add_ps(fjz2,tz);
1948
1949             /**************************
1950              * CALCULATE INTERACTIONS *
1951              **************************/
1952
1953             r10              = _mm256_mul_ps(rsq10,rinv10);
1954
1955             /* Calculate table index by multiplying r with table scale and truncate to integer */
1956             rt               = _mm256_mul_ps(r10,vftabscale);
1957             vfitab           = _mm256_cvttps_epi32(rt);
1958             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1959             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1960             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1961             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1962             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1963             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1964
1965             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1966             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1967                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1968             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1969                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1970             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1971                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1972             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1973                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1974             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1975             Heps             = _mm256_mul_ps(vfeps,H);
1976             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1977             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1978             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1979
1980             fscal            = felec;
1981
1982             /* Calculate temporary vectorial force */
1983             tx               = _mm256_mul_ps(fscal,dx10);
1984             ty               = _mm256_mul_ps(fscal,dy10);
1985             tz               = _mm256_mul_ps(fscal,dz10);
1986
1987             /* Update vectorial force */
1988             fix1             = _mm256_add_ps(fix1,tx);
1989             fiy1             = _mm256_add_ps(fiy1,ty);
1990             fiz1             = _mm256_add_ps(fiz1,tz);
1991
1992             fjx0             = _mm256_add_ps(fjx0,tx);
1993             fjy0             = _mm256_add_ps(fjy0,ty);
1994             fjz0             = _mm256_add_ps(fjz0,tz);
1995
1996             /**************************
1997              * CALCULATE INTERACTIONS *
1998              **************************/
1999
2000             r11              = _mm256_mul_ps(rsq11,rinv11);
2001
2002             /* Calculate table index by multiplying r with table scale and truncate to integer */
2003             rt               = _mm256_mul_ps(r11,vftabscale);
2004             vfitab           = _mm256_cvttps_epi32(rt);
2005             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2006             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2007             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2008             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2009             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2010             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2011
2012             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2013             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2014                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2015             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2016                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2017             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2018                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2019             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2020                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2021             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2022             Heps             = _mm256_mul_ps(vfeps,H);
2023             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2024             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2025             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2026
2027             fscal            = felec;
2028
2029             /* Calculate temporary vectorial force */
2030             tx               = _mm256_mul_ps(fscal,dx11);
2031             ty               = _mm256_mul_ps(fscal,dy11);
2032             tz               = _mm256_mul_ps(fscal,dz11);
2033
2034             /* Update vectorial force */
2035             fix1             = _mm256_add_ps(fix1,tx);
2036             fiy1             = _mm256_add_ps(fiy1,ty);
2037             fiz1             = _mm256_add_ps(fiz1,tz);
2038
2039             fjx1             = _mm256_add_ps(fjx1,tx);
2040             fjy1             = _mm256_add_ps(fjy1,ty);
2041             fjz1             = _mm256_add_ps(fjz1,tz);
2042
2043             /**************************
2044              * CALCULATE INTERACTIONS *
2045              **************************/
2046
2047             r12              = _mm256_mul_ps(rsq12,rinv12);
2048
2049             /* Calculate table index by multiplying r with table scale and truncate to integer */
2050             rt               = _mm256_mul_ps(r12,vftabscale);
2051             vfitab           = _mm256_cvttps_epi32(rt);
2052             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2053             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2054             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2055             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2056             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2057             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2058
2059             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2060             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2061                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2062             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2063                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2064             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2065                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2066             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2067                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2068             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2069             Heps             = _mm256_mul_ps(vfeps,H);
2070             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2071             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2072             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2073
2074             fscal            = felec;
2075
2076             /* Calculate temporary vectorial force */
2077             tx               = _mm256_mul_ps(fscal,dx12);
2078             ty               = _mm256_mul_ps(fscal,dy12);
2079             tz               = _mm256_mul_ps(fscal,dz12);
2080
2081             /* Update vectorial force */
2082             fix1             = _mm256_add_ps(fix1,tx);
2083             fiy1             = _mm256_add_ps(fiy1,ty);
2084             fiz1             = _mm256_add_ps(fiz1,tz);
2085
2086             fjx2             = _mm256_add_ps(fjx2,tx);
2087             fjy2             = _mm256_add_ps(fjy2,ty);
2088             fjz2             = _mm256_add_ps(fjz2,tz);
2089
2090             /**************************
2091              * CALCULATE INTERACTIONS *
2092              **************************/
2093
2094             r20              = _mm256_mul_ps(rsq20,rinv20);
2095
2096             /* Calculate table index by multiplying r with table scale and truncate to integer */
2097             rt               = _mm256_mul_ps(r20,vftabscale);
2098             vfitab           = _mm256_cvttps_epi32(rt);
2099             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2100             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2101             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2102             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2103             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2104             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2105
2106             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2107             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2108                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2109             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2110                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2111             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2112                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2113             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2114                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2115             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2116             Heps             = _mm256_mul_ps(vfeps,H);
2117             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2118             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2119             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2120
2121             fscal            = felec;
2122
2123             /* Calculate temporary vectorial force */
2124             tx               = _mm256_mul_ps(fscal,dx20);
2125             ty               = _mm256_mul_ps(fscal,dy20);
2126             tz               = _mm256_mul_ps(fscal,dz20);
2127
2128             /* Update vectorial force */
2129             fix2             = _mm256_add_ps(fix2,tx);
2130             fiy2             = _mm256_add_ps(fiy2,ty);
2131             fiz2             = _mm256_add_ps(fiz2,tz);
2132
2133             fjx0             = _mm256_add_ps(fjx0,tx);
2134             fjy0             = _mm256_add_ps(fjy0,ty);
2135             fjz0             = _mm256_add_ps(fjz0,tz);
2136
2137             /**************************
2138              * CALCULATE INTERACTIONS *
2139              **************************/
2140
2141             r21              = _mm256_mul_ps(rsq21,rinv21);
2142
2143             /* Calculate table index by multiplying r with table scale and truncate to integer */
2144             rt               = _mm256_mul_ps(r21,vftabscale);
2145             vfitab           = _mm256_cvttps_epi32(rt);
2146             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2147             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2148             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2149             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2150             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2151             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2152
2153             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2154             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2155                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2156             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2157                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2158             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2159                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2160             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2161                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2162             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2163             Heps             = _mm256_mul_ps(vfeps,H);
2164             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2165             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2166             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2167
2168             fscal            = felec;
2169
2170             /* Calculate temporary vectorial force */
2171             tx               = _mm256_mul_ps(fscal,dx21);
2172             ty               = _mm256_mul_ps(fscal,dy21);
2173             tz               = _mm256_mul_ps(fscal,dz21);
2174
2175             /* Update vectorial force */
2176             fix2             = _mm256_add_ps(fix2,tx);
2177             fiy2             = _mm256_add_ps(fiy2,ty);
2178             fiz2             = _mm256_add_ps(fiz2,tz);
2179
2180             fjx1             = _mm256_add_ps(fjx1,tx);
2181             fjy1             = _mm256_add_ps(fjy1,ty);
2182             fjz1             = _mm256_add_ps(fjz1,tz);
2183
2184             /**************************
2185              * CALCULATE INTERACTIONS *
2186              **************************/
2187
2188             r22              = _mm256_mul_ps(rsq22,rinv22);
2189
2190             /* Calculate table index by multiplying r with table scale and truncate to integer */
2191             rt               = _mm256_mul_ps(r22,vftabscale);
2192             vfitab           = _mm256_cvttps_epi32(rt);
2193             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2194             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2195             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2196             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2197             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2198             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2199
2200             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2201             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2202                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2203             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2204                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2205             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2206                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2207             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2208                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2209             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2210             Heps             = _mm256_mul_ps(vfeps,H);
2211             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2212             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2213             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2214
2215             fscal            = felec;
2216
2217             /* Calculate temporary vectorial force */
2218             tx               = _mm256_mul_ps(fscal,dx22);
2219             ty               = _mm256_mul_ps(fscal,dy22);
2220             tz               = _mm256_mul_ps(fscal,dz22);
2221
2222             /* Update vectorial force */
2223             fix2             = _mm256_add_ps(fix2,tx);
2224             fiy2             = _mm256_add_ps(fiy2,ty);
2225             fiz2             = _mm256_add_ps(fiz2,tz);
2226
2227             fjx2             = _mm256_add_ps(fjx2,tx);
2228             fjy2             = _mm256_add_ps(fjy2,ty);
2229             fjz2             = _mm256_add_ps(fjz2,tz);
2230
2231             fjptrA             = f+j_coord_offsetA;
2232             fjptrB             = f+j_coord_offsetB;
2233             fjptrC             = f+j_coord_offsetC;
2234             fjptrD             = f+j_coord_offsetD;
2235             fjptrE             = f+j_coord_offsetE;
2236             fjptrF             = f+j_coord_offsetF;
2237             fjptrG             = f+j_coord_offsetG;
2238             fjptrH             = f+j_coord_offsetH;
2239
2240             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2241                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2242
2243             /* Inner loop uses 373 flops */
2244         }
2245
2246         if(jidx<j_index_end)
2247         {
2248
2249             /* Get j neighbor index, and coordinate index */
2250             jnrlistA         = jjnr[jidx];
2251             jnrlistB         = jjnr[jidx+1];
2252             jnrlistC         = jjnr[jidx+2];
2253             jnrlistD         = jjnr[jidx+3];
2254             jnrlistE         = jjnr[jidx+4];
2255             jnrlistF         = jjnr[jidx+5];
2256             jnrlistG         = jjnr[jidx+6];
2257             jnrlistH         = jjnr[jidx+7];
2258             /* Entries of jjnr are negative for padding (non-real) atoms in the last batch.
2259              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2260              * so use it as val = _mm256_andnot_ps(dummy_mask,val) to clear dummy entries.
2261              */
2262             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2263                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2264                                             
2265             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2266             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2267             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2268             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2269             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
2270             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
2271             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
2272             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
2273             j_coord_offsetA  = DIM*jnrA;
2274             j_coord_offsetB  = DIM*jnrB;
2275             j_coord_offsetC  = DIM*jnrC;
2276             j_coord_offsetD  = DIM*jnrD;
2277             j_coord_offsetE  = DIM*jnrE;
2278             j_coord_offsetF  = DIM*jnrF;
2279             j_coord_offsetG  = DIM*jnrG;
2280             j_coord_offsetH  = DIM*jnrH;
2281
2282             /* load j atom coordinates */
2283             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2284                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2285                                                  x+j_coord_offsetE,x+j_coord_offsetF,
2286                                                  x+j_coord_offsetG,x+j_coord_offsetH,
2287                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2288
2289             /* Calculate displacement vector */
2290             dx00             = _mm256_sub_ps(ix0,jx0);
2291             dy00             = _mm256_sub_ps(iy0,jy0);
2292             dz00             = _mm256_sub_ps(iz0,jz0);
2293             dx01             = _mm256_sub_ps(ix0,jx1);
2294             dy01             = _mm256_sub_ps(iy0,jy1);
2295             dz01             = _mm256_sub_ps(iz0,jz1);
2296             dx02             = _mm256_sub_ps(ix0,jx2);
2297             dy02             = _mm256_sub_ps(iy0,jy2);
2298             dz02             = _mm256_sub_ps(iz0,jz2);
2299             dx10             = _mm256_sub_ps(ix1,jx0);
2300             dy10             = _mm256_sub_ps(iy1,jy0);
2301             dz10             = _mm256_sub_ps(iz1,jz0);
2302             dx11             = _mm256_sub_ps(ix1,jx1);
2303             dy11             = _mm256_sub_ps(iy1,jy1);
2304             dz11             = _mm256_sub_ps(iz1,jz1);
2305             dx12             = _mm256_sub_ps(ix1,jx2);
2306             dy12             = _mm256_sub_ps(iy1,jy2);
2307             dz12             = _mm256_sub_ps(iz1,jz2);
2308             dx20             = _mm256_sub_ps(ix2,jx0);
2309             dy20             = _mm256_sub_ps(iy2,jy0);
2310             dz20             = _mm256_sub_ps(iz2,jz0);
2311             dx21             = _mm256_sub_ps(ix2,jx1);
2312             dy21             = _mm256_sub_ps(iy2,jy1);
2313             dz21             = _mm256_sub_ps(iz2,jz1);
2314             dx22             = _mm256_sub_ps(ix2,jx2);
2315             dy22             = _mm256_sub_ps(iy2,jy2);
2316             dz22             = _mm256_sub_ps(iz2,jz2);
2317
2318             /* Calculate squared distance and things based on it */
2319             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2320             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2321             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2322             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2323             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2324             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2325             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2326             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2327             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2328
2329             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
2330             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
2331             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
2332             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
2333             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
2334             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
2335             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
2336             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
2337             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
2338
2339             fjx0             = _mm256_setzero_ps();
2340             fjy0             = _mm256_setzero_ps();
2341             fjz0             = _mm256_setzero_ps();
2342             fjx1             = _mm256_setzero_ps();
2343             fjy1             = _mm256_setzero_ps();
2344             fjz1             = _mm256_setzero_ps();
2345             fjx2             = _mm256_setzero_ps();
2346             fjy2             = _mm256_setzero_ps();
2347             fjz2             = _mm256_setzero_ps();
2348
2349             /**************************
2350              * CALCULATE INTERACTIONS *
2351              **************************/
2352
2353             r00              = _mm256_mul_ps(rsq00,rinv00);
2354             r00              = _mm256_andnot_ps(dummy_mask,r00);
2355
2356             /* Calculate table index by multiplying r with table scale and truncate to integer */
2357             rt               = _mm256_mul_ps(r00,vftabscale);
2358             vfitab           = _mm256_cvttps_epi32(rt);
2359             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2360             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2361             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2362             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2363             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2364             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2365
2366             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2367             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2368                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2369             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2370                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2371             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2372                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2373             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2374                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2375             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2376             Heps             = _mm256_mul_ps(vfeps,H);
2377             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2378             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2379             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
2380
2381             /* CUBIC SPLINE TABLE DISPERSION */
2382             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
2383             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
2384             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2385                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2386             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2387                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2388             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2389                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2390             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2391                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2392             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2393             Heps             = _mm256_mul_ps(vfeps,H);
2394             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2395             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2396             fvdw6            = _mm256_mul_ps(c6_00,FF);
2397
2398             /* CUBIC SPLINE TABLE REPULSION */
2399             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
2400             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
2401             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2402                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2403             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2404                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2405             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2406                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2407             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2408                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2409             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2410             Heps             = _mm256_mul_ps(vfeps,H);
2411             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2412             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2413             fvdw12           = _mm256_mul_ps(c12_00,FF);
2414             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
2415
2416             fscal            = _mm256_add_ps(felec,fvdw);
2417
2418             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2419
2420             /* Calculate temporary vectorial force */
2421             tx               = _mm256_mul_ps(fscal,dx00);
2422             ty               = _mm256_mul_ps(fscal,dy00);
2423             tz               = _mm256_mul_ps(fscal,dz00);
2424
2425             /* Update vectorial force */
2426             fix0             = _mm256_add_ps(fix0,tx);
2427             fiy0             = _mm256_add_ps(fiy0,ty);
2428             fiz0             = _mm256_add_ps(fiz0,tz);
2429
2430             fjx0             = _mm256_add_ps(fjx0,tx);
2431             fjy0             = _mm256_add_ps(fjy0,ty);
2432             fjz0             = _mm256_add_ps(fjz0,tz);
2433
2434             /**************************
2435              * CALCULATE INTERACTIONS *
2436              **************************/
2437
2438             r01              = _mm256_mul_ps(rsq01,rinv01);
2439             r01              = _mm256_andnot_ps(dummy_mask,r01);
2440
2441             /* Calculate table index by multiplying r with table scale and truncate to integer */
2442             rt               = _mm256_mul_ps(r01,vftabscale);
2443             vfitab           = _mm256_cvttps_epi32(rt);
2444             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2445             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2446             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2447             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2448             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2449             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2450
2451             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2452             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2453                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2454             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2455                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2456             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2457                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2458             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2459                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2460             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2461             Heps             = _mm256_mul_ps(vfeps,H);
2462             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2463             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2464             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
2465
2466             fscal            = felec;
2467
2468             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2469
2470             /* Calculate temporary vectorial force */
2471             tx               = _mm256_mul_ps(fscal,dx01);
2472             ty               = _mm256_mul_ps(fscal,dy01);
2473             tz               = _mm256_mul_ps(fscal,dz01);
2474
2475             /* Update vectorial force */
2476             fix0             = _mm256_add_ps(fix0,tx);
2477             fiy0             = _mm256_add_ps(fiy0,ty);
2478             fiz0             = _mm256_add_ps(fiz0,tz);
2479
2480             fjx1             = _mm256_add_ps(fjx1,tx);
2481             fjy1             = _mm256_add_ps(fjy1,ty);
2482             fjz1             = _mm256_add_ps(fjz1,tz);
2483
2484             /**************************
2485              * CALCULATE INTERACTIONS *
2486              **************************/
2487
2488             r02              = _mm256_mul_ps(rsq02,rinv02);
2489             r02              = _mm256_andnot_ps(dummy_mask,r02);
2490
2491             /* Calculate table index by multiplying r with table scale and truncate to integer */
2492             rt               = _mm256_mul_ps(r02,vftabscale);
2493             vfitab           = _mm256_cvttps_epi32(rt);
2494             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2495             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2496             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2497             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2498             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2499             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2500
2501             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2502             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2503                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2504             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2505                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2506             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2507                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2508             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2509                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2510             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2511             Heps             = _mm256_mul_ps(vfeps,H);
2512             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2513             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2514             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
2515
2516             fscal            = felec;
2517
2518             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2519
2520             /* Calculate temporary vectorial force */
2521             tx               = _mm256_mul_ps(fscal,dx02);
2522             ty               = _mm256_mul_ps(fscal,dy02);
2523             tz               = _mm256_mul_ps(fscal,dz02);
2524
2525             /* Update vectorial force */
2526             fix0             = _mm256_add_ps(fix0,tx);
2527             fiy0             = _mm256_add_ps(fiy0,ty);
2528             fiz0             = _mm256_add_ps(fiz0,tz);
2529
2530             fjx2             = _mm256_add_ps(fjx2,tx);
2531             fjy2             = _mm256_add_ps(fjy2,ty);
2532             fjz2             = _mm256_add_ps(fjz2,tz);
2533
2534             /**************************
2535              * CALCULATE INTERACTIONS *
2536              **************************/
2537
2538             r10              = _mm256_mul_ps(rsq10,rinv10);
2539             r10              = _mm256_andnot_ps(dummy_mask,r10);
2540
2541             /* Calculate table index by multiplying r with table scale and truncate to integer */
2542             rt               = _mm256_mul_ps(r10,vftabscale);
2543             vfitab           = _mm256_cvttps_epi32(rt);
2544             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2545             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2546             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2547             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2548             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2549             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2550
2551             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2552             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2553                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2554             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2555                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2556             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2557                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2558             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2559                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2560             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2561             Heps             = _mm256_mul_ps(vfeps,H);
2562             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2563             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2564             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
2565
2566             fscal            = felec;
2567
2568             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2569
2570             /* Calculate temporary vectorial force */
2571             tx               = _mm256_mul_ps(fscal,dx10);
2572             ty               = _mm256_mul_ps(fscal,dy10);
2573             tz               = _mm256_mul_ps(fscal,dz10);
2574
2575             /* Update vectorial force */
2576             fix1             = _mm256_add_ps(fix1,tx);
2577             fiy1             = _mm256_add_ps(fiy1,ty);
2578             fiz1             = _mm256_add_ps(fiz1,tz);
2579
2580             fjx0             = _mm256_add_ps(fjx0,tx);
2581             fjy0             = _mm256_add_ps(fjy0,ty);
2582             fjz0             = _mm256_add_ps(fjz0,tz);
2583
2584             /**************************
2585              * CALCULATE INTERACTIONS *
2586              **************************/
2587
2588             r11              = _mm256_mul_ps(rsq11,rinv11);
2589             r11              = _mm256_andnot_ps(dummy_mask,r11);
2590
2591             /* Calculate table index by multiplying r with table scale and truncate to integer */
2592             rt               = _mm256_mul_ps(r11,vftabscale);
2593             vfitab           = _mm256_cvttps_epi32(rt);
2594             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2595             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2596             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2597             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2598             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2599             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2600
2601             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2602             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2603                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2604             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2605                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2606             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2607                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2608             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2609                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2610             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2611             Heps             = _mm256_mul_ps(vfeps,H);
2612             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2613             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2614             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2615
2616             fscal            = felec;
2617
2618             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2619
2620             /* Calculate temporary vectorial force */
2621             tx               = _mm256_mul_ps(fscal,dx11);
2622             ty               = _mm256_mul_ps(fscal,dy11);
2623             tz               = _mm256_mul_ps(fscal,dz11);
2624
2625             /* Update vectorial force */
2626             fix1             = _mm256_add_ps(fix1,tx);
2627             fiy1             = _mm256_add_ps(fiy1,ty);
2628             fiz1             = _mm256_add_ps(fiz1,tz);
2629
2630             fjx1             = _mm256_add_ps(fjx1,tx);
2631             fjy1             = _mm256_add_ps(fjy1,ty);
2632             fjz1             = _mm256_add_ps(fjz1,tz);
2633
2634             /**************************
2635              * CALCULATE INTERACTIONS *
2636              **************************/
2637
2638             r12              = _mm256_mul_ps(rsq12,rinv12);
2639             r12              = _mm256_andnot_ps(dummy_mask,r12);
2640
2641             /* Calculate table index by multiplying r with table scale and truncate to integer */
2642             rt               = _mm256_mul_ps(r12,vftabscale);
2643             vfitab           = _mm256_cvttps_epi32(rt);
2644             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2645             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2646             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2647             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2648             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2649             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2650
2651             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2652             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2653                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2654             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2655                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2656             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2657                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2658             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2659                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2660             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2661             Heps             = _mm256_mul_ps(vfeps,H);
2662             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2663             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2664             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2665
2666             fscal            = felec;
2667
2668             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2669
2670             /* Calculate temporary vectorial force */
2671             tx               = _mm256_mul_ps(fscal,dx12);
2672             ty               = _mm256_mul_ps(fscal,dy12);
2673             tz               = _mm256_mul_ps(fscal,dz12);
2674
2675             /* Update vectorial force */
2676             fix1             = _mm256_add_ps(fix1,tx);
2677             fiy1             = _mm256_add_ps(fiy1,ty);
2678             fiz1             = _mm256_add_ps(fiz1,tz);
2679
2680             fjx2             = _mm256_add_ps(fjx2,tx);
2681             fjy2             = _mm256_add_ps(fjy2,ty);
2682             fjz2             = _mm256_add_ps(fjz2,tz);
2683
2684             /**************************
2685              * CALCULATE INTERACTIONS *
2686              **************************/
2687
2688             r20              = _mm256_mul_ps(rsq20,rinv20);
2689             r20              = _mm256_andnot_ps(dummy_mask,r20);
2690
2691             /* Calculate table index by multiplying r with table scale and truncate to integer */
2692             rt               = _mm256_mul_ps(r20,vftabscale);
2693             vfitab           = _mm256_cvttps_epi32(rt);
2694             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2695             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2696             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2697             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2698             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2699             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2700
2701             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2702             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2703                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2704             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2705                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2706             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2707                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2708             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2709                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2710             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2711             Heps             = _mm256_mul_ps(vfeps,H);
2712             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2713             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2714             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2715
2716             fscal            = felec;
2717
2718             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2719
2720             /* Calculate temporary vectorial force */
2721             tx               = _mm256_mul_ps(fscal,dx20);
2722             ty               = _mm256_mul_ps(fscal,dy20);
2723             tz               = _mm256_mul_ps(fscal,dz20);
2724
2725             /* Update vectorial force */
2726             fix2             = _mm256_add_ps(fix2,tx);
2727             fiy2             = _mm256_add_ps(fiy2,ty);
2728             fiz2             = _mm256_add_ps(fiz2,tz);
2729
2730             fjx0             = _mm256_add_ps(fjx0,tx);
2731             fjy0             = _mm256_add_ps(fjy0,ty);
2732             fjz0             = _mm256_add_ps(fjz0,tz);
2733
2734             /**************************
2735              * CALCULATE INTERACTIONS *
2736              **************************/
2737
2738             r21              = _mm256_mul_ps(rsq21,rinv21);
2739             r21              = _mm256_andnot_ps(dummy_mask,r21);
2740
2741             /* Calculate table index by multiplying r with table scale and truncate to integer */
2742             rt               = _mm256_mul_ps(r21,vftabscale);
2743             vfitab           = _mm256_cvttps_epi32(rt);
2744             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2745             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2746             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2747             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2748             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2749             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2750
2751             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2752             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2753                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2754             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2755                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2756             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2757                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2758             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2759                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2760             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2761             Heps             = _mm256_mul_ps(vfeps,H);
2762             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2763             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2764             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2765
2766             fscal            = felec;
2767
2768             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2769
2770             /* Calculate temporary vectorial force */
2771             tx               = _mm256_mul_ps(fscal,dx21);
2772             ty               = _mm256_mul_ps(fscal,dy21);
2773             tz               = _mm256_mul_ps(fscal,dz21);
2774
2775             /* Update vectorial force */
2776             fix2             = _mm256_add_ps(fix2,tx);
2777             fiy2             = _mm256_add_ps(fiy2,ty);
2778             fiz2             = _mm256_add_ps(fiz2,tz);
2779
2780             fjx1             = _mm256_add_ps(fjx1,tx);
2781             fjy1             = _mm256_add_ps(fjy1,ty);
2782             fjz1             = _mm256_add_ps(fjz1,tz);
2783
2784             /**************************
2785              * CALCULATE INTERACTIONS *
2786              **************************/
2787
2788             r22              = _mm256_mul_ps(rsq22,rinv22);
2789             r22              = _mm256_andnot_ps(dummy_mask,r22);
2790
2791             /* Calculate table index by multiplying r with table scale and truncate to integer */
2792             rt               = _mm256_mul_ps(r22,vftabscale);
2793             vfitab           = _mm256_cvttps_epi32(rt);
2794             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2795             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2796             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2797             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2798             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2799             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2800
2801             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2802             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2803                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2804             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2805                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2806             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2807                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2808             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2809                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2810             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2811             Heps             = _mm256_mul_ps(vfeps,H);
2812             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2813             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2814             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2815
2816             fscal            = felec;
2817
2818             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2819
2820             /* Calculate temporary vectorial force */
2821             tx               = _mm256_mul_ps(fscal,dx22);
2822             ty               = _mm256_mul_ps(fscal,dy22);
2823             tz               = _mm256_mul_ps(fscal,dz22);
2824
2825             /* Update vectorial force */
2826             fix2             = _mm256_add_ps(fix2,tx);
2827             fiy2             = _mm256_add_ps(fiy2,ty);
2828             fiz2             = _mm256_add_ps(fiz2,tz);
2829
2830             fjx2             = _mm256_add_ps(fjx2,tx);
2831             fjy2             = _mm256_add_ps(fjy2,ty);
2832             fjz2             = _mm256_add_ps(fjz2,tz);
2833
2834             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2835             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2836             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2837             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2838             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2839             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2840             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2841             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2842
2843             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2844                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2845
2846             /* Inner loop uses 382 flops */
2847         }
2848
2849         /* End of innermost loop */
2850
2851         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2852                                                  f+i_coord_offset,fshift+i_shift_offset);
2853
2854         /* Increment number of inner iterations */
2855         inneriter                  += j_index_end - j_index_start;
2856
2857         /* Outer loop uses 18 flops */
2858     }
2859
2860     /* Increment number of outer iterations */
2861     outeriter        += nri;
2862
2863     /* Update outer/inner flops */
2864
2865     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*382);
2866 }