Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_single
51  * Electrostatics interaction: CubicSplineTable
52  * VdW interaction:            CubicSplineTable
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
69      * jnr indices corresponding to data put in the eight positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrE,jnrF,jnrG,jnrH;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
83     real             scratch[4*DIM];
84     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     real *           vdwioffsetptr0;
86     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     real *           vdwioffsetptr1;
88     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     real *           vdwioffsetptr2;
90     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
92     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
94     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
96     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     int              nvdwtype;
109     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110     int              *vdwtype;
111     real             *vdwparam;
112     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
113     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
114     __m256i          vfitab;
115     __m128i          vfitab_lo,vfitab_hi;
116     __m128i          ifour       = _mm_set1_epi32(4);
117     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
118     real             *vftab;
119     __m256           dummy_mask,cutoff_mask;
120     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
121     __m256           one     = _mm256_set1_ps(1.0);
122     __m256           two     = _mm256_set1_ps(2.0);
123     x                = xx[0];
124     f                = ff[0];
125
126     nri              = nlist->nri;
127     iinr             = nlist->iinr;
128     jindex           = nlist->jindex;
129     jjnr             = nlist->jjnr;
130     shiftidx         = nlist->shift;
131     gid              = nlist->gid;
132     shiftvec         = fr->shift_vec[0];
133     fshift           = fr->fshift[0];
134     facel            = _mm256_set1_ps(fr->ic->epsfac);
135     charge           = mdatoms->chargeA;
136     nvdwtype         = fr->ntype;
137     vdwparam         = fr->nbfp;
138     vdwtype          = mdatoms->typeA;
139
140     vftab            = kernel_data->table_elec_vdw->data;
141     vftabscale       = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
142
143     /* Setup water-specific parameters */
144     inr              = nlist->iinr[0];
145     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
146     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
147     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
148     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
149
150     jq0              = _mm256_set1_ps(charge[inr+0]);
151     jq1              = _mm256_set1_ps(charge[inr+1]);
152     jq2              = _mm256_set1_ps(charge[inr+2]);
153     vdwjidx0A        = 2*vdwtype[inr+0];
154     qq00             = _mm256_mul_ps(iq0,jq0);
155     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
156     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
157     qq01             = _mm256_mul_ps(iq0,jq1);
158     qq02             = _mm256_mul_ps(iq0,jq2);
159     qq10             = _mm256_mul_ps(iq1,jq0);
160     qq11             = _mm256_mul_ps(iq1,jq1);
161     qq12             = _mm256_mul_ps(iq1,jq2);
162     qq20             = _mm256_mul_ps(iq2,jq0);
163     qq21             = _mm256_mul_ps(iq2,jq1);
164     qq22             = _mm256_mul_ps(iq2,jq2);
165
166     /* Avoid stupid compiler warnings */
167     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
168     j_coord_offsetA = 0;
169     j_coord_offsetB = 0;
170     j_coord_offsetC = 0;
171     j_coord_offsetD = 0;
172     j_coord_offsetE = 0;
173     j_coord_offsetF = 0;
174     j_coord_offsetG = 0;
175     j_coord_offsetH = 0;
176
177     outeriter        = 0;
178     inneriter        = 0;
179
180     for(iidx=0;iidx<4*DIM;iidx++)
181     {
182         scratch[iidx] = 0.0;
183     }
184
185     /* Start outer loop over neighborlists */
186     for(iidx=0; iidx<nri; iidx++)
187     {
188         /* Load shift vector for this list */
189         i_shift_offset   = DIM*shiftidx[iidx];
190
191         /* Load limits for loop over neighbors */
192         j_index_start    = jindex[iidx];
193         j_index_end      = jindex[iidx+1];
194
195         /* Get outer coordinate index */
196         inr              = iinr[iidx];
197         i_coord_offset   = DIM*inr;
198
199         /* Load i particle coords and add shift vector */
200         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
201                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
202
203         fix0             = _mm256_setzero_ps();
204         fiy0             = _mm256_setzero_ps();
205         fiz0             = _mm256_setzero_ps();
206         fix1             = _mm256_setzero_ps();
207         fiy1             = _mm256_setzero_ps();
208         fiz1             = _mm256_setzero_ps();
209         fix2             = _mm256_setzero_ps();
210         fiy2             = _mm256_setzero_ps();
211         fiz2             = _mm256_setzero_ps();
212
213         /* Reset potential sums */
214         velecsum         = _mm256_setzero_ps();
215         vvdwsum          = _mm256_setzero_ps();
216
217         /* Start inner kernel loop */
218         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
219         {
220
221             /* Get j neighbor index, and coordinate index */
222             jnrA             = jjnr[jidx];
223             jnrB             = jjnr[jidx+1];
224             jnrC             = jjnr[jidx+2];
225             jnrD             = jjnr[jidx+3];
226             jnrE             = jjnr[jidx+4];
227             jnrF             = jjnr[jidx+5];
228             jnrG             = jjnr[jidx+6];
229             jnrH             = jjnr[jidx+7];
230             j_coord_offsetA  = DIM*jnrA;
231             j_coord_offsetB  = DIM*jnrB;
232             j_coord_offsetC  = DIM*jnrC;
233             j_coord_offsetD  = DIM*jnrD;
234             j_coord_offsetE  = DIM*jnrE;
235             j_coord_offsetF  = DIM*jnrF;
236             j_coord_offsetG  = DIM*jnrG;
237             j_coord_offsetH  = DIM*jnrH;
238
239             /* load j atom coordinates */
240             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
241                                                  x+j_coord_offsetC,x+j_coord_offsetD,
242                                                  x+j_coord_offsetE,x+j_coord_offsetF,
243                                                  x+j_coord_offsetG,x+j_coord_offsetH,
244                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
245
246             /* Calculate displacement vector */
247             dx00             = _mm256_sub_ps(ix0,jx0);
248             dy00             = _mm256_sub_ps(iy0,jy0);
249             dz00             = _mm256_sub_ps(iz0,jz0);
250             dx01             = _mm256_sub_ps(ix0,jx1);
251             dy01             = _mm256_sub_ps(iy0,jy1);
252             dz01             = _mm256_sub_ps(iz0,jz1);
253             dx02             = _mm256_sub_ps(ix0,jx2);
254             dy02             = _mm256_sub_ps(iy0,jy2);
255             dz02             = _mm256_sub_ps(iz0,jz2);
256             dx10             = _mm256_sub_ps(ix1,jx0);
257             dy10             = _mm256_sub_ps(iy1,jy0);
258             dz10             = _mm256_sub_ps(iz1,jz0);
259             dx11             = _mm256_sub_ps(ix1,jx1);
260             dy11             = _mm256_sub_ps(iy1,jy1);
261             dz11             = _mm256_sub_ps(iz1,jz1);
262             dx12             = _mm256_sub_ps(ix1,jx2);
263             dy12             = _mm256_sub_ps(iy1,jy2);
264             dz12             = _mm256_sub_ps(iz1,jz2);
265             dx20             = _mm256_sub_ps(ix2,jx0);
266             dy20             = _mm256_sub_ps(iy2,jy0);
267             dz20             = _mm256_sub_ps(iz2,jz0);
268             dx21             = _mm256_sub_ps(ix2,jx1);
269             dy21             = _mm256_sub_ps(iy2,jy1);
270             dz21             = _mm256_sub_ps(iz2,jz1);
271             dx22             = _mm256_sub_ps(ix2,jx2);
272             dy22             = _mm256_sub_ps(iy2,jy2);
273             dz22             = _mm256_sub_ps(iz2,jz2);
274
275             /* Calculate squared distance and things based on it */
276             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
277             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
278             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
279             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
280             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
281             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
282             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
283             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
284             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
285
286             rinv00           = avx256_invsqrt_f(rsq00);
287             rinv01           = avx256_invsqrt_f(rsq01);
288             rinv02           = avx256_invsqrt_f(rsq02);
289             rinv10           = avx256_invsqrt_f(rsq10);
290             rinv11           = avx256_invsqrt_f(rsq11);
291             rinv12           = avx256_invsqrt_f(rsq12);
292             rinv20           = avx256_invsqrt_f(rsq20);
293             rinv21           = avx256_invsqrt_f(rsq21);
294             rinv22           = avx256_invsqrt_f(rsq22);
295
296             fjx0             = _mm256_setzero_ps();
297             fjy0             = _mm256_setzero_ps();
298             fjz0             = _mm256_setzero_ps();
299             fjx1             = _mm256_setzero_ps();
300             fjy1             = _mm256_setzero_ps();
301             fjz1             = _mm256_setzero_ps();
302             fjx2             = _mm256_setzero_ps();
303             fjy2             = _mm256_setzero_ps();
304             fjz2             = _mm256_setzero_ps();
305
306             /**************************
307              * CALCULATE INTERACTIONS *
308              **************************/
309
310             r00              = _mm256_mul_ps(rsq00,rinv00);
311
312             /* Calculate table index by multiplying r with table scale and truncate to integer */
313             rt               = _mm256_mul_ps(r00,vftabscale);
314             vfitab           = _mm256_cvttps_epi32(rt);
315             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
316             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
317             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
318             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
319             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
320             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
321
322             /* CUBIC SPLINE TABLE ELECTROSTATICS */
323             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
324                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
325             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
326                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
327             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
328                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
329             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
330                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
331             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
332             Heps             = _mm256_mul_ps(vfeps,H);
333             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
334             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
335             velec            = _mm256_mul_ps(qq00,VV);
336             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
337             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
338
339             /* CUBIC SPLINE TABLE DISPERSION */
340             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
341             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
342             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
343                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
344             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
345                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
346             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
347                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
348             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
349                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
350             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
351             Heps             = _mm256_mul_ps(vfeps,H);
352             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
353             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
354             vvdw6            = _mm256_mul_ps(c6_00,VV);
355             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
356             fvdw6            = _mm256_mul_ps(c6_00,FF);
357
358             /* CUBIC SPLINE TABLE REPULSION */
359             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
360             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
361             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
362                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
363             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
364                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
365             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
366                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
367             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
368                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
369             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
370             Heps             = _mm256_mul_ps(vfeps,H);
371             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
372             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
373             vvdw12           = _mm256_mul_ps(c12_00,VV);
374             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
375             fvdw12           = _mm256_mul_ps(c12_00,FF);
376             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
377             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
378
379             /* Update potential sum for this i atom from the interaction with this j atom. */
380             velecsum         = _mm256_add_ps(velecsum,velec);
381             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
382
383             fscal            = _mm256_add_ps(felec,fvdw);
384
385             /* Calculate temporary vectorial force */
386             tx               = _mm256_mul_ps(fscal,dx00);
387             ty               = _mm256_mul_ps(fscal,dy00);
388             tz               = _mm256_mul_ps(fscal,dz00);
389
390             /* Update vectorial force */
391             fix0             = _mm256_add_ps(fix0,tx);
392             fiy0             = _mm256_add_ps(fiy0,ty);
393             fiz0             = _mm256_add_ps(fiz0,tz);
394
395             fjx0             = _mm256_add_ps(fjx0,tx);
396             fjy0             = _mm256_add_ps(fjy0,ty);
397             fjz0             = _mm256_add_ps(fjz0,tz);
398
399             /**************************
400              * CALCULATE INTERACTIONS *
401              **************************/
402
403             r01              = _mm256_mul_ps(rsq01,rinv01);
404
405             /* Calculate table index by multiplying r with table scale and truncate to integer */
406             rt               = _mm256_mul_ps(r01,vftabscale);
407             vfitab           = _mm256_cvttps_epi32(rt);
408             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
409             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
410             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
411             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
412             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
413             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
414
415             /* CUBIC SPLINE TABLE ELECTROSTATICS */
416             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
417                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
418             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
419                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
420             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
421                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
422             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
423                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
424             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
425             Heps             = _mm256_mul_ps(vfeps,H);
426             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
427             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
428             velec            = _mm256_mul_ps(qq01,VV);
429             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
430             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
431
432             /* Update potential sum for this i atom from the interaction with this j atom. */
433             velecsum         = _mm256_add_ps(velecsum,velec);
434
435             fscal            = felec;
436
437             /* Calculate temporary vectorial force */
438             tx               = _mm256_mul_ps(fscal,dx01);
439             ty               = _mm256_mul_ps(fscal,dy01);
440             tz               = _mm256_mul_ps(fscal,dz01);
441
442             /* Update vectorial force */
443             fix0             = _mm256_add_ps(fix0,tx);
444             fiy0             = _mm256_add_ps(fiy0,ty);
445             fiz0             = _mm256_add_ps(fiz0,tz);
446
447             fjx1             = _mm256_add_ps(fjx1,tx);
448             fjy1             = _mm256_add_ps(fjy1,ty);
449             fjz1             = _mm256_add_ps(fjz1,tz);
450
451             /**************************
452              * CALCULATE INTERACTIONS *
453              **************************/
454
455             r02              = _mm256_mul_ps(rsq02,rinv02);
456
457             /* Calculate table index by multiplying r with table scale and truncate to integer */
458             rt               = _mm256_mul_ps(r02,vftabscale);
459             vfitab           = _mm256_cvttps_epi32(rt);
460             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
461             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
462             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
463             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
464             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
465             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
466
467             /* CUBIC SPLINE TABLE ELECTROSTATICS */
468             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
469                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
470             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
471                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
472             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
473                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
474             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
475                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
476             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
477             Heps             = _mm256_mul_ps(vfeps,H);
478             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
479             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
480             velec            = _mm256_mul_ps(qq02,VV);
481             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
482             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
483
484             /* Update potential sum for this i atom from the interaction with this j atom. */
485             velecsum         = _mm256_add_ps(velecsum,velec);
486
487             fscal            = felec;
488
489             /* Calculate temporary vectorial force */
490             tx               = _mm256_mul_ps(fscal,dx02);
491             ty               = _mm256_mul_ps(fscal,dy02);
492             tz               = _mm256_mul_ps(fscal,dz02);
493
494             /* Update vectorial force */
495             fix0             = _mm256_add_ps(fix0,tx);
496             fiy0             = _mm256_add_ps(fiy0,ty);
497             fiz0             = _mm256_add_ps(fiz0,tz);
498
499             fjx2             = _mm256_add_ps(fjx2,tx);
500             fjy2             = _mm256_add_ps(fjy2,ty);
501             fjz2             = _mm256_add_ps(fjz2,tz);
502
503             /**************************
504              * CALCULATE INTERACTIONS *
505              **************************/
506
507             r10              = _mm256_mul_ps(rsq10,rinv10);
508
509             /* Calculate table index by multiplying r with table scale and truncate to integer */
510             rt               = _mm256_mul_ps(r10,vftabscale);
511             vfitab           = _mm256_cvttps_epi32(rt);
512             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
513             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
514             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
515             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
516             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
517             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
518
519             /* CUBIC SPLINE TABLE ELECTROSTATICS */
520             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
521                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
522             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
523                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
524             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
525                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
526             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
527                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
528             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
529             Heps             = _mm256_mul_ps(vfeps,H);
530             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
531             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
532             velec            = _mm256_mul_ps(qq10,VV);
533             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
534             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
535
536             /* Update potential sum for this i atom from the interaction with this j atom. */
537             velecsum         = _mm256_add_ps(velecsum,velec);
538
539             fscal            = felec;
540
541             /* Calculate temporary vectorial force */
542             tx               = _mm256_mul_ps(fscal,dx10);
543             ty               = _mm256_mul_ps(fscal,dy10);
544             tz               = _mm256_mul_ps(fscal,dz10);
545
546             /* Update vectorial force */
547             fix1             = _mm256_add_ps(fix1,tx);
548             fiy1             = _mm256_add_ps(fiy1,ty);
549             fiz1             = _mm256_add_ps(fiz1,tz);
550
551             fjx0             = _mm256_add_ps(fjx0,tx);
552             fjy0             = _mm256_add_ps(fjy0,ty);
553             fjz0             = _mm256_add_ps(fjz0,tz);
554
555             /**************************
556              * CALCULATE INTERACTIONS *
557              **************************/
558
559             r11              = _mm256_mul_ps(rsq11,rinv11);
560
561             /* Calculate table index by multiplying r with table scale and truncate to integer */
562             rt               = _mm256_mul_ps(r11,vftabscale);
563             vfitab           = _mm256_cvttps_epi32(rt);
564             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
565             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
566             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
567             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
568             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
569             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
570
571             /* CUBIC SPLINE TABLE ELECTROSTATICS */
572             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
573                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
574             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
575                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
576             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
577                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
578             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
579                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
580             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
581             Heps             = _mm256_mul_ps(vfeps,H);
582             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
583             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
584             velec            = _mm256_mul_ps(qq11,VV);
585             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
586             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
587
588             /* Update potential sum for this i atom from the interaction with this j atom. */
589             velecsum         = _mm256_add_ps(velecsum,velec);
590
591             fscal            = felec;
592
593             /* Calculate temporary vectorial force */
594             tx               = _mm256_mul_ps(fscal,dx11);
595             ty               = _mm256_mul_ps(fscal,dy11);
596             tz               = _mm256_mul_ps(fscal,dz11);
597
598             /* Update vectorial force */
599             fix1             = _mm256_add_ps(fix1,tx);
600             fiy1             = _mm256_add_ps(fiy1,ty);
601             fiz1             = _mm256_add_ps(fiz1,tz);
602
603             fjx1             = _mm256_add_ps(fjx1,tx);
604             fjy1             = _mm256_add_ps(fjy1,ty);
605             fjz1             = _mm256_add_ps(fjz1,tz);
606
607             /**************************
608              * CALCULATE INTERACTIONS *
609              **************************/
610
611             r12              = _mm256_mul_ps(rsq12,rinv12);
612
613             /* Calculate table index by multiplying r with table scale and truncate to integer */
614             rt               = _mm256_mul_ps(r12,vftabscale);
615             vfitab           = _mm256_cvttps_epi32(rt);
616             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
617             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
618             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
619             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
620             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
621             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
622
623             /* CUBIC SPLINE TABLE ELECTROSTATICS */
624             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
625                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
626             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
627                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
628             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
629                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
630             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
631                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
632             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
633             Heps             = _mm256_mul_ps(vfeps,H);
634             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
635             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
636             velec            = _mm256_mul_ps(qq12,VV);
637             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
638             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
639
640             /* Update potential sum for this i atom from the interaction with this j atom. */
641             velecsum         = _mm256_add_ps(velecsum,velec);
642
643             fscal            = felec;
644
645             /* Calculate temporary vectorial force */
646             tx               = _mm256_mul_ps(fscal,dx12);
647             ty               = _mm256_mul_ps(fscal,dy12);
648             tz               = _mm256_mul_ps(fscal,dz12);
649
650             /* Update vectorial force */
651             fix1             = _mm256_add_ps(fix1,tx);
652             fiy1             = _mm256_add_ps(fiy1,ty);
653             fiz1             = _mm256_add_ps(fiz1,tz);
654
655             fjx2             = _mm256_add_ps(fjx2,tx);
656             fjy2             = _mm256_add_ps(fjy2,ty);
657             fjz2             = _mm256_add_ps(fjz2,tz);
658
659             /**************************
660              * CALCULATE INTERACTIONS *
661              **************************/
662
663             r20              = _mm256_mul_ps(rsq20,rinv20);
664
665             /* Calculate table index by multiplying r with table scale and truncate to integer */
666             rt               = _mm256_mul_ps(r20,vftabscale);
667             vfitab           = _mm256_cvttps_epi32(rt);
668             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
669             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
670             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
671             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
672             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
673             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
674
675             /* CUBIC SPLINE TABLE ELECTROSTATICS */
676             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
677                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
678             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
679                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
680             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
681                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
682             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
683                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
684             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
685             Heps             = _mm256_mul_ps(vfeps,H);
686             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
687             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
688             velec            = _mm256_mul_ps(qq20,VV);
689             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
690             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
691
692             /* Update potential sum for this i atom from the interaction with this j atom. */
693             velecsum         = _mm256_add_ps(velecsum,velec);
694
695             fscal            = felec;
696
697             /* Calculate temporary vectorial force */
698             tx               = _mm256_mul_ps(fscal,dx20);
699             ty               = _mm256_mul_ps(fscal,dy20);
700             tz               = _mm256_mul_ps(fscal,dz20);
701
702             /* Update vectorial force */
703             fix2             = _mm256_add_ps(fix2,tx);
704             fiy2             = _mm256_add_ps(fiy2,ty);
705             fiz2             = _mm256_add_ps(fiz2,tz);
706
707             fjx0             = _mm256_add_ps(fjx0,tx);
708             fjy0             = _mm256_add_ps(fjy0,ty);
709             fjz0             = _mm256_add_ps(fjz0,tz);
710
711             /**************************
712              * CALCULATE INTERACTIONS *
713              **************************/
714
715             r21              = _mm256_mul_ps(rsq21,rinv21);
716
717             /* Calculate table index by multiplying r with table scale and truncate to integer */
718             rt               = _mm256_mul_ps(r21,vftabscale);
719             vfitab           = _mm256_cvttps_epi32(rt);
720             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
721             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
722             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
723             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
724             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
725             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
726
727             /* CUBIC SPLINE TABLE ELECTROSTATICS */
728             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
729                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
730             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
731                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
732             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
733                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
734             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
735                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
736             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
737             Heps             = _mm256_mul_ps(vfeps,H);
738             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
739             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
740             velec            = _mm256_mul_ps(qq21,VV);
741             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
742             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
743
744             /* Update potential sum for this i atom from the interaction with this j atom. */
745             velecsum         = _mm256_add_ps(velecsum,velec);
746
747             fscal            = felec;
748
749             /* Calculate temporary vectorial force */
750             tx               = _mm256_mul_ps(fscal,dx21);
751             ty               = _mm256_mul_ps(fscal,dy21);
752             tz               = _mm256_mul_ps(fscal,dz21);
753
754             /* Update vectorial force */
755             fix2             = _mm256_add_ps(fix2,tx);
756             fiy2             = _mm256_add_ps(fiy2,ty);
757             fiz2             = _mm256_add_ps(fiz2,tz);
758
759             fjx1             = _mm256_add_ps(fjx1,tx);
760             fjy1             = _mm256_add_ps(fjy1,ty);
761             fjz1             = _mm256_add_ps(fjz1,tz);
762
763             /**************************
764              * CALCULATE INTERACTIONS *
765              **************************/
766
767             r22              = _mm256_mul_ps(rsq22,rinv22);
768
769             /* Calculate table index by multiplying r with table scale and truncate to integer */
770             rt               = _mm256_mul_ps(r22,vftabscale);
771             vfitab           = _mm256_cvttps_epi32(rt);
772             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
773             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
774             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
775             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
776             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
777             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
778
779             /* CUBIC SPLINE TABLE ELECTROSTATICS */
780             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
781                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
782             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
783                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
784             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
785                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
786             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
787                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
788             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
789             Heps             = _mm256_mul_ps(vfeps,H);
790             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
791             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
792             velec            = _mm256_mul_ps(qq22,VV);
793             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
794             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
795
796             /* Update potential sum for this i atom from the interaction with this j atom. */
797             velecsum         = _mm256_add_ps(velecsum,velec);
798
799             fscal            = felec;
800
801             /* Calculate temporary vectorial force */
802             tx               = _mm256_mul_ps(fscal,dx22);
803             ty               = _mm256_mul_ps(fscal,dy22);
804             tz               = _mm256_mul_ps(fscal,dz22);
805
806             /* Update vectorial force */
807             fix2             = _mm256_add_ps(fix2,tx);
808             fiy2             = _mm256_add_ps(fiy2,ty);
809             fiz2             = _mm256_add_ps(fiz2,tz);
810
811             fjx2             = _mm256_add_ps(fjx2,tx);
812             fjy2             = _mm256_add_ps(fjy2,ty);
813             fjz2             = _mm256_add_ps(fjz2,tz);
814
815             fjptrA             = f+j_coord_offsetA;
816             fjptrB             = f+j_coord_offsetB;
817             fjptrC             = f+j_coord_offsetC;
818             fjptrD             = f+j_coord_offsetD;
819             fjptrE             = f+j_coord_offsetE;
820             fjptrF             = f+j_coord_offsetF;
821             fjptrG             = f+j_coord_offsetG;
822             fjptrH             = f+j_coord_offsetH;
823
824             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
825                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
826
827             /* Inner loop uses 417 flops */
828         }
829
830         if(jidx<j_index_end)
831         {
832
833             /* Get j neighbor index, and coordinate index */
834             jnrlistA         = jjnr[jidx];
835             jnrlistB         = jjnr[jidx+1];
836             jnrlistC         = jjnr[jidx+2];
837             jnrlistD         = jjnr[jidx+3];
838             jnrlistE         = jjnr[jidx+4];
839             jnrlistF         = jjnr[jidx+5];
840             jnrlistG         = jjnr[jidx+6];
841             jnrlistH         = jjnr[jidx+7];
842             /* Sign of each element will be negative for non-real atoms.
843              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
844              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
845              */
846             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
847                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
848                                             
849             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
850             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
851             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
852             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
853             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
854             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
855             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
856             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
857             j_coord_offsetA  = DIM*jnrA;
858             j_coord_offsetB  = DIM*jnrB;
859             j_coord_offsetC  = DIM*jnrC;
860             j_coord_offsetD  = DIM*jnrD;
861             j_coord_offsetE  = DIM*jnrE;
862             j_coord_offsetF  = DIM*jnrF;
863             j_coord_offsetG  = DIM*jnrG;
864             j_coord_offsetH  = DIM*jnrH;
865
866             /* load j atom coordinates */
867             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
868                                                  x+j_coord_offsetC,x+j_coord_offsetD,
869                                                  x+j_coord_offsetE,x+j_coord_offsetF,
870                                                  x+j_coord_offsetG,x+j_coord_offsetH,
871                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
872
873             /* Calculate displacement vector */
874             dx00             = _mm256_sub_ps(ix0,jx0);
875             dy00             = _mm256_sub_ps(iy0,jy0);
876             dz00             = _mm256_sub_ps(iz0,jz0);
877             dx01             = _mm256_sub_ps(ix0,jx1);
878             dy01             = _mm256_sub_ps(iy0,jy1);
879             dz01             = _mm256_sub_ps(iz0,jz1);
880             dx02             = _mm256_sub_ps(ix0,jx2);
881             dy02             = _mm256_sub_ps(iy0,jy2);
882             dz02             = _mm256_sub_ps(iz0,jz2);
883             dx10             = _mm256_sub_ps(ix1,jx0);
884             dy10             = _mm256_sub_ps(iy1,jy0);
885             dz10             = _mm256_sub_ps(iz1,jz0);
886             dx11             = _mm256_sub_ps(ix1,jx1);
887             dy11             = _mm256_sub_ps(iy1,jy1);
888             dz11             = _mm256_sub_ps(iz1,jz1);
889             dx12             = _mm256_sub_ps(ix1,jx2);
890             dy12             = _mm256_sub_ps(iy1,jy2);
891             dz12             = _mm256_sub_ps(iz1,jz2);
892             dx20             = _mm256_sub_ps(ix2,jx0);
893             dy20             = _mm256_sub_ps(iy2,jy0);
894             dz20             = _mm256_sub_ps(iz2,jz0);
895             dx21             = _mm256_sub_ps(ix2,jx1);
896             dy21             = _mm256_sub_ps(iy2,jy1);
897             dz21             = _mm256_sub_ps(iz2,jz1);
898             dx22             = _mm256_sub_ps(ix2,jx2);
899             dy22             = _mm256_sub_ps(iy2,jy2);
900             dz22             = _mm256_sub_ps(iz2,jz2);
901
902             /* Calculate squared distance and things based on it */
903             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
904             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
905             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
906             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
907             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
908             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
909             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
910             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
911             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
912
913             rinv00           = avx256_invsqrt_f(rsq00);
914             rinv01           = avx256_invsqrt_f(rsq01);
915             rinv02           = avx256_invsqrt_f(rsq02);
916             rinv10           = avx256_invsqrt_f(rsq10);
917             rinv11           = avx256_invsqrt_f(rsq11);
918             rinv12           = avx256_invsqrt_f(rsq12);
919             rinv20           = avx256_invsqrt_f(rsq20);
920             rinv21           = avx256_invsqrt_f(rsq21);
921             rinv22           = avx256_invsqrt_f(rsq22);
922
923             fjx0             = _mm256_setzero_ps();
924             fjy0             = _mm256_setzero_ps();
925             fjz0             = _mm256_setzero_ps();
926             fjx1             = _mm256_setzero_ps();
927             fjy1             = _mm256_setzero_ps();
928             fjz1             = _mm256_setzero_ps();
929             fjx2             = _mm256_setzero_ps();
930             fjy2             = _mm256_setzero_ps();
931             fjz2             = _mm256_setzero_ps();
932
933             /**************************
934              * CALCULATE INTERACTIONS *
935              **************************/
936
937             r00              = _mm256_mul_ps(rsq00,rinv00);
938             r00              = _mm256_andnot_ps(dummy_mask,r00);
939
940             /* Calculate table index by multiplying r with table scale and truncate to integer */
941             rt               = _mm256_mul_ps(r00,vftabscale);
942             vfitab           = _mm256_cvttps_epi32(rt);
943             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
944             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
945             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
946             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
947             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
948             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
949
950             /* CUBIC SPLINE TABLE ELECTROSTATICS */
951             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
952                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
953             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
954                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
955             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
956                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
957             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
958                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
959             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
960             Heps             = _mm256_mul_ps(vfeps,H);
961             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
962             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
963             velec            = _mm256_mul_ps(qq00,VV);
964             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
965             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
966
967             /* CUBIC SPLINE TABLE DISPERSION */
968             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
969             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
970             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
971                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
972             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
973                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
974             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
975                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
976             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
977                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
978             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
979             Heps             = _mm256_mul_ps(vfeps,H);
980             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
981             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
982             vvdw6            = _mm256_mul_ps(c6_00,VV);
983             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
984             fvdw6            = _mm256_mul_ps(c6_00,FF);
985
986             /* CUBIC SPLINE TABLE REPULSION */
987             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
988             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
989             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
990                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
991             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
992                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
993             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
994                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
995             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
996                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
997             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
998             Heps             = _mm256_mul_ps(vfeps,H);
999             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1000             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1001             vvdw12           = _mm256_mul_ps(c12_00,VV);
1002             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1003             fvdw12           = _mm256_mul_ps(c12_00,FF);
1004             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
1005             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1006
1007             /* Update potential sum for this i atom from the interaction with this j atom. */
1008             velec            = _mm256_andnot_ps(dummy_mask,velec);
1009             velecsum         = _mm256_add_ps(velecsum,velec);
1010             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
1011             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
1012
1013             fscal            = _mm256_add_ps(felec,fvdw);
1014
1015             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1016
1017             /* Calculate temporary vectorial force */
1018             tx               = _mm256_mul_ps(fscal,dx00);
1019             ty               = _mm256_mul_ps(fscal,dy00);
1020             tz               = _mm256_mul_ps(fscal,dz00);
1021
1022             /* Update vectorial force */
1023             fix0             = _mm256_add_ps(fix0,tx);
1024             fiy0             = _mm256_add_ps(fiy0,ty);
1025             fiz0             = _mm256_add_ps(fiz0,tz);
1026
1027             fjx0             = _mm256_add_ps(fjx0,tx);
1028             fjy0             = _mm256_add_ps(fjy0,ty);
1029             fjz0             = _mm256_add_ps(fjz0,tz);
1030
1031             /**************************
1032              * CALCULATE INTERACTIONS *
1033              **************************/
1034
1035             r01              = _mm256_mul_ps(rsq01,rinv01);
1036             r01              = _mm256_andnot_ps(dummy_mask,r01);
1037
1038             /* Calculate table index by multiplying r with table scale and truncate to integer */
1039             rt               = _mm256_mul_ps(r01,vftabscale);
1040             vfitab           = _mm256_cvttps_epi32(rt);
1041             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1042             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1043             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1044             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1045             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1046             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1047
1048             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1049             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1050                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1051             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1052                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1053             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1054                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1055             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1056                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1057             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1058             Heps             = _mm256_mul_ps(vfeps,H);
1059             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1060             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1061             velec            = _mm256_mul_ps(qq01,VV);
1062             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1063             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1064
1065             /* Update potential sum for this i atom from the interaction with this j atom. */
1066             velec            = _mm256_andnot_ps(dummy_mask,velec);
1067             velecsum         = _mm256_add_ps(velecsum,velec);
1068
1069             fscal            = felec;
1070
1071             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1072
1073             /* Calculate temporary vectorial force */
1074             tx               = _mm256_mul_ps(fscal,dx01);
1075             ty               = _mm256_mul_ps(fscal,dy01);
1076             tz               = _mm256_mul_ps(fscal,dz01);
1077
1078             /* Update vectorial force */
1079             fix0             = _mm256_add_ps(fix0,tx);
1080             fiy0             = _mm256_add_ps(fiy0,ty);
1081             fiz0             = _mm256_add_ps(fiz0,tz);
1082
1083             fjx1             = _mm256_add_ps(fjx1,tx);
1084             fjy1             = _mm256_add_ps(fjy1,ty);
1085             fjz1             = _mm256_add_ps(fjz1,tz);
1086
1087             /**************************
1088              * CALCULATE INTERACTIONS *
1089              **************************/
1090
1091             r02              = _mm256_mul_ps(rsq02,rinv02);
1092             r02              = _mm256_andnot_ps(dummy_mask,r02);
1093
1094             /* Calculate table index by multiplying r with table scale and truncate to integer */
1095             rt               = _mm256_mul_ps(r02,vftabscale);
1096             vfitab           = _mm256_cvttps_epi32(rt);
1097             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1098             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1099             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1100             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1101             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1102             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1103
1104             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1105             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1106                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1107             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1108                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1109             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1110                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1111             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1112                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1113             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1114             Heps             = _mm256_mul_ps(vfeps,H);
1115             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1116             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1117             velec            = _mm256_mul_ps(qq02,VV);
1118             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1119             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1120
1121             /* Update potential sum for this i atom from the interaction with this j atom. */
1122             velec            = _mm256_andnot_ps(dummy_mask,velec);
1123             velecsum         = _mm256_add_ps(velecsum,velec);
1124
1125             fscal            = felec;
1126
1127             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1128
1129             /* Calculate temporary vectorial force */
1130             tx               = _mm256_mul_ps(fscal,dx02);
1131             ty               = _mm256_mul_ps(fscal,dy02);
1132             tz               = _mm256_mul_ps(fscal,dz02);
1133
1134             /* Update vectorial force */
1135             fix0             = _mm256_add_ps(fix0,tx);
1136             fiy0             = _mm256_add_ps(fiy0,ty);
1137             fiz0             = _mm256_add_ps(fiz0,tz);
1138
1139             fjx2             = _mm256_add_ps(fjx2,tx);
1140             fjy2             = _mm256_add_ps(fjy2,ty);
1141             fjz2             = _mm256_add_ps(fjz2,tz);
1142
1143             /**************************
1144              * CALCULATE INTERACTIONS *
1145              **************************/
1146
1147             r10              = _mm256_mul_ps(rsq10,rinv10);
1148             r10              = _mm256_andnot_ps(dummy_mask,r10);
1149
1150             /* Calculate table index by multiplying r with table scale and truncate to integer */
1151             rt               = _mm256_mul_ps(r10,vftabscale);
1152             vfitab           = _mm256_cvttps_epi32(rt);
1153             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1154             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1155             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1156             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1157             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1158             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1159
1160             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1161             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1162                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1163             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1164                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1165             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1166                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1167             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1168                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1169             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1170             Heps             = _mm256_mul_ps(vfeps,H);
1171             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1172             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1173             velec            = _mm256_mul_ps(qq10,VV);
1174             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1175             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1176
1177             /* Update potential sum for this i atom from the interaction with this j atom. */
1178             velec            = _mm256_andnot_ps(dummy_mask,velec);
1179             velecsum         = _mm256_add_ps(velecsum,velec);
1180
1181             fscal            = felec;
1182
1183             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1184
1185             /* Calculate temporary vectorial force */
1186             tx               = _mm256_mul_ps(fscal,dx10);
1187             ty               = _mm256_mul_ps(fscal,dy10);
1188             tz               = _mm256_mul_ps(fscal,dz10);
1189
1190             /* Update vectorial force */
1191             fix1             = _mm256_add_ps(fix1,tx);
1192             fiy1             = _mm256_add_ps(fiy1,ty);
1193             fiz1             = _mm256_add_ps(fiz1,tz);
1194
1195             fjx0             = _mm256_add_ps(fjx0,tx);
1196             fjy0             = _mm256_add_ps(fjy0,ty);
1197             fjz0             = _mm256_add_ps(fjz0,tz);
1198
1199             /**************************
1200              * CALCULATE INTERACTIONS *
1201              **************************/
1202
1203             r11              = _mm256_mul_ps(rsq11,rinv11);
1204             r11              = _mm256_andnot_ps(dummy_mask,r11);
1205
1206             /* Calculate table index by multiplying r with table scale and truncate to integer */
1207             rt               = _mm256_mul_ps(r11,vftabscale);
1208             vfitab           = _mm256_cvttps_epi32(rt);
1209             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1210             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1211             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1212             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1213             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1214             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1215
1216             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1217             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1218                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1219             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1220                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1221             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1222                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1223             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1224                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1225             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1226             Heps             = _mm256_mul_ps(vfeps,H);
1227             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1228             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1229             velec            = _mm256_mul_ps(qq11,VV);
1230             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1231             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1232
1233             /* Update potential sum for this i atom from the interaction with this j atom. */
1234             velec            = _mm256_andnot_ps(dummy_mask,velec);
1235             velecsum         = _mm256_add_ps(velecsum,velec);
1236
1237             fscal            = felec;
1238
1239             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1240
1241             /* Calculate temporary vectorial force */
1242             tx               = _mm256_mul_ps(fscal,dx11);
1243             ty               = _mm256_mul_ps(fscal,dy11);
1244             tz               = _mm256_mul_ps(fscal,dz11);
1245
1246             /* Update vectorial force */
1247             fix1             = _mm256_add_ps(fix1,tx);
1248             fiy1             = _mm256_add_ps(fiy1,ty);
1249             fiz1             = _mm256_add_ps(fiz1,tz);
1250
1251             fjx1             = _mm256_add_ps(fjx1,tx);
1252             fjy1             = _mm256_add_ps(fjy1,ty);
1253             fjz1             = _mm256_add_ps(fjz1,tz);
1254
1255             /**************************
1256              * CALCULATE INTERACTIONS *
1257              **************************/
1258
1259             r12              = _mm256_mul_ps(rsq12,rinv12);
1260             r12              = _mm256_andnot_ps(dummy_mask,r12);
1261
1262             /* Calculate table index by multiplying r with table scale and truncate to integer */
1263             rt               = _mm256_mul_ps(r12,vftabscale);
1264             vfitab           = _mm256_cvttps_epi32(rt);
1265             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1266             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1267             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1268             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1269             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1270             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1271
1272             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1273             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1274                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1275             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1276                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1277             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1278                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1279             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1280                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1281             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1282             Heps             = _mm256_mul_ps(vfeps,H);
1283             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1284             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1285             velec            = _mm256_mul_ps(qq12,VV);
1286             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1287             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1288
1289             /* Update potential sum for this i atom from the interaction with this j atom. */
1290             velec            = _mm256_andnot_ps(dummy_mask,velec);
1291             velecsum         = _mm256_add_ps(velecsum,velec);
1292
1293             fscal            = felec;
1294
1295             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1296
1297             /* Calculate temporary vectorial force */
1298             tx               = _mm256_mul_ps(fscal,dx12);
1299             ty               = _mm256_mul_ps(fscal,dy12);
1300             tz               = _mm256_mul_ps(fscal,dz12);
1301
1302             /* Update vectorial force */
1303             fix1             = _mm256_add_ps(fix1,tx);
1304             fiy1             = _mm256_add_ps(fiy1,ty);
1305             fiz1             = _mm256_add_ps(fiz1,tz);
1306
1307             fjx2             = _mm256_add_ps(fjx2,tx);
1308             fjy2             = _mm256_add_ps(fjy2,ty);
1309             fjz2             = _mm256_add_ps(fjz2,tz);
1310
1311             /**************************
1312              * CALCULATE INTERACTIONS *
1313              **************************/
1314
1315             r20              = _mm256_mul_ps(rsq20,rinv20);
1316             r20              = _mm256_andnot_ps(dummy_mask,r20);
1317
1318             /* Calculate table index by multiplying r with table scale and truncate to integer */
1319             rt               = _mm256_mul_ps(r20,vftabscale);
1320             vfitab           = _mm256_cvttps_epi32(rt);
1321             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1322             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1323             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1324             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1325             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1326             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1327
1328             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1329             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1330                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1331             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1332                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1333             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1334                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1335             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1336                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1337             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1338             Heps             = _mm256_mul_ps(vfeps,H);
1339             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1340             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1341             velec            = _mm256_mul_ps(qq20,VV);
1342             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1343             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1344
1345             /* Update potential sum for this i atom from the interaction with this j atom. */
1346             velec            = _mm256_andnot_ps(dummy_mask,velec);
1347             velecsum         = _mm256_add_ps(velecsum,velec);
1348
1349             fscal            = felec;
1350
1351             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1352
1353             /* Calculate temporary vectorial force */
1354             tx               = _mm256_mul_ps(fscal,dx20);
1355             ty               = _mm256_mul_ps(fscal,dy20);
1356             tz               = _mm256_mul_ps(fscal,dz20);
1357
1358             /* Update vectorial force */
1359             fix2             = _mm256_add_ps(fix2,tx);
1360             fiy2             = _mm256_add_ps(fiy2,ty);
1361             fiz2             = _mm256_add_ps(fiz2,tz);
1362
1363             fjx0             = _mm256_add_ps(fjx0,tx);
1364             fjy0             = _mm256_add_ps(fjy0,ty);
1365             fjz0             = _mm256_add_ps(fjz0,tz);
1366
1367             /**************************
1368              * CALCULATE INTERACTIONS *
1369              **************************/
1370
1371             r21              = _mm256_mul_ps(rsq21,rinv21);
1372             r21              = _mm256_andnot_ps(dummy_mask,r21);
1373
1374             /* Calculate table index by multiplying r with table scale and truncate to integer */
1375             rt               = _mm256_mul_ps(r21,vftabscale);
1376             vfitab           = _mm256_cvttps_epi32(rt);
1377             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1378             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1379             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1380             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1381             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1382             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1383
1384             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1385             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1386                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1387             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1388                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1389             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1390                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1391             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1392                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1393             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1394             Heps             = _mm256_mul_ps(vfeps,H);
1395             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1396             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1397             velec            = _mm256_mul_ps(qq21,VV);
1398             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1399             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1400
1401             /* Update potential sum for this i atom from the interaction with this j atom. */
1402             velec            = _mm256_andnot_ps(dummy_mask,velec);
1403             velecsum         = _mm256_add_ps(velecsum,velec);
1404
1405             fscal            = felec;
1406
1407             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1408
1409             /* Calculate temporary vectorial force */
1410             tx               = _mm256_mul_ps(fscal,dx21);
1411             ty               = _mm256_mul_ps(fscal,dy21);
1412             tz               = _mm256_mul_ps(fscal,dz21);
1413
1414             /* Update vectorial force */
1415             fix2             = _mm256_add_ps(fix2,tx);
1416             fiy2             = _mm256_add_ps(fiy2,ty);
1417             fiz2             = _mm256_add_ps(fiz2,tz);
1418
1419             fjx1             = _mm256_add_ps(fjx1,tx);
1420             fjy1             = _mm256_add_ps(fjy1,ty);
1421             fjz1             = _mm256_add_ps(fjz1,tz);
1422
1423             /**************************
1424              * CALCULATE INTERACTIONS *
1425              **************************/
1426
1427             r22              = _mm256_mul_ps(rsq22,rinv22);
1428             r22              = _mm256_andnot_ps(dummy_mask,r22);
1429
1430             /* Calculate table index by multiplying r with table scale and truncate to integer */
1431             rt               = _mm256_mul_ps(r22,vftabscale);
1432             vfitab           = _mm256_cvttps_epi32(rt);
1433             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1434             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1435             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1436             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1437             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1438             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1439
1440             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1441             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1442                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1443             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1444                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1445             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1446                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1447             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1448                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1449             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1450             Heps             = _mm256_mul_ps(vfeps,H);
1451             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1452             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1453             velec            = _mm256_mul_ps(qq22,VV);
1454             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1455             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1456
1457             /* Update potential sum for this i atom from the interaction with this j atom. */
1458             velec            = _mm256_andnot_ps(dummy_mask,velec);
1459             velecsum         = _mm256_add_ps(velecsum,velec);
1460
1461             fscal            = felec;
1462
1463             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1464
1465             /* Calculate temporary vectorial force */
1466             tx               = _mm256_mul_ps(fscal,dx22);
1467             ty               = _mm256_mul_ps(fscal,dy22);
1468             tz               = _mm256_mul_ps(fscal,dz22);
1469
1470             /* Update vectorial force */
1471             fix2             = _mm256_add_ps(fix2,tx);
1472             fiy2             = _mm256_add_ps(fiy2,ty);
1473             fiz2             = _mm256_add_ps(fiz2,tz);
1474
1475             fjx2             = _mm256_add_ps(fjx2,tx);
1476             fjy2             = _mm256_add_ps(fjy2,ty);
1477             fjz2             = _mm256_add_ps(fjz2,tz);
1478
1479             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1480             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1481             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1482             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1483             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1484             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1485             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1486             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1487
1488             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1489                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1490
1491             /* Inner loop uses 426 flops */
1492         }
1493
1494         /* End of innermost loop */
1495
1496         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1497                                                  f+i_coord_offset,fshift+i_shift_offset);
1498
1499         ggid                        = gid[iidx];
1500         /* Update potential energies */
1501         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1502         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1503
1504         /* Increment number of inner iterations */
1505         inneriter                  += j_index_end - j_index_start;
1506
1507         /* Outer loop uses 20 flops */
1508     }
1509
1510     /* Increment number of outer iterations */
1511     outeriter        += nri;
1512
1513     /* Update outer/inner flops */
1514
1515     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*426);
1516 }
1517 /*
1518  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_single
1519  * Electrostatics interaction: CubicSplineTable
1520  * VdW interaction:            CubicSplineTable
1521  * Geometry:                   Water3-Water3
1522  * Calculate force/pot:        Force
1523  */
1524 void
1525 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_single
1526                     (t_nblist                    * gmx_restrict       nlist,
1527                      rvec                        * gmx_restrict          xx,
1528                      rvec                        * gmx_restrict          ff,
1529                      struct t_forcerec           * gmx_restrict          fr,
1530                      t_mdatoms                   * gmx_restrict     mdatoms,
1531                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1532                      t_nrnb                      * gmx_restrict        nrnb)
1533 {
1534     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1535      * just 0 for non-waters.
1536      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1537      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1538      */
1539     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1540     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1541     int              jnrA,jnrB,jnrC,jnrD;
1542     int              jnrE,jnrF,jnrG,jnrH;
1543     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1544     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1545     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1546     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1547     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1548     real             rcutoff_scalar;
1549     real             *shiftvec,*fshift,*x,*f;
1550     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1551     real             scratch[4*DIM];
1552     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1553     real *           vdwioffsetptr0;
1554     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1555     real *           vdwioffsetptr1;
1556     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1557     real *           vdwioffsetptr2;
1558     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1559     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1560     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1561     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1562     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1563     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1564     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1565     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1566     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1567     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1568     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1569     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1570     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1571     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1572     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1573     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1574     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1575     real             *charge;
1576     int              nvdwtype;
1577     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1578     int              *vdwtype;
1579     real             *vdwparam;
1580     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1581     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1582     __m256i          vfitab;
1583     __m128i          vfitab_lo,vfitab_hi;
1584     __m128i          ifour       = _mm_set1_epi32(4);
1585     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1586     real             *vftab;
1587     __m256           dummy_mask,cutoff_mask;
1588     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1589     __m256           one     = _mm256_set1_ps(1.0);
1590     __m256           two     = _mm256_set1_ps(2.0);
1591     x                = xx[0];
1592     f                = ff[0];
1593
1594     nri              = nlist->nri;
1595     iinr             = nlist->iinr;
1596     jindex           = nlist->jindex;
1597     jjnr             = nlist->jjnr;
1598     shiftidx         = nlist->shift;
1599     gid              = nlist->gid;
1600     shiftvec         = fr->shift_vec[0];
1601     fshift           = fr->fshift[0];
1602     facel            = _mm256_set1_ps(fr->ic->epsfac);
1603     charge           = mdatoms->chargeA;
1604     nvdwtype         = fr->ntype;
1605     vdwparam         = fr->nbfp;
1606     vdwtype          = mdatoms->typeA;
1607
1608     vftab            = kernel_data->table_elec_vdw->data;
1609     vftabscale       = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
1610
1611     /* Setup water-specific parameters */
1612     inr              = nlist->iinr[0];
1613     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1614     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1615     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1616     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1617
1618     jq0              = _mm256_set1_ps(charge[inr+0]);
1619     jq1              = _mm256_set1_ps(charge[inr+1]);
1620     jq2              = _mm256_set1_ps(charge[inr+2]);
1621     vdwjidx0A        = 2*vdwtype[inr+0];
1622     qq00             = _mm256_mul_ps(iq0,jq0);
1623     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1624     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1625     qq01             = _mm256_mul_ps(iq0,jq1);
1626     qq02             = _mm256_mul_ps(iq0,jq2);
1627     qq10             = _mm256_mul_ps(iq1,jq0);
1628     qq11             = _mm256_mul_ps(iq1,jq1);
1629     qq12             = _mm256_mul_ps(iq1,jq2);
1630     qq20             = _mm256_mul_ps(iq2,jq0);
1631     qq21             = _mm256_mul_ps(iq2,jq1);
1632     qq22             = _mm256_mul_ps(iq2,jq2);
1633
1634     /* Avoid stupid compiler warnings */
1635     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1636     j_coord_offsetA = 0;
1637     j_coord_offsetB = 0;
1638     j_coord_offsetC = 0;
1639     j_coord_offsetD = 0;
1640     j_coord_offsetE = 0;
1641     j_coord_offsetF = 0;
1642     j_coord_offsetG = 0;
1643     j_coord_offsetH = 0;
1644
1645     outeriter        = 0;
1646     inneriter        = 0;
1647
1648     for(iidx=0;iidx<4*DIM;iidx++)
1649     {
1650         scratch[iidx] = 0.0;
1651     }
1652
1653     /* Start outer loop over neighborlists */
1654     for(iidx=0; iidx<nri; iidx++)
1655     {
1656         /* Load shift vector for this list */
1657         i_shift_offset   = DIM*shiftidx[iidx];
1658
1659         /* Load limits for loop over neighbors */
1660         j_index_start    = jindex[iidx];
1661         j_index_end      = jindex[iidx+1];
1662
1663         /* Get outer coordinate index */
1664         inr              = iinr[iidx];
1665         i_coord_offset   = DIM*inr;
1666
1667         /* Load i particle coords and add shift vector */
1668         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1669                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1670
1671         fix0             = _mm256_setzero_ps();
1672         fiy0             = _mm256_setzero_ps();
1673         fiz0             = _mm256_setzero_ps();
1674         fix1             = _mm256_setzero_ps();
1675         fiy1             = _mm256_setzero_ps();
1676         fiz1             = _mm256_setzero_ps();
1677         fix2             = _mm256_setzero_ps();
1678         fiy2             = _mm256_setzero_ps();
1679         fiz2             = _mm256_setzero_ps();
1680
1681         /* Start inner kernel loop */
1682         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1683         {
1684
1685             /* Get j neighbor index, and coordinate index */
1686             jnrA             = jjnr[jidx];
1687             jnrB             = jjnr[jidx+1];
1688             jnrC             = jjnr[jidx+2];
1689             jnrD             = jjnr[jidx+3];
1690             jnrE             = jjnr[jidx+4];
1691             jnrF             = jjnr[jidx+5];
1692             jnrG             = jjnr[jidx+6];
1693             jnrH             = jjnr[jidx+7];
1694             j_coord_offsetA  = DIM*jnrA;
1695             j_coord_offsetB  = DIM*jnrB;
1696             j_coord_offsetC  = DIM*jnrC;
1697             j_coord_offsetD  = DIM*jnrD;
1698             j_coord_offsetE  = DIM*jnrE;
1699             j_coord_offsetF  = DIM*jnrF;
1700             j_coord_offsetG  = DIM*jnrG;
1701             j_coord_offsetH  = DIM*jnrH;
1702
1703             /* load j atom coordinates */
1704             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1705                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1706                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1707                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1708                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1709
1710             /* Calculate displacement vector */
1711             dx00             = _mm256_sub_ps(ix0,jx0);
1712             dy00             = _mm256_sub_ps(iy0,jy0);
1713             dz00             = _mm256_sub_ps(iz0,jz0);
1714             dx01             = _mm256_sub_ps(ix0,jx1);
1715             dy01             = _mm256_sub_ps(iy0,jy1);
1716             dz01             = _mm256_sub_ps(iz0,jz1);
1717             dx02             = _mm256_sub_ps(ix0,jx2);
1718             dy02             = _mm256_sub_ps(iy0,jy2);
1719             dz02             = _mm256_sub_ps(iz0,jz2);
1720             dx10             = _mm256_sub_ps(ix1,jx0);
1721             dy10             = _mm256_sub_ps(iy1,jy0);
1722             dz10             = _mm256_sub_ps(iz1,jz0);
1723             dx11             = _mm256_sub_ps(ix1,jx1);
1724             dy11             = _mm256_sub_ps(iy1,jy1);
1725             dz11             = _mm256_sub_ps(iz1,jz1);
1726             dx12             = _mm256_sub_ps(ix1,jx2);
1727             dy12             = _mm256_sub_ps(iy1,jy2);
1728             dz12             = _mm256_sub_ps(iz1,jz2);
1729             dx20             = _mm256_sub_ps(ix2,jx0);
1730             dy20             = _mm256_sub_ps(iy2,jy0);
1731             dz20             = _mm256_sub_ps(iz2,jz0);
1732             dx21             = _mm256_sub_ps(ix2,jx1);
1733             dy21             = _mm256_sub_ps(iy2,jy1);
1734             dz21             = _mm256_sub_ps(iz2,jz1);
1735             dx22             = _mm256_sub_ps(ix2,jx2);
1736             dy22             = _mm256_sub_ps(iy2,jy2);
1737             dz22             = _mm256_sub_ps(iz2,jz2);
1738
1739             /* Calculate squared distance and things based on it */
1740             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1741             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1742             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1743             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1744             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1745             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1746             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1747             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1748             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1749
1750             rinv00           = avx256_invsqrt_f(rsq00);
1751             rinv01           = avx256_invsqrt_f(rsq01);
1752             rinv02           = avx256_invsqrt_f(rsq02);
1753             rinv10           = avx256_invsqrt_f(rsq10);
1754             rinv11           = avx256_invsqrt_f(rsq11);
1755             rinv12           = avx256_invsqrt_f(rsq12);
1756             rinv20           = avx256_invsqrt_f(rsq20);
1757             rinv21           = avx256_invsqrt_f(rsq21);
1758             rinv22           = avx256_invsqrt_f(rsq22);
1759
1760             fjx0             = _mm256_setzero_ps();
1761             fjy0             = _mm256_setzero_ps();
1762             fjz0             = _mm256_setzero_ps();
1763             fjx1             = _mm256_setzero_ps();
1764             fjy1             = _mm256_setzero_ps();
1765             fjz1             = _mm256_setzero_ps();
1766             fjx2             = _mm256_setzero_ps();
1767             fjy2             = _mm256_setzero_ps();
1768             fjz2             = _mm256_setzero_ps();
1769
1770             /**************************
1771              * CALCULATE INTERACTIONS *
1772              **************************/
1773
1774             r00              = _mm256_mul_ps(rsq00,rinv00);
1775
1776             /* Calculate table index by multiplying r with table scale and truncate to integer */
1777             rt               = _mm256_mul_ps(r00,vftabscale);
1778             vfitab           = _mm256_cvttps_epi32(rt);
1779             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1780             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1781             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1782             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1783             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1784             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1785
1786             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1787             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1788                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1789             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1790                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1791             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1792                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1793             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1794                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1795             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1796             Heps             = _mm256_mul_ps(vfeps,H);
1797             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1798             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1799             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
1800
1801             /* CUBIC SPLINE TABLE DISPERSION */
1802             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1803             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1804             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1805                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1806             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1807                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1808             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1809                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1810             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1811                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1812             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1813             Heps             = _mm256_mul_ps(vfeps,H);
1814             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1815             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1816             fvdw6            = _mm256_mul_ps(c6_00,FF);
1817
1818             /* CUBIC SPLINE TABLE REPULSION */
1819             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1820             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1821             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1822                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1823             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1824                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1825             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1826                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1827             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1828                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1829             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1830             Heps             = _mm256_mul_ps(vfeps,H);
1831             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1832             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1833             fvdw12           = _mm256_mul_ps(c12_00,FF);
1834             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1835
1836             fscal            = _mm256_add_ps(felec,fvdw);
1837
1838             /* Calculate temporary vectorial force */
1839             tx               = _mm256_mul_ps(fscal,dx00);
1840             ty               = _mm256_mul_ps(fscal,dy00);
1841             tz               = _mm256_mul_ps(fscal,dz00);
1842
1843             /* Update vectorial force */
1844             fix0             = _mm256_add_ps(fix0,tx);
1845             fiy0             = _mm256_add_ps(fiy0,ty);
1846             fiz0             = _mm256_add_ps(fiz0,tz);
1847
1848             fjx0             = _mm256_add_ps(fjx0,tx);
1849             fjy0             = _mm256_add_ps(fjy0,ty);
1850             fjz0             = _mm256_add_ps(fjz0,tz);
1851
1852             /**************************
1853              * CALCULATE INTERACTIONS *
1854              **************************/
1855
1856             r01              = _mm256_mul_ps(rsq01,rinv01);
1857
1858             /* Calculate table index by multiplying r with table scale and truncate to integer */
1859             rt               = _mm256_mul_ps(r01,vftabscale);
1860             vfitab           = _mm256_cvttps_epi32(rt);
1861             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1862             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1863             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1864             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1865             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1866             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1867
1868             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1869             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1870                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1871             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1872                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1873             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1874                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1875             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1876                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1877             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1878             Heps             = _mm256_mul_ps(vfeps,H);
1879             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1880             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1881             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1882
1883             fscal            = felec;
1884
1885             /* Calculate temporary vectorial force */
1886             tx               = _mm256_mul_ps(fscal,dx01);
1887             ty               = _mm256_mul_ps(fscal,dy01);
1888             tz               = _mm256_mul_ps(fscal,dz01);
1889
1890             /* Update vectorial force */
1891             fix0             = _mm256_add_ps(fix0,tx);
1892             fiy0             = _mm256_add_ps(fiy0,ty);
1893             fiz0             = _mm256_add_ps(fiz0,tz);
1894
1895             fjx1             = _mm256_add_ps(fjx1,tx);
1896             fjy1             = _mm256_add_ps(fjy1,ty);
1897             fjz1             = _mm256_add_ps(fjz1,tz);
1898
1899             /**************************
1900              * CALCULATE INTERACTIONS *
1901              **************************/
1902
1903             r02              = _mm256_mul_ps(rsq02,rinv02);
1904
1905             /* Calculate table index by multiplying r with table scale and truncate to integer */
1906             rt               = _mm256_mul_ps(r02,vftabscale);
1907             vfitab           = _mm256_cvttps_epi32(rt);
1908             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1909             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1910             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1911             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1912             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1913             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1914
1915             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1916             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1917                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1918             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1919                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1920             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1921                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1922             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1923                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1924             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1925             Heps             = _mm256_mul_ps(vfeps,H);
1926             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1927             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1928             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1929
1930             fscal            = felec;
1931
1932             /* Calculate temporary vectorial force */
1933             tx               = _mm256_mul_ps(fscal,dx02);
1934             ty               = _mm256_mul_ps(fscal,dy02);
1935             tz               = _mm256_mul_ps(fscal,dz02);
1936
1937             /* Update vectorial force */
1938             fix0             = _mm256_add_ps(fix0,tx);
1939             fiy0             = _mm256_add_ps(fiy0,ty);
1940             fiz0             = _mm256_add_ps(fiz0,tz);
1941
1942             fjx2             = _mm256_add_ps(fjx2,tx);
1943             fjy2             = _mm256_add_ps(fjy2,ty);
1944             fjz2             = _mm256_add_ps(fjz2,tz);
1945
1946             /**************************
1947              * CALCULATE INTERACTIONS *
1948              **************************/
1949
1950             r10              = _mm256_mul_ps(rsq10,rinv10);
1951
1952             /* Calculate table index by multiplying r with table scale and truncate to integer */
1953             rt               = _mm256_mul_ps(r10,vftabscale);
1954             vfitab           = _mm256_cvttps_epi32(rt);
1955             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1956             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1957             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1958             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1959             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1960             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1961
1962             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1963             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1964                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1965             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1966                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1967             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1968                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1969             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1970                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1971             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1972             Heps             = _mm256_mul_ps(vfeps,H);
1973             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1974             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1975             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1976
1977             fscal            = felec;
1978
1979             /* Calculate temporary vectorial force */
1980             tx               = _mm256_mul_ps(fscal,dx10);
1981             ty               = _mm256_mul_ps(fscal,dy10);
1982             tz               = _mm256_mul_ps(fscal,dz10);
1983
1984             /* Update vectorial force */
1985             fix1             = _mm256_add_ps(fix1,tx);
1986             fiy1             = _mm256_add_ps(fiy1,ty);
1987             fiz1             = _mm256_add_ps(fiz1,tz);
1988
1989             fjx0             = _mm256_add_ps(fjx0,tx);
1990             fjy0             = _mm256_add_ps(fjy0,ty);
1991             fjz0             = _mm256_add_ps(fjz0,tz);
1992
1993             /**************************
1994              * CALCULATE INTERACTIONS *
1995              **************************/
1996
1997             r11              = _mm256_mul_ps(rsq11,rinv11);
1998
1999             /* Calculate table index by multiplying r with table scale and truncate to integer */
2000             rt               = _mm256_mul_ps(r11,vftabscale);
2001             vfitab           = _mm256_cvttps_epi32(rt);
2002             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2003             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2004             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2005             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2006             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2007             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2008
2009             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2010             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2011                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2012             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2013                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2014             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2015                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2016             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2017                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2018             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2019             Heps             = _mm256_mul_ps(vfeps,H);
2020             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2021             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2022             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2023
2024             fscal            = felec;
2025
2026             /* Calculate temporary vectorial force */
2027             tx               = _mm256_mul_ps(fscal,dx11);
2028             ty               = _mm256_mul_ps(fscal,dy11);
2029             tz               = _mm256_mul_ps(fscal,dz11);
2030
2031             /* Update vectorial force */
2032             fix1             = _mm256_add_ps(fix1,tx);
2033             fiy1             = _mm256_add_ps(fiy1,ty);
2034             fiz1             = _mm256_add_ps(fiz1,tz);
2035
2036             fjx1             = _mm256_add_ps(fjx1,tx);
2037             fjy1             = _mm256_add_ps(fjy1,ty);
2038             fjz1             = _mm256_add_ps(fjz1,tz);
2039
2040             /**************************
2041              * CALCULATE INTERACTIONS *
2042              **************************/
2043
2044             r12              = _mm256_mul_ps(rsq12,rinv12);
2045
2046             /* Calculate table index by multiplying r with table scale and truncate to integer */
2047             rt               = _mm256_mul_ps(r12,vftabscale);
2048             vfitab           = _mm256_cvttps_epi32(rt);
2049             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2050             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2051             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2052             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2053             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2054             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2055
2056             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2057             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2058                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2059             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2060                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2061             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2062                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2063             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2064                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2065             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2066             Heps             = _mm256_mul_ps(vfeps,H);
2067             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2068             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2069             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2070
2071             fscal            = felec;
2072
2073             /* Calculate temporary vectorial force */
2074             tx               = _mm256_mul_ps(fscal,dx12);
2075             ty               = _mm256_mul_ps(fscal,dy12);
2076             tz               = _mm256_mul_ps(fscal,dz12);
2077
2078             /* Update vectorial force */
2079             fix1             = _mm256_add_ps(fix1,tx);
2080             fiy1             = _mm256_add_ps(fiy1,ty);
2081             fiz1             = _mm256_add_ps(fiz1,tz);
2082
2083             fjx2             = _mm256_add_ps(fjx2,tx);
2084             fjy2             = _mm256_add_ps(fjy2,ty);
2085             fjz2             = _mm256_add_ps(fjz2,tz);
2086
2087             /**************************
2088              * CALCULATE INTERACTIONS *
2089              **************************/
2090
2091             r20              = _mm256_mul_ps(rsq20,rinv20);
2092
2093             /* Calculate table index by multiplying r with table scale and truncate to integer */
2094             rt               = _mm256_mul_ps(r20,vftabscale);
2095             vfitab           = _mm256_cvttps_epi32(rt);
2096             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2097             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2098             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2099             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2100             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2101             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2102
2103             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2104             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2105                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2106             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2107                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2108             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2109                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2110             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2111                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2112             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2113             Heps             = _mm256_mul_ps(vfeps,H);
2114             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2115             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2116             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2117
2118             fscal            = felec;
2119
2120             /* Calculate temporary vectorial force */
2121             tx               = _mm256_mul_ps(fscal,dx20);
2122             ty               = _mm256_mul_ps(fscal,dy20);
2123             tz               = _mm256_mul_ps(fscal,dz20);
2124
2125             /* Update vectorial force */
2126             fix2             = _mm256_add_ps(fix2,tx);
2127             fiy2             = _mm256_add_ps(fiy2,ty);
2128             fiz2             = _mm256_add_ps(fiz2,tz);
2129
2130             fjx0             = _mm256_add_ps(fjx0,tx);
2131             fjy0             = _mm256_add_ps(fjy0,ty);
2132             fjz0             = _mm256_add_ps(fjz0,tz);
2133
2134             /**************************
2135              * CALCULATE INTERACTIONS *
2136              **************************/
2137
2138             r21              = _mm256_mul_ps(rsq21,rinv21);
2139
2140             /* Calculate table index by multiplying r with table scale and truncate to integer */
2141             rt               = _mm256_mul_ps(r21,vftabscale);
2142             vfitab           = _mm256_cvttps_epi32(rt);
2143             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2144             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2145             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2146             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2147             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2148             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2149
2150             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2151             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2152                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2153             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2154                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2155             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2156                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2157             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2158                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2159             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2160             Heps             = _mm256_mul_ps(vfeps,H);
2161             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2162             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2163             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2164
2165             fscal            = felec;
2166
2167             /* Calculate temporary vectorial force */
2168             tx               = _mm256_mul_ps(fscal,dx21);
2169             ty               = _mm256_mul_ps(fscal,dy21);
2170             tz               = _mm256_mul_ps(fscal,dz21);
2171
2172             /* Update vectorial force */
2173             fix2             = _mm256_add_ps(fix2,tx);
2174             fiy2             = _mm256_add_ps(fiy2,ty);
2175             fiz2             = _mm256_add_ps(fiz2,tz);
2176
2177             fjx1             = _mm256_add_ps(fjx1,tx);
2178             fjy1             = _mm256_add_ps(fjy1,ty);
2179             fjz1             = _mm256_add_ps(fjz1,tz);
2180
2181             /**************************
2182              * CALCULATE INTERACTIONS *
2183              **************************/
2184
2185             r22              = _mm256_mul_ps(rsq22,rinv22);
2186
2187             /* Calculate table index by multiplying r with table scale and truncate to integer */
2188             rt               = _mm256_mul_ps(r22,vftabscale);
2189             vfitab           = _mm256_cvttps_epi32(rt);
2190             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2191             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2192             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2193             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2194             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2195             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2196
2197             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2198             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2199                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2200             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2201                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2202             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2203                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2204             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2205                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2206             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2207             Heps             = _mm256_mul_ps(vfeps,H);
2208             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2209             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2210             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2211
2212             fscal            = felec;
2213
2214             /* Calculate temporary vectorial force */
2215             tx               = _mm256_mul_ps(fscal,dx22);
2216             ty               = _mm256_mul_ps(fscal,dy22);
2217             tz               = _mm256_mul_ps(fscal,dz22);
2218
2219             /* Update vectorial force */
2220             fix2             = _mm256_add_ps(fix2,tx);
2221             fiy2             = _mm256_add_ps(fiy2,ty);
2222             fiz2             = _mm256_add_ps(fiz2,tz);
2223
2224             fjx2             = _mm256_add_ps(fjx2,tx);
2225             fjy2             = _mm256_add_ps(fjy2,ty);
2226             fjz2             = _mm256_add_ps(fjz2,tz);
2227
2228             fjptrA             = f+j_coord_offsetA;
2229             fjptrB             = f+j_coord_offsetB;
2230             fjptrC             = f+j_coord_offsetC;
2231             fjptrD             = f+j_coord_offsetD;
2232             fjptrE             = f+j_coord_offsetE;
2233             fjptrF             = f+j_coord_offsetF;
2234             fjptrG             = f+j_coord_offsetG;
2235             fjptrH             = f+j_coord_offsetH;
2236
2237             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2238                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2239
2240             /* Inner loop uses 373 flops */
2241         }
2242
2243         if(jidx<j_index_end)
2244         {
2245
2246             /* Get j neighbor index, and coordinate index */
2247             jnrlistA         = jjnr[jidx];
2248             jnrlistB         = jjnr[jidx+1];
2249             jnrlistC         = jjnr[jidx+2];
2250             jnrlistD         = jjnr[jidx+3];
2251             jnrlistE         = jjnr[jidx+4];
2252             jnrlistF         = jjnr[jidx+5];
2253             jnrlistG         = jjnr[jidx+6];
2254             jnrlistH         = jjnr[jidx+7];
2255             /* Entries of jjnr are negative for non-real (padding) atoms.
2256              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2257              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2258              */
2259             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2260                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2261                                             
2262             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2263             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2264             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2265             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2266             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
2267             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
2268             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
2269             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
2270             j_coord_offsetA  = DIM*jnrA;
2271             j_coord_offsetB  = DIM*jnrB;
2272             j_coord_offsetC  = DIM*jnrC;
2273             j_coord_offsetD  = DIM*jnrD;
2274             j_coord_offsetE  = DIM*jnrE;
2275             j_coord_offsetF  = DIM*jnrF;
2276             j_coord_offsetG  = DIM*jnrG;
2277             j_coord_offsetH  = DIM*jnrH;
2278
2279             /* load j atom coordinates */
2280             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2281                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2282                                                  x+j_coord_offsetE,x+j_coord_offsetF,
2283                                                  x+j_coord_offsetG,x+j_coord_offsetH,
2284                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2285
2286             /* Calculate displacement vector */
2287             dx00             = _mm256_sub_ps(ix0,jx0);
2288             dy00             = _mm256_sub_ps(iy0,jy0);
2289             dz00             = _mm256_sub_ps(iz0,jz0);
2290             dx01             = _mm256_sub_ps(ix0,jx1);
2291             dy01             = _mm256_sub_ps(iy0,jy1);
2292             dz01             = _mm256_sub_ps(iz0,jz1);
2293             dx02             = _mm256_sub_ps(ix0,jx2);
2294             dy02             = _mm256_sub_ps(iy0,jy2);
2295             dz02             = _mm256_sub_ps(iz0,jz2);
2296             dx10             = _mm256_sub_ps(ix1,jx0);
2297             dy10             = _mm256_sub_ps(iy1,jy0);
2298             dz10             = _mm256_sub_ps(iz1,jz0);
2299             dx11             = _mm256_sub_ps(ix1,jx1);
2300             dy11             = _mm256_sub_ps(iy1,jy1);
2301             dz11             = _mm256_sub_ps(iz1,jz1);
2302             dx12             = _mm256_sub_ps(ix1,jx2);
2303             dy12             = _mm256_sub_ps(iy1,jy2);
2304             dz12             = _mm256_sub_ps(iz1,jz2);
2305             dx20             = _mm256_sub_ps(ix2,jx0);
2306             dy20             = _mm256_sub_ps(iy2,jy0);
2307             dz20             = _mm256_sub_ps(iz2,jz0);
2308             dx21             = _mm256_sub_ps(ix2,jx1);
2309             dy21             = _mm256_sub_ps(iy2,jy1);
2310             dz21             = _mm256_sub_ps(iz2,jz1);
2311             dx22             = _mm256_sub_ps(ix2,jx2);
2312             dy22             = _mm256_sub_ps(iy2,jy2);
2313             dz22             = _mm256_sub_ps(iz2,jz2);
2314
2315             /* Calculate squared distance and things based on it */
2316             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2317             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2318             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2319             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2320             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2321             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2322             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2323             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2324             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2325
2326             rinv00           = avx256_invsqrt_f(rsq00);
2327             rinv01           = avx256_invsqrt_f(rsq01);
2328             rinv02           = avx256_invsqrt_f(rsq02);
2329             rinv10           = avx256_invsqrt_f(rsq10);
2330             rinv11           = avx256_invsqrt_f(rsq11);
2331             rinv12           = avx256_invsqrt_f(rsq12);
2332             rinv20           = avx256_invsqrt_f(rsq20);
2333             rinv21           = avx256_invsqrt_f(rsq21);
2334             rinv22           = avx256_invsqrt_f(rsq22);
2335
2336             fjx0             = _mm256_setzero_ps();
2337             fjy0             = _mm256_setzero_ps();
2338             fjz0             = _mm256_setzero_ps();
2339             fjx1             = _mm256_setzero_ps();
2340             fjy1             = _mm256_setzero_ps();
2341             fjz1             = _mm256_setzero_ps();
2342             fjx2             = _mm256_setzero_ps();
2343             fjy2             = _mm256_setzero_ps();
2344             fjz2             = _mm256_setzero_ps();
2345
2346             /**************************
2347              * CALCULATE INTERACTIONS *
2348              **************************/
2349
2350             r00              = _mm256_mul_ps(rsq00,rinv00);
2351             r00              = _mm256_andnot_ps(dummy_mask,r00);
2352
2353             /* Calculate table index by multiplying r with table scale and truncate to integer */
2354             rt               = _mm256_mul_ps(r00,vftabscale);
2355             vfitab           = _mm256_cvttps_epi32(rt);
2356             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2357             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2358             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2359             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2360             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2361             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2362
2363             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2364             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2365                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2366             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2367                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2368             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2369                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2370             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2371                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2372             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2373             Heps             = _mm256_mul_ps(vfeps,H);
2374             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2375             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2376             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
2377
2378             /* CUBIC SPLINE TABLE DISPERSION */
2379             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
2380             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
2381             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2382                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2383             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2384                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2385             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2386                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2387             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2388                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2389             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2390             Heps             = _mm256_mul_ps(vfeps,H);
2391             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2392             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2393             fvdw6            = _mm256_mul_ps(c6_00,FF);
2394
2395             /* CUBIC SPLINE TABLE REPULSION */
2396             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
2397             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
2398             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2399                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2400             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2401                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2402             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2403                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2404             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2405                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2406             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2407             Heps             = _mm256_mul_ps(vfeps,H);
2408             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2409             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2410             fvdw12           = _mm256_mul_ps(c12_00,FF);
2411             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
2412
2413             fscal            = _mm256_add_ps(felec,fvdw);
2414
2415             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2416
2417             /* Calculate temporary vectorial force */
2418             tx               = _mm256_mul_ps(fscal,dx00);
2419             ty               = _mm256_mul_ps(fscal,dy00);
2420             tz               = _mm256_mul_ps(fscal,dz00);
2421
2422             /* Update vectorial force */
2423             fix0             = _mm256_add_ps(fix0,tx);
2424             fiy0             = _mm256_add_ps(fiy0,ty);
2425             fiz0             = _mm256_add_ps(fiz0,tz);
2426
2427             fjx0             = _mm256_add_ps(fjx0,tx);
2428             fjy0             = _mm256_add_ps(fjy0,ty);
2429             fjz0             = _mm256_add_ps(fjz0,tz);
2430
2431             /**************************
2432              * CALCULATE INTERACTIONS *
2433              **************************/
2434
2435             r01              = _mm256_mul_ps(rsq01,rinv01);
2436             r01              = _mm256_andnot_ps(dummy_mask,r01);
2437
2438             /* Calculate table index by multiplying r with table scale and truncate to integer */
2439             rt               = _mm256_mul_ps(r01,vftabscale);
2440             vfitab           = _mm256_cvttps_epi32(rt);
2441             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2442             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2443             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2444             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2445             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2446             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2447
2448             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2449             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2450                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2451             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2452                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2453             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2454                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2455             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2456                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2457             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2458             Heps             = _mm256_mul_ps(vfeps,H);
2459             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2460             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2461             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
2462
2463             fscal            = felec;
2464
2465             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2466
2467             /* Calculate temporary vectorial force */
2468             tx               = _mm256_mul_ps(fscal,dx01);
2469             ty               = _mm256_mul_ps(fscal,dy01);
2470             tz               = _mm256_mul_ps(fscal,dz01);
2471
2472             /* Update vectorial force */
2473             fix0             = _mm256_add_ps(fix0,tx);
2474             fiy0             = _mm256_add_ps(fiy0,ty);
2475             fiz0             = _mm256_add_ps(fiz0,tz);
2476
2477             fjx1             = _mm256_add_ps(fjx1,tx);
2478             fjy1             = _mm256_add_ps(fjy1,ty);
2479             fjz1             = _mm256_add_ps(fjz1,tz);
2480
2481             /**************************
2482              * CALCULATE INTERACTIONS *
2483              **************************/
2484
2485             r02              = _mm256_mul_ps(rsq02,rinv02);
2486             r02              = _mm256_andnot_ps(dummy_mask,r02);
2487
2488             /* Calculate table index by multiplying r with table scale and truncate to integer */
2489             rt               = _mm256_mul_ps(r02,vftabscale);
2490             vfitab           = _mm256_cvttps_epi32(rt);
2491             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2492             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2493             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2494             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2495             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2496             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2497
2498             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2499             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2500                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2501             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2502                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2503             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2504                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2505             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2506                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2507             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2508             Heps             = _mm256_mul_ps(vfeps,H);
2509             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2510             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2511             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
2512
2513             fscal            = felec;
2514
2515             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2516
2517             /* Calculate temporary vectorial force */
2518             tx               = _mm256_mul_ps(fscal,dx02);
2519             ty               = _mm256_mul_ps(fscal,dy02);
2520             tz               = _mm256_mul_ps(fscal,dz02);
2521
2522             /* Update vectorial force */
2523             fix0             = _mm256_add_ps(fix0,tx);
2524             fiy0             = _mm256_add_ps(fiy0,ty);
2525             fiz0             = _mm256_add_ps(fiz0,tz);
2526
2527             fjx2             = _mm256_add_ps(fjx2,tx);
2528             fjy2             = _mm256_add_ps(fjy2,ty);
2529             fjz2             = _mm256_add_ps(fjz2,tz);
2530
2531             /**************************
2532              * CALCULATE INTERACTIONS *
2533              **************************/
2534
2535             r10              = _mm256_mul_ps(rsq10,rinv10);
2536             r10              = _mm256_andnot_ps(dummy_mask,r10);
2537
2538             /* Calculate table index by multiplying r with table scale and truncate to integer */
2539             rt               = _mm256_mul_ps(r10,vftabscale);
2540             vfitab           = _mm256_cvttps_epi32(rt);
2541             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2542             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2543             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2544             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2545             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2546             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2547
2548             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2549             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2550                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2551             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2552                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2553             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2554                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2555             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2556                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2557             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2558             Heps             = _mm256_mul_ps(vfeps,H);
2559             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2560             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2561             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
2562
2563             fscal            = felec;
2564
2565             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2566
2567             /* Calculate temporary vectorial force */
2568             tx               = _mm256_mul_ps(fscal,dx10);
2569             ty               = _mm256_mul_ps(fscal,dy10);
2570             tz               = _mm256_mul_ps(fscal,dz10);
2571
2572             /* Update vectorial force */
2573             fix1             = _mm256_add_ps(fix1,tx);
2574             fiy1             = _mm256_add_ps(fiy1,ty);
2575             fiz1             = _mm256_add_ps(fiz1,tz);
2576
2577             fjx0             = _mm256_add_ps(fjx0,tx);
2578             fjy0             = _mm256_add_ps(fjy0,ty);
2579             fjz0             = _mm256_add_ps(fjz0,tz);
2580
2581             /**************************
2582              * CALCULATE INTERACTIONS *
2583              **************************/
2584
2585             r11              = _mm256_mul_ps(rsq11,rinv11);
2586             r11              = _mm256_andnot_ps(dummy_mask,r11);
2587
2588             /* Calculate table index by multiplying r with table scale and truncate to integer */
2589             rt               = _mm256_mul_ps(r11,vftabscale);
2590             vfitab           = _mm256_cvttps_epi32(rt);
2591             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2592             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2593             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2594             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2595             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2596             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2597
2598             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2599             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2600                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2601             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2602                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2603             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2604                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2605             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2606                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2607             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2608             Heps             = _mm256_mul_ps(vfeps,H);
2609             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2610             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2611             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2612
2613             fscal            = felec;
2614
2615             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2616
2617             /* Calculate temporary vectorial force */
2618             tx               = _mm256_mul_ps(fscal,dx11);
2619             ty               = _mm256_mul_ps(fscal,dy11);
2620             tz               = _mm256_mul_ps(fscal,dz11);
2621
2622             /* Update vectorial force */
2623             fix1             = _mm256_add_ps(fix1,tx);
2624             fiy1             = _mm256_add_ps(fiy1,ty);
2625             fiz1             = _mm256_add_ps(fiz1,tz);
2626
2627             fjx1             = _mm256_add_ps(fjx1,tx);
2628             fjy1             = _mm256_add_ps(fjy1,ty);
2629             fjz1             = _mm256_add_ps(fjz1,tz);
2630
2631             /**************************
2632              * CALCULATE INTERACTIONS *
2633              **************************/
2634
2635             r12              = _mm256_mul_ps(rsq12,rinv12);
2636             r12              = _mm256_andnot_ps(dummy_mask,r12);
2637
2638             /* Calculate table index by multiplying r with table scale and truncate to integer */
2639             rt               = _mm256_mul_ps(r12,vftabscale);
2640             vfitab           = _mm256_cvttps_epi32(rt);
2641             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2642             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2643             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2644             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2645             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2646             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2647
2648             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2649             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2650                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2651             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2652                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2653             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2654                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2655             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2656                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2657             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2658             Heps             = _mm256_mul_ps(vfeps,H);
2659             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2660             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2661             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2662
2663             fscal            = felec;
2664
2665             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2666
2667             /* Calculate temporary vectorial force */
2668             tx               = _mm256_mul_ps(fscal,dx12);
2669             ty               = _mm256_mul_ps(fscal,dy12);
2670             tz               = _mm256_mul_ps(fscal,dz12);
2671
2672             /* Update vectorial force */
2673             fix1             = _mm256_add_ps(fix1,tx);
2674             fiy1             = _mm256_add_ps(fiy1,ty);
2675             fiz1             = _mm256_add_ps(fiz1,tz);
2676
2677             fjx2             = _mm256_add_ps(fjx2,tx);
2678             fjy2             = _mm256_add_ps(fjy2,ty);
2679             fjz2             = _mm256_add_ps(fjz2,tz);
2680
2681             /**************************
2682              * CALCULATE INTERACTIONS *
2683              **************************/
2684
2685             r20              = _mm256_mul_ps(rsq20,rinv20);
2686             r20              = _mm256_andnot_ps(dummy_mask,r20);
2687
2688             /* Calculate table index by multiplying r with table scale and truncate to integer */
2689             rt               = _mm256_mul_ps(r20,vftabscale);
2690             vfitab           = _mm256_cvttps_epi32(rt);
2691             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2692             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2693             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2694             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2695             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2696             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2697
2698             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2699             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2700                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2701             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2702                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2703             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2704                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2705             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2706                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2707             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2708             Heps             = _mm256_mul_ps(vfeps,H);
2709             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2710             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2711             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2712
2713             fscal            = felec;
2714
2715             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2716
2717             /* Calculate temporary vectorial force */
2718             tx               = _mm256_mul_ps(fscal,dx20);
2719             ty               = _mm256_mul_ps(fscal,dy20);
2720             tz               = _mm256_mul_ps(fscal,dz20);
2721
2722             /* Update vectorial force */
2723             fix2             = _mm256_add_ps(fix2,tx);
2724             fiy2             = _mm256_add_ps(fiy2,ty);
2725             fiz2             = _mm256_add_ps(fiz2,tz);
2726
2727             fjx0             = _mm256_add_ps(fjx0,tx);
2728             fjy0             = _mm256_add_ps(fjy0,ty);
2729             fjz0             = _mm256_add_ps(fjz0,tz);
2730
2731             /**************************
2732              * CALCULATE INTERACTIONS *
2733              **************************/
2734
2735             r21              = _mm256_mul_ps(rsq21,rinv21);
2736             r21              = _mm256_andnot_ps(dummy_mask,r21);
2737
2738             /* Calculate table index by multiplying r with table scale and truncate to integer */
2739             rt               = _mm256_mul_ps(r21,vftabscale);
2740             vfitab           = _mm256_cvttps_epi32(rt);
2741             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2742             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2743             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2744             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2745             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2746             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2747
2748             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2749             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2750                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2751             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2752                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2753             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2754                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2755             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2756                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2757             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2758             Heps             = _mm256_mul_ps(vfeps,H);
2759             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2760             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2761             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2762
2763             fscal            = felec;
2764
2765             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2766
2767             /* Calculate temporary vectorial force */
2768             tx               = _mm256_mul_ps(fscal,dx21);
2769             ty               = _mm256_mul_ps(fscal,dy21);
2770             tz               = _mm256_mul_ps(fscal,dz21);
2771
2772             /* Update vectorial force */
2773             fix2             = _mm256_add_ps(fix2,tx);
2774             fiy2             = _mm256_add_ps(fiy2,ty);
2775             fiz2             = _mm256_add_ps(fiz2,tz);
2776
2777             fjx1             = _mm256_add_ps(fjx1,tx);
2778             fjy1             = _mm256_add_ps(fjy1,ty);
2779             fjz1             = _mm256_add_ps(fjz1,tz);
2780
2781             /**************************
2782              * CALCULATE INTERACTIONS *
2783              **************************/
2784
2785             r22              = _mm256_mul_ps(rsq22,rinv22);
2786             r22              = _mm256_andnot_ps(dummy_mask,r22);
2787
2788             /* Calculate table index by multiplying r with table scale and truncate to integer */
2789             rt               = _mm256_mul_ps(r22,vftabscale);
2790             vfitab           = _mm256_cvttps_epi32(rt);
2791             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2792             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2793             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2794             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2795             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2796             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2797
2798             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2799             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2800                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2801             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2802                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2803             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2804                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2805             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2806                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2807             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2808             Heps             = _mm256_mul_ps(vfeps,H);
2809             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2810             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2811             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2812
2813             fscal            = felec;
2814
2815             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2816
2817             /* Calculate temporary vectorial force */
2818             tx               = _mm256_mul_ps(fscal,dx22);
2819             ty               = _mm256_mul_ps(fscal,dy22);
2820             tz               = _mm256_mul_ps(fscal,dz22);
2821
2822             /* Update vectorial force */
2823             fix2             = _mm256_add_ps(fix2,tx);
2824             fiy2             = _mm256_add_ps(fiy2,ty);
2825             fiz2             = _mm256_add_ps(fiz2,tz);
2826
2827             fjx2             = _mm256_add_ps(fjx2,tx);
2828             fjy2             = _mm256_add_ps(fjy2,ty);
2829             fjz2             = _mm256_add_ps(fjz2,tz);
2830
2831             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2832             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2833             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2834             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2835             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2836             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2837             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2838             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2839
2840             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2841                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2842
2843             /* Inner loop uses 382 flops */
2844         }
2845
2846         /* End of innermost loop */
2847
2848         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2849                                                  f+i_coord_offset,fshift+i_shift_offset);
2850
2851         /* Increment number of inner iterations */
2852         inneriter                  += j_index_end - j_index_start;
2853
2854         /* Outer loop uses 18 flops */
2855     }
2856
2857     /* Increment number of outer iterations */
2858     outeriter        += nri;
2859
2860     /* Update outer/inner flops */
2861
2862     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*382);
2863 }