Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_256_single
52  * Electrostatics interaction: CubicSplineTable
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_256_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. for the eight different
70      * jnr indices corresponding to data put in the eight positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrE,jnrF,jnrG,jnrH;
76     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
81     real             rcutoff_scalar;
82     real             *shiftvec,*fshift,*x,*f;
83     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
84     real             scratch[4*DIM];
85     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86     real *           vdwioffsetptr0;
87     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88     real *           vdwioffsetptr1;
89     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92     real *           vdwioffsetptr3;
93     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
95     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
97     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
99     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
101     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
102     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
103     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
104     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
105     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
106     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
107     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
108     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
109     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
110     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
111     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
112     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
113     real             *charge;
114     int              nvdwtype;
115     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
116     int              *vdwtype;
117     real             *vdwparam;
118     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
119     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
120     __m256i          vfitab;
121     __m128i          vfitab_lo,vfitab_hi;
122     __m128i          ifour       = _mm_set1_epi32(4);
123     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
124     real             *vftab;
125     __m256           dummy_mask,cutoff_mask;
126     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
127     __m256           one     = _mm256_set1_ps(1.0);
128     __m256           two     = _mm256_set1_ps(2.0);
129     x                = xx[0];
130     f                = ff[0];
131
132     nri              = nlist->nri;
133     iinr             = nlist->iinr;
134     jindex           = nlist->jindex;
135     jjnr             = nlist->jjnr;
136     shiftidx         = nlist->shift;
137     gid              = nlist->gid;
138     shiftvec         = fr->shift_vec[0];
139     fshift           = fr->fshift[0];
140     facel            = _mm256_set1_ps(fr->epsfac);
141     charge           = mdatoms->chargeA;
142     nvdwtype         = fr->ntype;
143     vdwparam         = fr->nbfp;
144     vdwtype          = mdatoms->typeA;
145
146     vftab            = kernel_data->table_elec->data;
147     vftabscale       = _mm256_set1_ps(kernel_data->table_elec->scale);
148
149     /* Setup water-specific parameters */
150     inr              = nlist->iinr[0];
151     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
152     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
153     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
154     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
155
156     jq1              = _mm256_set1_ps(charge[inr+1]);
157     jq2              = _mm256_set1_ps(charge[inr+2]);
158     jq3              = _mm256_set1_ps(charge[inr+3]);
159     vdwjidx0A        = 2*vdwtype[inr+0];
160     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
161     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
162     qq11             = _mm256_mul_ps(iq1,jq1);
163     qq12             = _mm256_mul_ps(iq1,jq2);
164     qq13             = _mm256_mul_ps(iq1,jq3);
165     qq21             = _mm256_mul_ps(iq2,jq1);
166     qq22             = _mm256_mul_ps(iq2,jq2);
167     qq23             = _mm256_mul_ps(iq2,jq3);
168     qq31             = _mm256_mul_ps(iq3,jq1);
169     qq32             = _mm256_mul_ps(iq3,jq2);
170     qq33             = _mm256_mul_ps(iq3,jq3);
171
172     /* Avoid stupid compiler warnings */
173     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
174     j_coord_offsetA = 0;
175     j_coord_offsetB = 0;
176     j_coord_offsetC = 0;
177     j_coord_offsetD = 0;
178     j_coord_offsetE = 0;
179     j_coord_offsetF = 0;
180     j_coord_offsetG = 0;
181     j_coord_offsetH = 0;
182
183     outeriter        = 0;
184     inneriter        = 0;
185
186     for(iidx=0;iidx<4*DIM;iidx++)
187     {
188         scratch[iidx] = 0.0;
189     }
190
191     /* Start outer loop over neighborlists */
192     for(iidx=0; iidx<nri; iidx++)
193     {
194         /* Load shift vector for this list */
195         i_shift_offset   = DIM*shiftidx[iidx];
196
197         /* Load limits for loop over neighbors */
198         j_index_start    = jindex[iidx];
199         j_index_end      = jindex[iidx+1];
200
201         /* Get outer coordinate index */
202         inr              = iinr[iidx];
203         i_coord_offset   = DIM*inr;
204
205         /* Load i particle coords and add shift vector */
206         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
207                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
208
209         fix0             = _mm256_setzero_ps();
210         fiy0             = _mm256_setzero_ps();
211         fiz0             = _mm256_setzero_ps();
212         fix1             = _mm256_setzero_ps();
213         fiy1             = _mm256_setzero_ps();
214         fiz1             = _mm256_setzero_ps();
215         fix2             = _mm256_setzero_ps();
216         fiy2             = _mm256_setzero_ps();
217         fiz2             = _mm256_setzero_ps();
218         fix3             = _mm256_setzero_ps();
219         fiy3             = _mm256_setzero_ps();
220         fiz3             = _mm256_setzero_ps();
221
222         /* Reset potential sums */
223         velecsum         = _mm256_setzero_ps();
224         vvdwsum          = _mm256_setzero_ps();
225
226         /* Start inner kernel loop */
227         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
228         {
229
230             /* Get j neighbor index, and coordinate index */
231             jnrA             = jjnr[jidx];
232             jnrB             = jjnr[jidx+1];
233             jnrC             = jjnr[jidx+2];
234             jnrD             = jjnr[jidx+3];
235             jnrE             = jjnr[jidx+4];
236             jnrF             = jjnr[jidx+5];
237             jnrG             = jjnr[jidx+6];
238             jnrH             = jjnr[jidx+7];
239             j_coord_offsetA  = DIM*jnrA;
240             j_coord_offsetB  = DIM*jnrB;
241             j_coord_offsetC  = DIM*jnrC;
242             j_coord_offsetD  = DIM*jnrD;
243             j_coord_offsetE  = DIM*jnrE;
244             j_coord_offsetF  = DIM*jnrF;
245             j_coord_offsetG  = DIM*jnrG;
246             j_coord_offsetH  = DIM*jnrH;
247
248             /* load j atom coordinates */
249             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
250                                                  x+j_coord_offsetC,x+j_coord_offsetD,
251                                                  x+j_coord_offsetE,x+j_coord_offsetF,
252                                                  x+j_coord_offsetG,x+j_coord_offsetH,
253                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
254                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
255
256             /* Calculate displacement vector */
257             dx00             = _mm256_sub_ps(ix0,jx0);
258             dy00             = _mm256_sub_ps(iy0,jy0);
259             dz00             = _mm256_sub_ps(iz0,jz0);
260             dx11             = _mm256_sub_ps(ix1,jx1);
261             dy11             = _mm256_sub_ps(iy1,jy1);
262             dz11             = _mm256_sub_ps(iz1,jz1);
263             dx12             = _mm256_sub_ps(ix1,jx2);
264             dy12             = _mm256_sub_ps(iy1,jy2);
265             dz12             = _mm256_sub_ps(iz1,jz2);
266             dx13             = _mm256_sub_ps(ix1,jx3);
267             dy13             = _mm256_sub_ps(iy1,jy3);
268             dz13             = _mm256_sub_ps(iz1,jz3);
269             dx21             = _mm256_sub_ps(ix2,jx1);
270             dy21             = _mm256_sub_ps(iy2,jy1);
271             dz21             = _mm256_sub_ps(iz2,jz1);
272             dx22             = _mm256_sub_ps(ix2,jx2);
273             dy22             = _mm256_sub_ps(iy2,jy2);
274             dz22             = _mm256_sub_ps(iz2,jz2);
275             dx23             = _mm256_sub_ps(ix2,jx3);
276             dy23             = _mm256_sub_ps(iy2,jy3);
277             dz23             = _mm256_sub_ps(iz2,jz3);
278             dx31             = _mm256_sub_ps(ix3,jx1);
279             dy31             = _mm256_sub_ps(iy3,jy1);
280             dz31             = _mm256_sub_ps(iz3,jz1);
281             dx32             = _mm256_sub_ps(ix3,jx2);
282             dy32             = _mm256_sub_ps(iy3,jy2);
283             dz32             = _mm256_sub_ps(iz3,jz2);
284             dx33             = _mm256_sub_ps(ix3,jx3);
285             dy33             = _mm256_sub_ps(iy3,jy3);
286             dz33             = _mm256_sub_ps(iz3,jz3);
287
288             /* Calculate squared distance and things based on it */
289             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
290             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
291             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
292             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
293             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
294             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
295             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
296             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
297             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
298             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
299
300             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
301             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
302             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
303             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
304             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
305             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
306             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
307             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
308             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
309
310             rinvsq00         = gmx_mm256_inv_ps(rsq00);
311
312             fjx0             = _mm256_setzero_ps();
313             fjy0             = _mm256_setzero_ps();
314             fjz0             = _mm256_setzero_ps();
315             fjx1             = _mm256_setzero_ps();
316             fjy1             = _mm256_setzero_ps();
317             fjz1             = _mm256_setzero_ps();
318             fjx2             = _mm256_setzero_ps();
319             fjy2             = _mm256_setzero_ps();
320             fjz2             = _mm256_setzero_ps();
321             fjx3             = _mm256_setzero_ps();
322             fjy3             = _mm256_setzero_ps();
323             fjz3             = _mm256_setzero_ps();
324
325             /**************************
326              * CALCULATE INTERACTIONS *
327              **************************/
328
329             /* LENNARD-JONES DISPERSION/REPULSION */
330
331             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
332             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
333             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
334             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
335             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
336
337             /* Update potential sum for this i atom from the interaction with this j atom. */
338             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
339
340             fscal            = fvdw;
341
342             /* Calculate temporary vectorial force */
343             tx               = _mm256_mul_ps(fscal,dx00);
344             ty               = _mm256_mul_ps(fscal,dy00);
345             tz               = _mm256_mul_ps(fscal,dz00);
346
347             /* Update vectorial force */
348             fix0             = _mm256_add_ps(fix0,tx);
349             fiy0             = _mm256_add_ps(fiy0,ty);
350             fiz0             = _mm256_add_ps(fiz0,tz);
351
352             fjx0             = _mm256_add_ps(fjx0,tx);
353             fjy0             = _mm256_add_ps(fjy0,ty);
354             fjz0             = _mm256_add_ps(fjz0,tz);
355
356             /**************************
357              * CALCULATE INTERACTIONS *
358              **************************/
359
360             r11              = _mm256_mul_ps(rsq11,rinv11);
361
362             /* Calculate table index by multiplying r with table scale and truncate to integer */
363             rt               = _mm256_mul_ps(r11,vftabscale);
364             vfitab           = _mm256_cvttps_epi32(rt);
365             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
366             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
367             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
368             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
369             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
370             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
371
372             /* CUBIC SPLINE TABLE ELECTROSTATICS */
373             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
374                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
375             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
376                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
377             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
378                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
379             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
380                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
381             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
382             Heps             = _mm256_mul_ps(vfeps,H);
383             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
384             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
385             velec            = _mm256_mul_ps(qq11,VV);
386             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
387             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
388
389             /* Update potential sum for this i atom from the interaction with this j atom. */
390             velecsum         = _mm256_add_ps(velecsum,velec);
391
392             fscal            = felec;
393
394             /* Calculate temporary vectorial force */
395             tx               = _mm256_mul_ps(fscal,dx11);
396             ty               = _mm256_mul_ps(fscal,dy11);
397             tz               = _mm256_mul_ps(fscal,dz11);
398
399             /* Update vectorial force */
400             fix1             = _mm256_add_ps(fix1,tx);
401             fiy1             = _mm256_add_ps(fiy1,ty);
402             fiz1             = _mm256_add_ps(fiz1,tz);
403
404             fjx1             = _mm256_add_ps(fjx1,tx);
405             fjy1             = _mm256_add_ps(fjy1,ty);
406             fjz1             = _mm256_add_ps(fjz1,tz);
407
408             /**************************
409              * CALCULATE INTERACTIONS *
410              **************************/
411
412             r12              = _mm256_mul_ps(rsq12,rinv12);
413
414             /* Calculate table index by multiplying r with table scale and truncate to integer */
415             rt               = _mm256_mul_ps(r12,vftabscale);
416             vfitab           = _mm256_cvttps_epi32(rt);
417             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
418             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
419             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
420             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
421             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
422             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
423
424             /* CUBIC SPLINE TABLE ELECTROSTATICS */
425             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
426                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
427             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
428                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
429             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
430                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
431             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
432                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
433             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
434             Heps             = _mm256_mul_ps(vfeps,H);
435             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
436             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
437             velec            = _mm256_mul_ps(qq12,VV);
438             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
439             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
440
441             /* Update potential sum for this i atom from the interaction with this j atom. */
442             velecsum         = _mm256_add_ps(velecsum,velec);
443
444             fscal            = felec;
445
446             /* Calculate temporary vectorial force */
447             tx               = _mm256_mul_ps(fscal,dx12);
448             ty               = _mm256_mul_ps(fscal,dy12);
449             tz               = _mm256_mul_ps(fscal,dz12);
450
451             /* Update vectorial force */
452             fix1             = _mm256_add_ps(fix1,tx);
453             fiy1             = _mm256_add_ps(fiy1,ty);
454             fiz1             = _mm256_add_ps(fiz1,tz);
455
456             fjx2             = _mm256_add_ps(fjx2,tx);
457             fjy2             = _mm256_add_ps(fjy2,ty);
458             fjz2             = _mm256_add_ps(fjz2,tz);
459
460             /**************************
461              * CALCULATE INTERACTIONS *
462              **************************/
463
464             r13              = _mm256_mul_ps(rsq13,rinv13);
465
466             /* Calculate table index by multiplying r with table scale and truncate to integer */
467             rt               = _mm256_mul_ps(r13,vftabscale);
468             vfitab           = _mm256_cvttps_epi32(rt);
469             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
470             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
471             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
472             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
473             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
474             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
475
476             /* CUBIC SPLINE TABLE ELECTROSTATICS */
477             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
478                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
479             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
480                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
481             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
482                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
483             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
484                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
485             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
486             Heps             = _mm256_mul_ps(vfeps,H);
487             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
488             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
489             velec            = _mm256_mul_ps(qq13,VV);
490             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
491             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
492
493             /* Update potential sum for this i atom from the interaction with this j atom. */
494             velecsum         = _mm256_add_ps(velecsum,velec);
495
496             fscal            = felec;
497
498             /* Calculate temporary vectorial force */
499             tx               = _mm256_mul_ps(fscal,dx13);
500             ty               = _mm256_mul_ps(fscal,dy13);
501             tz               = _mm256_mul_ps(fscal,dz13);
502
503             /* Update vectorial force */
504             fix1             = _mm256_add_ps(fix1,tx);
505             fiy1             = _mm256_add_ps(fiy1,ty);
506             fiz1             = _mm256_add_ps(fiz1,tz);
507
508             fjx3             = _mm256_add_ps(fjx3,tx);
509             fjy3             = _mm256_add_ps(fjy3,ty);
510             fjz3             = _mm256_add_ps(fjz3,tz);
511
512             /**************************
513              * CALCULATE INTERACTIONS *
514              **************************/
515
516             r21              = _mm256_mul_ps(rsq21,rinv21);
517
518             /* Calculate table index by multiplying r with table scale and truncate to integer */
519             rt               = _mm256_mul_ps(r21,vftabscale);
520             vfitab           = _mm256_cvttps_epi32(rt);
521             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
522             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
523             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
524             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
525             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
526             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
527
528             /* CUBIC SPLINE TABLE ELECTROSTATICS */
529             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
530                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
531             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
532                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
533             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
534                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
535             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
536                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
537             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
538             Heps             = _mm256_mul_ps(vfeps,H);
539             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
540             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
541             velec            = _mm256_mul_ps(qq21,VV);
542             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
543             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
544
545             /* Update potential sum for this i atom from the interaction with this j atom. */
546             velecsum         = _mm256_add_ps(velecsum,velec);
547
548             fscal            = felec;
549
550             /* Calculate temporary vectorial force */
551             tx               = _mm256_mul_ps(fscal,dx21);
552             ty               = _mm256_mul_ps(fscal,dy21);
553             tz               = _mm256_mul_ps(fscal,dz21);
554
555             /* Update vectorial force */
556             fix2             = _mm256_add_ps(fix2,tx);
557             fiy2             = _mm256_add_ps(fiy2,ty);
558             fiz2             = _mm256_add_ps(fiz2,tz);
559
560             fjx1             = _mm256_add_ps(fjx1,tx);
561             fjy1             = _mm256_add_ps(fjy1,ty);
562             fjz1             = _mm256_add_ps(fjz1,tz);
563
564             /**************************
565              * CALCULATE INTERACTIONS *
566              **************************/
567
568             r22              = _mm256_mul_ps(rsq22,rinv22);
569
570             /* Calculate table index by multiplying r with table scale and truncate to integer */
571             rt               = _mm256_mul_ps(r22,vftabscale);
572             vfitab           = _mm256_cvttps_epi32(rt);
573             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
574             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
575             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
576             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
577             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
578             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
579
580             /* CUBIC SPLINE TABLE ELECTROSTATICS */
581             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
582                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
583             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
584                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
585             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
586                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
587             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
588                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
589             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
590             Heps             = _mm256_mul_ps(vfeps,H);
591             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
592             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
593             velec            = _mm256_mul_ps(qq22,VV);
594             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
595             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
596
597             /* Update potential sum for this i atom from the interaction with this j atom. */
598             velecsum         = _mm256_add_ps(velecsum,velec);
599
600             fscal            = felec;
601
602             /* Calculate temporary vectorial force */
603             tx               = _mm256_mul_ps(fscal,dx22);
604             ty               = _mm256_mul_ps(fscal,dy22);
605             tz               = _mm256_mul_ps(fscal,dz22);
606
607             /* Update vectorial force */
608             fix2             = _mm256_add_ps(fix2,tx);
609             fiy2             = _mm256_add_ps(fiy2,ty);
610             fiz2             = _mm256_add_ps(fiz2,tz);
611
612             fjx2             = _mm256_add_ps(fjx2,tx);
613             fjy2             = _mm256_add_ps(fjy2,ty);
614             fjz2             = _mm256_add_ps(fjz2,tz);
615
616             /**************************
617              * CALCULATE INTERACTIONS *
618              **************************/
619
620             r23              = _mm256_mul_ps(rsq23,rinv23);
621
622             /* Calculate table index by multiplying r with table scale and truncate to integer */
623             rt               = _mm256_mul_ps(r23,vftabscale);
624             vfitab           = _mm256_cvttps_epi32(rt);
625             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
626             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
627             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
628             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
629             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
630             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
631
632             /* CUBIC SPLINE TABLE ELECTROSTATICS */
633             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
634                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
635             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
636                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
637             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
638                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
639             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
640                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
641             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
642             Heps             = _mm256_mul_ps(vfeps,H);
643             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
644             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
645             velec            = _mm256_mul_ps(qq23,VV);
646             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
647             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
648
649             /* Update potential sum for this i atom from the interaction with this j atom. */
650             velecsum         = _mm256_add_ps(velecsum,velec);
651
652             fscal            = felec;
653
654             /* Calculate temporary vectorial force */
655             tx               = _mm256_mul_ps(fscal,dx23);
656             ty               = _mm256_mul_ps(fscal,dy23);
657             tz               = _mm256_mul_ps(fscal,dz23);
658
659             /* Update vectorial force */
660             fix2             = _mm256_add_ps(fix2,tx);
661             fiy2             = _mm256_add_ps(fiy2,ty);
662             fiz2             = _mm256_add_ps(fiz2,tz);
663
664             fjx3             = _mm256_add_ps(fjx3,tx);
665             fjy3             = _mm256_add_ps(fjy3,ty);
666             fjz3             = _mm256_add_ps(fjz3,tz);
667
668             /**************************
669              * CALCULATE INTERACTIONS *
670              **************************/
671
672             r31              = _mm256_mul_ps(rsq31,rinv31);
673
674             /* Calculate table index by multiplying r with table scale and truncate to integer */
675             rt               = _mm256_mul_ps(r31,vftabscale);
676             vfitab           = _mm256_cvttps_epi32(rt);
677             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
678             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
679             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
680             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
681             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
682             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
683
684             /* CUBIC SPLINE TABLE ELECTROSTATICS */
685             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
686                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
687             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
688                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
689             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
690                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
691             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
692                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
693             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
694             Heps             = _mm256_mul_ps(vfeps,H);
695             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
696             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
697             velec            = _mm256_mul_ps(qq31,VV);
698             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
699             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
700
701             /* Update potential sum for this i atom from the interaction with this j atom. */
702             velecsum         = _mm256_add_ps(velecsum,velec);
703
704             fscal            = felec;
705
706             /* Calculate temporary vectorial force */
707             tx               = _mm256_mul_ps(fscal,dx31);
708             ty               = _mm256_mul_ps(fscal,dy31);
709             tz               = _mm256_mul_ps(fscal,dz31);
710
711             /* Update vectorial force */
712             fix3             = _mm256_add_ps(fix3,tx);
713             fiy3             = _mm256_add_ps(fiy3,ty);
714             fiz3             = _mm256_add_ps(fiz3,tz);
715
716             fjx1             = _mm256_add_ps(fjx1,tx);
717             fjy1             = _mm256_add_ps(fjy1,ty);
718             fjz1             = _mm256_add_ps(fjz1,tz);
719
720             /**************************
721              * CALCULATE INTERACTIONS *
722              **************************/
723
724             r32              = _mm256_mul_ps(rsq32,rinv32);
725
726             /* Calculate table index by multiplying r with table scale and truncate to integer */
727             rt               = _mm256_mul_ps(r32,vftabscale);
728             vfitab           = _mm256_cvttps_epi32(rt);
729             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
730             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
731             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
732             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
733             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
734             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
735
736             /* CUBIC SPLINE TABLE ELECTROSTATICS */
737             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
738                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
739             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
740                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
741             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
742                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
743             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
744                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
745             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
746             Heps             = _mm256_mul_ps(vfeps,H);
747             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
748             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
749             velec            = _mm256_mul_ps(qq32,VV);
750             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
751             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
752
753             /* Update potential sum for this i atom from the interaction with this j atom. */
754             velecsum         = _mm256_add_ps(velecsum,velec);
755
756             fscal            = felec;
757
758             /* Calculate temporary vectorial force */
759             tx               = _mm256_mul_ps(fscal,dx32);
760             ty               = _mm256_mul_ps(fscal,dy32);
761             tz               = _mm256_mul_ps(fscal,dz32);
762
763             /* Update vectorial force */
764             fix3             = _mm256_add_ps(fix3,tx);
765             fiy3             = _mm256_add_ps(fiy3,ty);
766             fiz3             = _mm256_add_ps(fiz3,tz);
767
768             fjx2             = _mm256_add_ps(fjx2,tx);
769             fjy2             = _mm256_add_ps(fjy2,ty);
770             fjz2             = _mm256_add_ps(fjz2,tz);
771
772             /**************************
773              * CALCULATE INTERACTIONS *
774              **************************/
775
776             r33              = _mm256_mul_ps(rsq33,rinv33);
777
778             /* Calculate table index by multiplying r with table scale and truncate to integer */
779             rt               = _mm256_mul_ps(r33,vftabscale);
780             vfitab           = _mm256_cvttps_epi32(rt);
781             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
782             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
783             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
784             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
785             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
786             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
787
788             /* CUBIC SPLINE TABLE ELECTROSTATICS */
789             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
790                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
791             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
792                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
793             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
794                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
795             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
796                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
797             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
798             Heps             = _mm256_mul_ps(vfeps,H);
799             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
800             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
801             velec            = _mm256_mul_ps(qq33,VV);
802             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
803             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
804
805             /* Update potential sum for this i atom from the interaction with this j atom. */
806             velecsum         = _mm256_add_ps(velecsum,velec);
807
808             fscal            = felec;
809
810             /* Calculate temporary vectorial force */
811             tx               = _mm256_mul_ps(fscal,dx33);
812             ty               = _mm256_mul_ps(fscal,dy33);
813             tz               = _mm256_mul_ps(fscal,dz33);
814
815             /* Update vectorial force */
816             fix3             = _mm256_add_ps(fix3,tx);
817             fiy3             = _mm256_add_ps(fiy3,ty);
818             fiz3             = _mm256_add_ps(fiz3,tz);
819
820             fjx3             = _mm256_add_ps(fjx3,tx);
821             fjy3             = _mm256_add_ps(fjy3,ty);
822             fjz3             = _mm256_add_ps(fjz3,tz);
823
824             fjptrA             = f+j_coord_offsetA;
825             fjptrB             = f+j_coord_offsetB;
826             fjptrC             = f+j_coord_offsetC;
827             fjptrD             = f+j_coord_offsetD;
828             fjptrE             = f+j_coord_offsetE;
829             fjptrF             = f+j_coord_offsetF;
830             fjptrG             = f+j_coord_offsetG;
831             fjptrH             = f+j_coord_offsetH;
832
833             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
834                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
835                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
836
837             /* Inner loop uses 422 flops */
838         }
839
840         if(jidx<j_index_end)
841         {
842
843             /* Get j neighbor index, and coordinate index */
844             jnrlistA         = jjnr[jidx];
845             jnrlistB         = jjnr[jidx+1];
846             jnrlistC         = jjnr[jidx+2];
847             jnrlistD         = jjnr[jidx+3];
848             jnrlistE         = jjnr[jidx+4];
849             jnrlistF         = jjnr[jidx+5];
850             jnrlistG         = jjnr[jidx+6];
851             jnrlistH         = jjnr[jidx+7];
852             /* Sign of each element will be negative for non-real atoms.
853              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
854              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
855              */
856             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
857                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
858                                             
859             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
860             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
861             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
862             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
863             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
864             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
865             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
866             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
867             j_coord_offsetA  = DIM*jnrA;
868             j_coord_offsetB  = DIM*jnrB;
869             j_coord_offsetC  = DIM*jnrC;
870             j_coord_offsetD  = DIM*jnrD;
871             j_coord_offsetE  = DIM*jnrE;
872             j_coord_offsetF  = DIM*jnrF;
873             j_coord_offsetG  = DIM*jnrG;
874             j_coord_offsetH  = DIM*jnrH;
875
876             /* load j atom coordinates */
877             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
878                                                  x+j_coord_offsetC,x+j_coord_offsetD,
879                                                  x+j_coord_offsetE,x+j_coord_offsetF,
880                                                  x+j_coord_offsetG,x+j_coord_offsetH,
881                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
882                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
883
884             /* Calculate displacement vector */
885             dx00             = _mm256_sub_ps(ix0,jx0);
886             dy00             = _mm256_sub_ps(iy0,jy0);
887             dz00             = _mm256_sub_ps(iz0,jz0);
888             dx11             = _mm256_sub_ps(ix1,jx1);
889             dy11             = _mm256_sub_ps(iy1,jy1);
890             dz11             = _mm256_sub_ps(iz1,jz1);
891             dx12             = _mm256_sub_ps(ix1,jx2);
892             dy12             = _mm256_sub_ps(iy1,jy2);
893             dz12             = _mm256_sub_ps(iz1,jz2);
894             dx13             = _mm256_sub_ps(ix1,jx3);
895             dy13             = _mm256_sub_ps(iy1,jy3);
896             dz13             = _mm256_sub_ps(iz1,jz3);
897             dx21             = _mm256_sub_ps(ix2,jx1);
898             dy21             = _mm256_sub_ps(iy2,jy1);
899             dz21             = _mm256_sub_ps(iz2,jz1);
900             dx22             = _mm256_sub_ps(ix2,jx2);
901             dy22             = _mm256_sub_ps(iy2,jy2);
902             dz22             = _mm256_sub_ps(iz2,jz2);
903             dx23             = _mm256_sub_ps(ix2,jx3);
904             dy23             = _mm256_sub_ps(iy2,jy3);
905             dz23             = _mm256_sub_ps(iz2,jz3);
906             dx31             = _mm256_sub_ps(ix3,jx1);
907             dy31             = _mm256_sub_ps(iy3,jy1);
908             dz31             = _mm256_sub_ps(iz3,jz1);
909             dx32             = _mm256_sub_ps(ix3,jx2);
910             dy32             = _mm256_sub_ps(iy3,jy2);
911             dz32             = _mm256_sub_ps(iz3,jz2);
912             dx33             = _mm256_sub_ps(ix3,jx3);
913             dy33             = _mm256_sub_ps(iy3,jy3);
914             dz33             = _mm256_sub_ps(iz3,jz3);
915
916             /* Calculate squared distance and things based on it */
917             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
918             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
919             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
920             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
921             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
922             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
923             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
924             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
925             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
926             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
927
928             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
929             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
930             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
931             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
932             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
933             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
934             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
935             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
936             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
937
938             rinvsq00         = gmx_mm256_inv_ps(rsq00);
939
940             fjx0             = _mm256_setzero_ps();
941             fjy0             = _mm256_setzero_ps();
942             fjz0             = _mm256_setzero_ps();
943             fjx1             = _mm256_setzero_ps();
944             fjy1             = _mm256_setzero_ps();
945             fjz1             = _mm256_setzero_ps();
946             fjx2             = _mm256_setzero_ps();
947             fjy2             = _mm256_setzero_ps();
948             fjz2             = _mm256_setzero_ps();
949             fjx3             = _mm256_setzero_ps();
950             fjy3             = _mm256_setzero_ps();
951             fjz3             = _mm256_setzero_ps();
952
953             /**************************
954              * CALCULATE INTERACTIONS *
955              **************************/
956
957             /* LENNARD-JONES DISPERSION/REPULSION */
958
959             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
960             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
961             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
962             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
963             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
964
965             /* Update potential sum for this i atom from the interaction with this j atom. */
966             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
967             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
968
969             fscal            = fvdw;
970
971             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
972
973             /* Calculate temporary vectorial force */
974             tx               = _mm256_mul_ps(fscal,dx00);
975             ty               = _mm256_mul_ps(fscal,dy00);
976             tz               = _mm256_mul_ps(fscal,dz00);
977
978             /* Update vectorial force */
979             fix0             = _mm256_add_ps(fix0,tx);
980             fiy0             = _mm256_add_ps(fiy0,ty);
981             fiz0             = _mm256_add_ps(fiz0,tz);
982
983             fjx0             = _mm256_add_ps(fjx0,tx);
984             fjy0             = _mm256_add_ps(fjy0,ty);
985             fjz0             = _mm256_add_ps(fjz0,tz);
986
987             /**************************
988              * CALCULATE INTERACTIONS *
989              **************************/
990
991             r11              = _mm256_mul_ps(rsq11,rinv11);
992             r11              = _mm256_andnot_ps(dummy_mask,r11);
993
994             /* Calculate table index by multiplying r with table scale and truncate to integer */
995             rt               = _mm256_mul_ps(r11,vftabscale);
996             vfitab           = _mm256_cvttps_epi32(rt);
997             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
998             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
999             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1000             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1001             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1002             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1003
1004             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1005             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1006                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1007             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1008                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1009             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1010                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1011             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1012                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1013             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1014             Heps             = _mm256_mul_ps(vfeps,H);
1015             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1016             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1017             velec            = _mm256_mul_ps(qq11,VV);
1018             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1019             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1020
1021             /* Update potential sum for this i atom from the interaction with this j atom. */
1022             velec            = _mm256_andnot_ps(dummy_mask,velec);
1023             velecsum         = _mm256_add_ps(velecsum,velec);
1024
1025             fscal            = felec;
1026
1027             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1028
1029             /* Calculate temporary vectorial force */
1030             tx               = _mm256_mul_ps(fscal,dx11);
1031             ty               = _mm256_mul_ps(fscal,dy11);
1032             tz               = _mm256_mul_ps(fscal,dz11);
1033
1034             /* Update vectorial force */
1035             fix1             = _mm256_add_ps(fix1,tx);
1036             fiy1             = _mm256_add_ps(fiy1,ty);
1037             fiz1             = _mm256_add_ps(fiz1,tz);
1038
1039             fjx1             = _mm256_add_ps(fjx1,tx);
1040             fjy1             = _mm256_add_ps(fjy1,ty);
1041             fjz1             = _mm256_add_ps(fjz1,tz);
1042
1043             /**************************
1044              * CALCULATE INTERACTIONS *
1045              **************************/
1046
1047             r12              = _mm256_mul_ps(rsq12,rinv12);
1048             r12              = _mm256_andnot_ps(dummy_mask,r12);
1049
1050             /* Calculate table index by multiplying r with table scale and truncate to integer */
1051             rt               = _mm256_mul_ps(r12,vftabscale);
1052             vfitab           = _mm256_cvttps_epi32(rt);
1053             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1054             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1055             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1056             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1057             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1058             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1059
1060             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1061             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1062                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1063             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1064                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1065             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1066                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1067             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1068                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1069             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1070             Heps             = _mm256_mul_ps(vfeps,H);
1071             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1072             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1073             velec            = _mm256_mul_ps(qq12,VV);
1074             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1075             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1076
1077             /* Update potential sum for this i atom from the interaction with this j atom. */
1078             velec            = _mm256_andnot_ps(dummy_mask,velec);
1079             velecsum         = _mm256_add_ps(velecsum,velec);
1080
1081             fscal            = felec;
1082
1083             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1084
1085             /* Calculate temporary vectorial force */
1086             tx               = _mm256_mul_ps(fscal,dx12);
1087             ty               = _mm256_mul_ps(fscal,dy12);
1088             tz               = _mm256_mul_ps(fscal,dz12);
1089
1090             /* Update vectorial force */
1091             fix1             = _mm256_add_ps(fix1,tx);
1092             fiy1             = _mm256_add_ps(fiy1,ty);
1093             fiz1             = _mm256_add_ps(fiz1,tz);
1094
1095             fjx2             = _mm256_add_ps(fjx2,tx);
1096             fjy2             = _mm256_add_ps(fjy2,ty);
1097             fjz2             = _mm256_add_ps(fjz2,tz);
1098
1099             /**************************
1100              * CALCULATE INTERACTIONS *
1101              **************************/
1102
1103             r13              = _mm256_mul_ps(rsq13,rinv13);
1104             r13              = _mm256_andnot_ps(dummy_mask,r13);
1105
1106             /* Calculate table index by multiplying r with table scale and truncate to integer */
1107             rt               = _mm256_mul_ps(r13,vftabscale);
1108             vfitab           = _mm256_cvttps_epi32(rt);
1109             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1110             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1111             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1112             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1113             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1114             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1115
1116             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1117             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1118                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1119             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1120                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1121             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1122                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1123             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1124                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1125             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1126             Heps             = _mm256_mul_ps(vfeps,H);
1127             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1128             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1129             velec            = _mm256_mul_ps(qq13,VV);
1130             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1131             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
1132
1133             /* Update potential sum for this i atom from the interaction with this j atom. */
1134             velec            = _mm256_andnot_ps(dummy_mask,velec);
1135             velecsum         = _mm256_add_ps(velecsum,velec);
1136
1137             fscal            = felec;
1138
1139             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1140
1141             /* Calculate temporary vectorial force */
1142             tx               = _mm256_mul_ps(fscal,dx13);
1143             ty               = _mm256_mul_ps(fscal,dy13);
1144             tz               = _mm256_mul_ps(fscal,dz13);
1145
1146             /* Update vectorial force */
1147             fix1             = _mm256_add_ps(fix1,tx);
1148             fiy1             = _mm256_add_ps(fiy1,ty);
1149             fiz1             = _mm256_add_ps(fiz1,tz);
1150
1151             fjx3             = _mm256_add_ps(fjx3,tx);
1152             fjy3             = _mm256_add_ps(fjy3,ty);
1153             fjz3             = _mm256_add_ps(fjz3,tz);
1154
1155             /**************************
1156              * CALCULATE INTERACTIONS *
1157              **************************/
1158
1159             r21              = _mm256_mul_ps(rsq21,rinv21);
1160             r21              = _mm256_andnot_ps(dummy_mask,r21);
1161
1162             /* Calculate table index by multiplying r with table scale and truncate to integer */
1163             rt               = _mm256_mul_ps(r21,vftabscale);
1164             vfitab           = _mm256_cvttps_epi32(rt);
1165             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1166             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1167             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1168             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1169             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1170             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1171
1172             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1173             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1174                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1175             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1176                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1177             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1178                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1179             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1180                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1181             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1182             Heps             = _mm256_mul_ps(vfeps,H);
1183             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1184             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1185             velec            = _mm256_mul_ps(qq21,VV);
1186             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1187             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1188
1189             /* Update potential sum for this i atom from the interaction with this j atom. */
1190             velec            = _mm256_andnot_ps(dummy_mask,velec);
1191             velecsum         = _mm256_add_ps(velecsum,velec);
1192
1193             fscal            = felec;
1194
1195             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1196
1197             /* Calculate temporary vectorial force */
1198             tx               = _mm256_mul_ps(fscal,dx21);
1199             ty               = _mm256_mul_ps(fscal,dy21);
1200             tz               = _mm256_mul_ps(fscal,dz21);
1201
1202             /* Update vectorial force */
1203             fix2             = _mm256_add_ps(fix2,tx);
1204             fiy2             = _mm256_add_ps(fiy2,ty);
1205             fiz2             = _mm256_add_ps(fiz2,tz);
1206
1207             fjx1             = _mm256_add_ps(fjx1,tx);
1208             fjy1             = _mm256_add_ps(fjy1,ty);
1209             fjz1             = _mm256_add_ps(fjz1,tz);
1210
1211             /**************************
1212              * CALCULATE INTERACTIONS *
1213              **************************/
1214
1215             r22              = _mm256_mul_ps(rsq22,rinv22);
1216             r22              = _mm256_andnot_ps(dummy_mask,r22);
1217
1218             /* Calculate table index by multiplying r with table scale and truncate to integer */
1219             rt               = _mm256_mul_ps(r22,vftabscale);
1220             vfitab           = _mm256_cvttps_epi32(rt);
1221             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1222             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1223             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1224             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1225             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1226             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1227
1228             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1229             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1230                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1231             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1232                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1233             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1234                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1235             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1236                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1237             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1238             Heps             = _mm256_mul_ps(vfeps,H);
1239             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1240             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1241             velec            = _mm256_mul_ps(qq22,VV);
1242             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1243             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1244
1245             /* Update potential sum for this i atom from the interaction with this j atom. */
1246             velec            = _mm256_andnot_ps(dummy_mask,velec);
1247             velecsum         = _mm256_add_ps(velecsum,velec);
1248
1249             fscal            = felec;
1250
1251             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1252
1253             /* Calculate temporary vectorial force */
1254             tx               = _mm256_mul_ps(fscal,dx22);
1255             ty               = _mm256_mul_ps(fscal,dy22);
1256             tz               = _mm256_mul_ps(fscal,dz22);
1257
1258             /* Update vectorial force */
1259             fix2             = _mm256_add_ps(fix2,tx);
1260             fiy2             = _mm256_add_ps(fiy2,ty);
1261             fiz2             = _mm256_add_ps(fiz2,tz);
1262
1263             fjx2             = _mm256_add_ps(fjx2,tx);
1264             fjy2             = _mm256_add_ps(fjy2,ty);
1265             fjz2             = _mm256_add_ps(fjz2,tz);
1266
1267             /**************************
1268              * CALCULATE INTERACTIONS *
1269              **************************/
1270
1271             r23              = _mm256_mul_ps(rsq23,rinv23);
1272             r23              = _mm256_andnot_ps(dummy_mask,r23);
1273
1274             /* Calculate table index by multiplying r with table scale and truncate to integer */
1275             rt               = _mm256_mul_ps(r23,vftabscale);
1276             vfitab           = _mm256_cvttps_epi32(rt);
1277             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1278             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1279             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1280             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1281             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1282             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1283
1284             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1285             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1286                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1287             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1288                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1289             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1290                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1291             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1292                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1293             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1294             Heps             = _mm256_mul_ps(vfeps,H);
1295             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1296             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1297             velec            = _mm256_mul_ps(qq23,VV);
1298             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1299             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
1300
1301             /* Update potential sum for this i atom from the interaction with this j atom. */
1302             velec            = _mm256_andnot_ps(dummy_mask,velec);
1303             velecsum         = _mm256_add_ps(velecsum,velec);
1304
1305             fscal            = felec;
1306
1307             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1308
1309             /* Calculate temporary vectorial force */
1310             tx               = _mm256_mul_ps(fscal,dx23);
1311             ty               = _mm256_mul_ps(fscal,dy23);
1312             tz               = _mm256_mul_ps(fscal,dz23);
1313
1314             /* Update vectorial force */
1315             fix2             = _mm256_add_ps(fix2,tx);
1316             fiy2             = _mm256_add_ps(fiy2,ty);
1317             fiz2             = _mm256_add_ps(fiz2,tz);
1318
1319             fjx3             = _mm256_add_ps(fjx3,tx);
1320             fjy3             = _mm256_add_ps(fjy3,ty);
1321             fjz3             = _mm256_add_ps(fjz3,tz);
1322
1323             /**************************
1324              * CALCULATE INTERACTIONS *
1325              **************************/
1326
1327             r31              = _mm256_mul_ps(rsq31,rinv31);
1328             r31              = _mm256_andnot_ps(dummy_mask,r31);
1329
1330             /* Calculate table index by multiplying r with table scale and truncate to integer */
1331             rt               = _mm256_mul_ps(r31,vftabscale);
1332             vfitab           = _mm256_cvttps_epi32(rt);
1333             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1334             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1335             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1336             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1337             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1338             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1339
1340             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1341             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1342                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1343             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1344                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1345             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1346                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1347             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1348                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1349             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1350             Heps             = _mm256_mul_ps(vfeps,H);
1351             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1352             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1353             velec            = _mm256_mul_ps(qq31,VV);
1354             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1355             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
1356
1357             /* Update potential sum for this i atom from the interaction with this j atom. */
1358             velec            = _mm256_andnot_ps(dummy_mask,velec);
1359             velecsum         = _mm256_add_ps(velecsum,velec);
1360
1361             fscal            = felec;
1362
1363             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1364
1365             /* Calculate temporary vectorial force */
1366             tx               = _mm256_mul_ps(fscal,dx31);
1367             ty               = _mm256_mul_ps(fscal,dy31);
1368             tz               = _mm256_mul_ps(fscal,dz31);
1369
1370             /* Update vectorial force */
1371             fix3             = _mm256_add_ps(fix3,tx);
1372             fiy3             = _mm256_add_ps(fiy3,ty);
1373             fiz3             = _mm256_add_ps(fiz3,tz);
1374
1375             fjx1             = _mm256_add_ps(fjx1,tx);
1376             fjy1             = _mm256_add_ps(fjy1,ty);
1377             fjz1             = _mm256_add_ps(fjz1,tz);
1378
1379             /**************************
1380              * CALCULATE INTERACTIONS *
1381              **************************/
1382
1383             r32              = _mm256_mul_ps(rsq32,rinv32);
1384             r32              = _mm256_andnot_ps(dummy_mask,r32);
1385
1386             /* Calculate table index by multiplying r with table scale and truncate to integer */
1387             rt               = _mm256_mul_ps(r32,vftabscale);
1388             vfitab           = _mm256_cvttps_epi32(rt);
1389             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1390             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1391             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1392             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1393             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1394             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1395
1396             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1397             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1398                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1399             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1400                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1401             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1402                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1403             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1404                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1405             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1406             Heps             = _mm256_mul_ps(vfeps,H);
1407             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1408             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1409             velec            = _mm256_mul_ps(qq32,VV);
1410             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1411             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
1412
1413             /* Update potential sum for this i atom from the interaction with this j atom. */
1414             velec            = _mm256_andnot_ps(dummy_mask,velec);
1415             velecsum         = _mm256_add_ps(velecsum,velec);
1416
1417             fscal            = felec;
1418
1419             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1420
1421             /* Calculate temporary vectorial force */
1422             tx               = _mm256_mul_ps(fscal,dx32);
1423             ty               = _mm256_mul_ps(fscal,dy32);
1424             tz               = _mm256_mul_ps(fscal,dz32);
1425
1426             /* Update vectorial force */
1427             fix3             = _mm256_add_ps(fix3,tx);
1428             fiy3             = _mm256_add_ps(fiy3,ty);
1429             fiz3             = _mm256_add_ps(fiz3,tz);
1430
1431             fjx2             = _mm256_add_ps(fjx2,tx);
1432             fjy2             = _mm256_add_ps(fjy2,ty);
1433             fjz2             = _mm256_add_ps(fjz2,tz);
1434
1435             /**************************
1436              * CALCULATE INTERACTIONS *
1437              **************************/
1438
1439             r33              = _mm256_mul_ps(rsq33,rinv33);
1440             r33              = _mm256_andnot_ps(dummy_mask,r33);
1441
1442             /* Calculate table index by multiplying r with table scale and truncate to integer */
1443             rt               = _mm256_mul_ps(r33,vftabscale);
1444             vfitab           = _mm256_cvttps_epi32(rt);
1445             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1446             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1447             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1448             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1449             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1450             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1451
1452             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1453             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1454                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1455             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1456                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1457             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1458                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1459             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1460                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1461             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1462             Heps             = _mm256_mul_ps(vfeps,H);
1463             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1464             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1465             velec            = _mm256_mul_ps(qq33,VV);
1466             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1467             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
1468
1469             /* Update potential sum for this i atom from the interaction with this j atom. */
1470             velec            = _mm256_andnot_ps(dummy_mask,velec);
1471             velecsum         = _mm256_add_ps(velecsum,velec);
1472
1473             fscal            = felec;
1474
1475             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1476
1477             /* Calculate temporary vectorial force */
1478             tx               = _mm256_mul_ps(fscal,dx33);
1479             ty               = _mm256_mul_ps(fscal,dy33);
1480             tz               = _mm256_mul_ps(fscal,dz33);
1481
1482             /* Update vectorial force */
1483             fix3             = _mm256_add_ps(fix3,tx);
1484             fiy3             = _mm256_add_ps(fiy3,ty);
1485             fiz3             = _mm256_add_ps(fiz3,tz);
1486
1487             fjx3             = _mm256_add_ps(fjx3,tx);
1488             fjy3             = _mm256_add_ps(fjy3,ty);
1489             fjz3             = _mm256_add_ps(fjz3,tz);
1490
1491             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1492             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1493             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1494             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1495             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1496             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1497             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1498             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1499
1500             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1501                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1502                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1503
1504             /* Inner loop uses 431 flops */
1505         }
1506
1507         /* End of innermost loop */
1508
1509         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1510                                                  f+i_coord_offset,fshift+i_shift_offset);
1511
1512         ggid                        = gid[iidx];
1513         /* Update potential energies */
1514         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1515         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1516
1517         /* Increment number of inner iterations */
1518         inneriter                  += j_index_end - j_index_start;
1519
1520         /* Outer loop uses 26 flops */
1521     }
1522
1523     /* Increment number of outer iterations */
1524     outeriter        += nri;
1525
1526     /* Update outer/inner flops */
1527
1528     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*431);
1529 }
1530 /*
1531  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_256_single
1532  * Electrostatics interaction: CubicSplineTable
1533  * VdW interaction:            LennardJones
1534  * Geometry:                   Water4-Water4
1535  * Calculate force/pot:        Force
1536  */
1537 void
1538 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_256_single
1539                     (t_nblist                    * gmx_restrict       nlist,
1540                      rvec                        * gmx_restrict          xx,
1541                      rvec                        * gmx_restrict          ff,
1542                      t_forcerec                  * gmx_restrict          fr,
1543                      t_mdatoms                   * gmx_restrict     mdatoms,
1544                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1545                      t_nrnb                      * gmx_restrict        nrnb)
1546 {
1547     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1548      * just 0 for non-waters.
1549      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1550      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1551      */
1552     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1553     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1554     int              jnrA,jnrB,jnrC,jnrD;
1555     int              jnrE,jnrF,jnrG,jnrH;
1556     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1557     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1558     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1559     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1560     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1561     real             rcutoff_scalar;
1562     real             *shiftvec,*fshift,*x,*f;
1563     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1564     real             scratch[4*DIM];
1565     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1566     real *           vdwioffsetptr0;
1567     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1568     real *           vdwioffsetptr1;
1569     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1570     real *           vdwioffsetptr2;
1571     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1572     real *           vdwioffsetptr3;
1573     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1574     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1575     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1576     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1577     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1578     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1579     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1580     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1581     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1582     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1583     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1584     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1585     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1586     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1587     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1588     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1589     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1590     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1591     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1592     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1593     real             *charge;
1594     int              nvdwtype;
1595     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1596     int              *vdwtype;
1597     real             *vdwparam;
1598     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1599     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1600     __m256i          vfitab;
1601     __m128i          vfitab_lo,vfitab_hi;
1602     __m128i          ifour       = _mm_set1_epi32(4);
1603     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1604     real             *vftab;
1605     __m256           dummy_mask,cutoff_mask;
1606     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1607     __m256           one     = _mm256_set1_ps(1.0);
1608     __m256           two     = _mm256_set1_ps(2.0);
1609     x                = xx[0];
1610     f                = ff[0];
1611
1612     nri              = nlist->nri;
1613     iinr             = nlist->iinr;
1614     jindex           = nlist->jindex;
1615     jjnr             = nlist->jjnr;
1616     shiftidx         = nlist->shift;
1617     gid              = nlist->gid;
1618     shiftvec         = fr->shift_vec[0];
1619     fshift           = fr->fshift[0];
1620     facel            = _mm256_set1_ps(fr->epsfac);
1621     charge           = mdatoms->chargeA;
1622     nvdwtype         = fr->ntype;
1623     vdwparam         = fr->nbfp;
1624     vdwtype          = mdatoms->typeA;
1625
1626     vftab            = kernel_data->table_elec->data;
1627     vftabscale       = _mm256_set1_ps(kernel_data->table_elec->scale);
1628
1629     /* Setup water-specific parameters */
1630     inr              = nlist->iinr[0];
1631     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1632     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1633     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1634     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1635
1636     jq1              = _mm256_set1_ps(charge[inr+1]);
1637     jq2              = _mm256_set1_ps(charge[inr+2]);
1638     jq3              = _mm256_set1_ps(charge[inr+3]);
1639     vdwjidx0A        = 2*vdwtype[inr+0];
1640     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1641     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1642     qq11             = _mm256_mul_ps(iq1,jq1);
1643     qq12             = _mm256_mul_ps(iq1,jq2);
1644     qq13             = _mm256_mul_ps(iq1,jq3);
1645     qq21             = _mm256_mul_ps(iq2,jq1);
1646     qq22             = _mm256_mul_ps(iq2,jq2);
1647     qq23             = _mm256_mul_ps(iq2,jq3);
1648     qq31             = _mm256_mul_ps(iq3,jq1);
1649     qq32             = _mm256_mul_ps(iq3,jq2);
1650     qq33             = _mm256_mul_ps(iq3,jq3);
1651
1652     /* Avoid stupid compiler warnings */
1653     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1654     j_coord_offsetA = 0;
1655     j_coord_offsetB = 0;
1656     j_coord_offsetC = 0;
1657     j_coord_offsetD = 0;
1658     j_coord_offsetE = 0;
1659     j_coord_offsetF = 0;
1660     j_coord_offsetG = 0;
1661     j_coord_offsetH = 0;
1662
1663     outeriter        = 0;
1664     inneriter        = 0;
1665
1666     for(iidx=0;iidx<4*DIM;iidx++)
1667     {
1668         scratch[iidx] = 0.0;
1669     }
1670
1671     /* Start outer loop over neighborlists */
1672     for(iidx=0; iidx<nri; iidx++)
1673     {
1674         /* Load shift vector for this list */
1675         i_shift_offset   = DIM*shiftidx[iidx];
1676
1677         /* Load limits for loop over neighbors */
1678         j_index_start    = jindex[iidx];
1679         j_index_end      = jindex[iidx+1];
1680
1681         /* Get outer coordinate index */
1682         inr              = iinr[iidx];
1683         i_coord_offset   = DIM*inr;
1684
1685         /* Load i particle coords and add shift vector */
1686         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1687                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1688
1689         fix0             = _mm256_setzero_ps();
1690         fiy0             = _mm256_setzero_ps();
1691         fiz0             = _mm256_setzero_ps();
1692         fix1             = _mm256_setzero_ps();
1693         fiy1             = _mm256_setzero_ps();
1694         fiz1             = _mm256_setzero_ps();
1695         fix2             = _mm256_setzero_ps();
1696         fiy2             = _mm256_setzero_ps();
1697         fiz2             = _mm256_setzero_ps();
1698         fix3             = _mm256_setzero_ps();
1699         fiy3             = _mm256_setzero_ps();
1700         fiz3             = _mm256_setzero_ps();
1701
1702         /* Start inner kernel loop */
1703         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1704         {
1705
1706             /* Get j neighbor index, and coordinate index */
1707             jnrA             = jjnr[jidx];
1708             jnrB             = jjnr[jidx+1];
1709             jnrC             = jjnr[jidx+2];
1710             jnrD             = jjnr[jidx+3];
1711             jnrE             = jjnr[jidx+4];
1712             jnrF             = jjnr[jidx+5];
1713             jnrG             = jjnr[jidx+6];
1714             jnrH             = jjnr[jidx+7];
1715             j_coord_offsetA  = DIM*jnrA;
1716             j_coord_offsetB  = DIM*jnrB;
1717             j_coord_offsetC  = DIM*jnrC;
1718             j_coord_offsetD  = DIM*jnrD;
1719             j_coord_offsetE  = DIM*jnrE;
1720             j_coord_offsetF  = DIM*jnrF;
1721             j_coord_offsetG  = DIM*jnrG;
1722             j_coord_offsetH  = DIM*jnrH;
1723
1724             /* load j atom coordinates */
1725             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1726                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1727                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1728                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1729                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1730                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1731
1732             /* Calculate displacement vector */
1733             dx00             = _mm256_sub_ps(ix0,jx0);
1734             dy00             = _mm256_sub_ps(iy0,jy0);
1735             dz00             = _mm256_sub_ps(iz0,jz0);
1736             dx11             = _mm256_sub_ps(ix1,jx1);
1737             dy11             = _mm256_sub_ps(iy1,jy1);
1738             dz11             = _mm256_sub_ps(iz1,jz1);
1739             dx12             = _mm256_sub_ps(ix1,jx2);
1740             dy12             = _mm256_sub_ps(iy1,jy2);
1741             dz12             = _mm256_sub_ps(iz1,jz2);
1742             dx13             = _mm256_sub_ps(ix1,jx3);
1743             dy13             = _mm256_sub_ps(iy1,jy3);
1744             dz13             = _mm256_sub_ps(iz1,jz3);
1745             dx21             = _mm256_sub_ps(ix2,jx1);
1746             dy21             = _mm256_sub_ps(iy2,jy1);
1747             dz21             = _mm256_sub_ps(iz2,jz1);
1748             dx22             = _mm256_sub_ps(ix2,jx2);
1749             dy22             = _mm256_sub_ps(iy2,jy2);
1750             dz22             = _mm256_sub_ps(iz2,jz2);
1751             dx23             = _mm256_sub_ps(ix2,jx3);
1752             dy23             = _mm256_sub_ps(iy2,jy3);
1753             dz23             = _mm256_sub_ps(iz2,jz3);
1754             dx31             = _mm256_sub_ps(ix3,jx1);
1755             dy31             = _mm256_sub_ps(iy3,jy1);
1756             dz31             = _mm256_sub_ps(iz3,jz1);
1757             dx32             = _mm256_sub_ps(ix3,jx2);
1758             dy32             = _mm256_sub_ps(iy3,jy2);
1759             dz32             = _mm256_sub_ps(iz3,jz2);
1760             dx33             = _mm256_sub_ps(ix3,jx3);
1761             dy33             = _mm256_sub_ps(iy3,jy3);
1762             dz33             = _mm256_sub_ps(iz3,jz3);
1763
1764             /* Calculate squared distance and things based on it */
1765             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1766             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1767             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1768             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1769             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1770             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1771             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1772             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1773             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1774             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1775
1776             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1777             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1778             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1779             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1780             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1781             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1782             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1783             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1784             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1785
1786             rinvsq00         = gmx_mm256_inv_ps(rsq00);
1787
1788             fjx0             = _mm256_setzero_ps();
1789             fjy0             = _mm256_setzero_ps();
1790             fjz0             = _mm256_setzero_ps();
1791             fjx1             = _mm256_setzero_ps();
1792             fjy1             = _mm256_setzero_ps();
1793             fjz1             = _mm256_setzero_ps();
1794             fjx2             = _mm256_setzero_ps();
1795             fjy2             = _mm256_setzero_ps();
1796             fjz2             = _mm256_setzero_ps();
1797             fjx3             = _mm256_setzero_ps();
1798             fjy3             = _mm256_setzero_ps();
1799             fjz3             = _mm256_setzero_ps();
1800
1801             /**************************
1802              * CALCULATE INTERACTIONS *
1803              **************************/
1804
1805             /* LENNARD-JONES DISPERSION/REPULSION */
1806
1807             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1808             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1809
1810             fscal            = fvdw;
1811
1812             /* Calculate temporary vectorial force */
1813             tx               = _mm256_mul_ps(fscal,dx00);
1814             ty               = _mm256_mul_ps(fscal,dy00);
1815             tz               = _mm256_mul_ps(fscal,dz00);
1816
1817             /* Update vectorial force */
1818             fix0             = _mm256_add_ps(fix0,tx);
1819             fiy0             = _mm256_add_ps(fiy0,ty);
1820             fiz0             = _mm256_add_ps(fiz0,tz);
1821
1822             fjx0             = _mm256_add_ps(fjx0,tx);
1823             fjy0             = _mm256_add_ps(fjy0,ty);
1824             fjz0             = _mm256_add_ps(fjz0,tz);
1825
1826             /**************************
1827              * CALCULATE INTERACTIONS *
1828              **************************/
1829
1830             r11              = _mm256_mul_ps(rsq11,rinv11);
1831
1832             /* Calculate table index by multiplying r with table scale and truncate to integer */
1833             rt               = _mm256_mul_ps(r11,vftabscale);
1834             vfitab           = _mm256_cvttps_epi32(rt);
1835             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1836             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1837             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1838             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1839             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1840             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1841
1842             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1843             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1844                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1845             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1846                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1847             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1848                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1849             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1850                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1851             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1852             Heps             = _mm256_mul_ps(vfeps,H);
1853             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1854             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1855             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1856
1857             fscal            = felec;
1858
1859             /* Calculate temporary vectorial force */
1860             tx               = _mm256_mul_ps(fscal,dx11);
1861             ty               = _mm256_mul_ps(fscal,dy11);
1862             tz               = _mm256_mul_ps(fscal,dz11);
1863
1864             /* Update vectorial force */
1865             fix1             = _mm256_add_ps(fix1,tx);
1866             fiy1             = _mm256_add_ps(fiy1,ty);
1867             fiz1             = _mm256_add_ps(fiz1,tz);
1868
1869             fjx1             = _mm256_add_ps(fjx1,tx);
1870             fjy1             = _mm256_add_ps(fjy1,ty);
1871             fjz1             = _mm256_add_ps(fjz1,tz);
1872
1873             /**************************
1874              * CALCULATE INTERACTIONS *
1875              **************************/
1876
1877             r12              = _mm256_mul_ps(rsq12,rinv12);
1878
1879             /* Calculate table index by multiplying r with table scale and truncate to integer */
1880             rt               = _mm256_mul_ps(r12,vftabscale);
1881             vfitab           = _mm256_cvttps_epi32(rt);
1882             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1883             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1884             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1885             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1886             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1887             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1888
1889             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1890             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1891                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1892             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1893                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1894             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1895                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1896             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1897                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1898             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1899             Heps             = _mm256_mul_ps(vfeps,H);
1900             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1901             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1902             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1903
1904             fscal            = felec;
1905
1906             /* Calculate temporary vectorial force */
1907             tx               = _mm256_mul_ps(fscal,dx12);
1908             ty               = _mm256_mul_ps(fscal,dy12);
1909             tz               = _mm256_mul_ps(fscal,dz12);
1910
1911             /* Update vectorial force */
1912             fix1             = _mm256_add_ps(fix1,tx);
1913             fiy1             = _mm256_add_ps(fiy1,ty);
1914             fiz1             = _mm256_add_ps(fiz1,tz);
1915
1916             fjx2             = _mm256_add_ps(fjx2,tx);
1917             fjy2             = _mm256_add_ps(fjy2,ty);
1918             fjz2             = _mm256_add_ps(fjz2,tz);
1919
1920             /**************************
1921              * CALCULATE INTERACTIONS *
1922              **************************/
1923
1924             r13              = _mm256_mul_ps(rsq13,rinv13);
1925
1926             /* Calculate table index by multiplying r with table scale and truncate to integer */
1927             rt               = _mm256_mul_ps(r13,vftabscale);
1928             vfitab           = _mm256_cvttps_epi32(rt);
1929             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1930             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1931             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1932             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1933             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1934             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1935
1936             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1937             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1938                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1939             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1940                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1941             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1942                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1943             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1944                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1945             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1946             Heps             = _mm256_mul_ps(vfeps,H);
1947             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1948             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1949             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
1950
1951             fscal            = felec;
1952
1953             /* Calculate temporary vectorial force */
1954             tx               = _mm256_mul_ps(fscal,dx13);
1955             ty               = _mm256_mul_ps(fscal,dy13);
1956             tz               = _mm256_mul_ps(fscal,dz13);
1957
1958             /* Update vectorial force */
1959             fix1             = _mm256_add_ps(fix1,tx);
1960             fiy1             = _mm256_add_ps(fiy1,ty);
1961             fiz1             = _mm256_add_ps(fiz1,tz);
1962
1963             fjx3             = _mm256_add_ps(fjx3,tx);
1964             fjy3             = _mm256_add_ps(fjy3,ty);
1965             fjz3             = _mm256_add_ps(fjz3,tz);
1966
1967             /**************************
1968              * CALCULATE INTERACTIONS *
1969              **************************/
1970
1971             r21              = _mm256_mul_ps(rsq21,rinv21);
1972
1973             /* Calculate table index by multiplying r with table scale and truncate to integer */
1974             rt               = _mm256_mul_ps(r21,vftabscale);
1975             vfitab           = _mm256_cvttps_epi32(rt);
1976             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1977             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1978             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1979             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1980             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1981             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1982
1983             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1984             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1985                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1986             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1987                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1988             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1989                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1990             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1991                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1992             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1993             Heps             = _mm256_mul_ps(vfeps,H);
1994             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1995             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1996             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1997
1998             fscal            = felec;
1999
2000             /* Calculate temporary vectorial force */
2001             tx               = _mm256_mul_ps(fscal,dx21);
2002             ty               = _mm256_mul_ps(fscal,dy21);
2003             tz               = _mm256_mul_ps(fscal,dz21);
2004
2005             /* Update vectorial force */
2006             fix2             = _mm256_add_ps(fix2,tx);
2007             fiy2             = _mm256_add_ps(fiy2,ty);
2008             fiz2             = _mm256_add_ps(fiz2,tz);
2009
2010             fjx1             = _mm256_add_ps(fjx1,tx);
2011             fjy1             = _mm256_add_ps(fjy1,ty);
2012             fjz1             = _mm256_add_ps(fjz1,tz);
2013
2014             /**************************
2015              * CALCULATE INTERACTIONS *
2016              **************************/
2017
2018             r22              = _mm256_mul_ps(rsq22,rinv22);
2019
2020             /* Calculate table index by multiplying r with table scale and truncate to integer */
2021             rt               = _mm256_mul_ps(r22,vftabscale);
2022             vfitab           = _mm256_cvttps_epi32(rt);
2023             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2024             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2025             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2026             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2027             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2028             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2029
2030             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2031             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2032                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2033             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2034                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2035             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2036                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2037             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2038                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2039             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2040             Heps             = _mm256_mul_ps(vfeps,H);
2041             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2042             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2043             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2044
2045             fscal            = felec;
2046
2047             /* Calculate temporary vectorial force */
2048             tx               = _mm256_mul_ps(fscal,dx22);
2049             ty               = _mm256_mul_ps(fscal,dy22);
2050             tz               = _mm256_mul_ps(fscal,dz22);
2051
2052             /* Update vectorial force */
2053             fix2             = _mm256_add_ps(fix2,tx);
2054             fiy2             = _mm256_add_ps(fiy2,ty);
2055             fiz2             = _mm256_add_ps(fiz2,tz);
2056
2057             fjx2             = _mm256_add_ps(fjx2,tx);
2058             fjy2             = _mm256_add_ps(fjy2,ty);
2059             fjz2             = _mm256_add_ps(fjz2,tz);
2060
2061             /**************************
2062              * CALCULATE INTERACTIONS *
2063              **************************/
2064
2065             r23              = _mm256_mul_ps(rsq23,rinv23);
2066
2067             /* Calculate table index by multiplying r with table scale and truncate to integer */
2068             rt               = _mm256_mul_ps(r23,vftabscale);
2069             vfitab           = _mm256_cvttps_epi32(rt);
2070             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2071             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2072             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2073             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2074             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2075             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2076
2077             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2078             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2079                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2080             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2081                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2082             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2083                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2084             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2085                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2086             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2087             Heps             = _mm256_mul_ps(vfeps,H);
2088             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2089             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2090             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
2091
2092             fscal            = felec;
2093
2094             /* Calculate temporary vectorial force */
2095             tx               = _mm256_mul_ps(fscal,dx23);
2096             ty               = _mm256_mul_ps(fscal,dy23);
2097             tz               = _mm256_mul_ps(fscal,dz23);
2098
2099             /* Update vectorial force */
2100             fix2             = _mm256_add_ps(fix2,tx);
2101             fiy2             = _mm256_add_ps(fiy2,ty);
2102             fiz2             = _mm256_add_ps(fiz2,tz);
2103
2104             fjx3             = _mm256_add_ps(fjx3,tx);
2105             fjy3             = _mm256_add_ps(fjy3,ty);
2106             fjz3             = _mm256_add_ps(fjz3,tz);
2107
2108             /**************************
2109              * CALCULATE INTERACTIONS *
2110              **************************/
2111
2112             r31              = _mm256_mul_ps(rsq31,rinv31);
2113
2114             /* Calculate table index by multiplying r with table scale and truncate to integer */
2115             rt               = _mm256_mul_ps(r31,vftabscale);
2116             vfitab           = _mm256_cvttps_epi32(rt);
2117             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2118             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2119             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2120             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2121             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2122             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2123
2124             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2125             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2126                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2127             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2128                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2129             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2130                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2131             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2132                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2133             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2134             Heps             = _mm256_mul_ps(vfeps,H);
2135             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2136             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2137             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
2138
2139             fscal            = felec;
2140
2141             /* Calculate temporary vectorial force */
2142             tx               = _mm256_mul_ps(fscal,dx31);
2143             ty               = _mm256_mul_ps(fscal,dy31);
2144             tz               = _mm256_mul_ps(fscal,dz31);
2145
2146             /* Update vectorial force */
2147             fix3             = _mm256_add_ps(fix3,tx);
2148             fiy3             = _mm256_add_ps(fiy3,ty);
2149             fiz3             = _mm256_add_ps(fiz3,tz);
2150
2151             fjx1             = _mm256_add_ps(fjx1,tx);
2152             fjy1             = _mm256_add_ps(fjy1,ty);
2153             fjz1             = _mm256_add_ps(fjz1,tz);
2154
2155             /**************************
2156              * CALCULATE INTERACTIONS *
2157              **************************/
2158
2159             r32              = _mm256_mul_ps(rsq32,rinv32);
2160
2161             /* Calculate table index by multiplying r with table scale and truncate to integer */
2162             rt               = _mm256_mul_ps(r32,vftabscale);
2163             vfitab           = _mm256_cvttps_epi32(rt);
2164             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2165             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2166             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2167             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2168             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2169             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2170
2171             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2172             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2173                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2174             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2175                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2176             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2177                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2178             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2179                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2180             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2181             Heps             = _mm256_mul_ps(vfeps,H);
2182             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2183             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2184             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
2185
2186             fscal            = felec;
2187
2188             /* Calculate temporary vectorial force */
2189             tx               = _mm256_mul_ps(fscal,dx32);
2190             ty               = _mm256_mul_ps(fscal,dy32);
2191             tz               = _mm256_mul_ps(fscal,dz32);
2192
2193             /* Update vectorial force */
2194             fix3             = _mm256_add_ps(fix3,tx);
2195             fiy3             = _mm256_add_ps(fiy3,ty);
2196             fiz3             = _mm256_add_ps(fiz3,tz);
2197
2198             fjx2             = _mm256_add_ps(fjx2,tx);
2199             fjy2             = _mm256_add_ps(fjy2,ty);
2200             fjz2             = _mm256_add_ps(fjz2,tz);
2201
2202             /**************************
2203              * CALCULATE INTERACTIONS *
2204              **************************/
2205
2206             r33              = _mm256_mul_ps(rsq33,rinv33);
2207
2208             /* Calculate table index by multiplying r with table scale and truncate to integer */
2209             rt               = _mm256_mul_ps(r33,vftabscale);
2210             vfitab           = _mm256_cvttps_epi32(rt);
2211             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2212             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2213             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2214             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2215             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2216             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2217
2218             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2219             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2220                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2221             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2222                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2223             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2224                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2225             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2226                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2227             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2228             Heps             = _mm256_mul_ps(vfeps,H);
2229             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2230             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2231             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
2232
2233             fscal            = felec;
2234
2235             /* Calculate temporary vectorial force */
2236             tx               = _mm256_mul_ps(fscal,dx33);
2237             ty               = _mm256_mul_ps(fscal,dy33);
2238             tz               = _mm256_mul_ps(fscal,dz33);
2239
2240             /* Update vectorial force */
2241             fix3             = _mm256_add_ps(fix3,tx);
2242             fiy3             = _mm256_add_ps(fiy3,ty);
2243             fiz3             = _mm256_add_ps(fiz3,tz);
2244
2245             fjx3             = _mm256_add_ps(fjx3,tx);
2246             fjy3             = _mm256_add_ps(fjy3,ty);
2247             fjz3             = _mm256_add_ps(fjz3,tz);
2248
2249             fjptrA             = f+j_coord_offsetA;
2250             fjptrB             = f+j_coord_offsetB;
2251             fjptrC             = f+j_coord_offsetC;
2252             fjptrD             = f+j_coord_offsetD;
2253             fjptrE             = f+j_coord_offsetE;
2254             fjptrF             = f+j_coord_offsetF;
2255             fjptrG             = f+j_coord_offsetG;
2256             fjptrH             = f+j_coord_offsetH;
2257
2258             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2259                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2260                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2261
2262             /* Inner loop uses 381 flops */
2263         }
2264
2265         if(jidx<j_index_end)
2266         {
2267
2268             /* Get j neighbor index, and coordinate index */
2269             jnrlistA         = jjnr[jidx];
2270             jnrlistB         = jjnr[jidx+1];
2271             jnrlistC         = jjnr[jidx+2];
2272             jnrlistD         = jjnr[jidx+3];
2273             jnrlistE         = jjnr[jidx+4];
2274             jnrlistF         = jjnr[jidx+5];
2275             jnrlistG         = jjnr[jidx+6];
2276             jnrlistH         = jjnr[jidx+7];
2277             /* Sign of each element will be negative for non-real atoms.
2278              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2279              * so use it as val = _mm256_andnot_ps(dummy_mask,val) to clear dummy entries.
2280              */
2281             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2282                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2283                                             
2284             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2285             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2286             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2287             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2288             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
2289             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
2290             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
2291             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
2292             j_coord_offsetA  = DIM*jnrA;
2293             j_coord_offsetB  = DIM*jnrB;
2294             j_coord_offsetC  = DIM*jnrC;
2295             j_coord_offsetD  = DIM*jnrD;
2296             j_coord_offsetE  = DIM*jnrE;
2297             j_coord_offsetF  = DIM*jnrF;
2298             j_coord_offsetG  = DIM*jnrG;
2299             j_coord_offsetH  = DIM*jnrH;
2300
2301             /* load j atom coordinates */
2302             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2303                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2304                                                  x+j_coord_offsetE,x+j_coord_offsetF,
2305                                                  x+j_coord_offsetG,x+j_coord_offsetH,
2306                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2307                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
2308
2309             /* Calculate displacement vector */
2310             dx00             = _mm256_sub_ps(ix0,jx0);
2311             dy00             = _mm256_sub_ps(iy0,jy0);
2312             dz00             = _mm256_sub_ps(iz0,jz0);
2313             dx11             = _mm256_sub_ps(ix1,jx1);
2314             dy11             = _mm256_sub_ps(iy1,jy1);
2315             dz11             = _mm256_sub_ps(iz1,jz1);
2316             dx12             = _mm256_sub_ps(ix1,jx2);
2317             dy12             = _mm256_sub_ps(iy1,jy2);
2318             dz12             = _mm256_sub_ps(iz1,jz2);
2319             dx13             = _mm256_sub_ps(ix1,jx3);
2320             dy13             = _mm256_sub_ps(iy1,jy3);
2321             dz13             = _mm256_sub_ps(iz1,jz3);
2322             dx21             = _mm256_sub_ps(ix2,jx1);
2323             dy21             = _mm256_sub_ps(iy2,jy1);
2324             dz21             = _mm256_sub_ps(iz2,jz1);
2325             dx22             = _mm256_sub_ps(ix2,jx2);
2326             dy22             = _mm256_sub_ps(iy2,jy2);
2327             dz22             = _mm256_sub_ps(iz2,jz2);
2328             dx23             = _mm256_sub_ps(ix2,jx3);
2329             dy23             = _mm256_sub_ps(iy2,jy3);
2330             dz23             = _mm256_sub_ps(iz2,jz3);
2331             dx31             = _mm256_sub_ps(ix3,jx1);
2332             dy31             = _mm256_sub_ps(iy3,jy1);
2333             dz31             = _mm256_sub_ps(iz3,jz1);
2334             dx32             = _mm256_sub_ps(ix3,jx2);
2335             dy32             = _mm256_sub_ps(iy3,jy2);
2336             dz32             = _mm256_sub_ps(iz3,jz2);
2337             dx33             = _mm256_sub_ps(ix3,jx3);
2338             dy33             = _mm256_sub_ps(iy3,jy3);
2339             dz33             = _mm256_sub_ps(iz3,jz3);
2340
2341             /* Calculate squared distance and things based on it */
2342             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2343             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2344             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2345             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2346             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2347             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2348             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2349             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2350             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2351             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2352
2353             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
2354             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
2355             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
2356             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
2357             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
2358             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
2359             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
2360             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
2361             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
2362
2363             rinvsq00         = gmx_mm256_inv_ps(rsq00);
2364
2365             fjx0             = _mm256_setzero_ps();
2366             fjy0             = _mm256_setzero_ps();
2367             fjz0             = _mm256_setzero_ps();
2368             fjx1             = _mm256_setzero_ps();
2369             fjy1             = _mm256_setzero_ps();
2370             fjz1             = _mm256_setzero_ps();
2371             fjx2             = _mm256_setzero_ps();
2372             fjy2             = _mm256_setzero_ps();
2373             fjz2             = _mm256_setzero_ps();
2374             fjx3             = _mm256_setzero_ps();
2375             fjy3             = _mm256_setzero_ps();
2376             fjz3             = _mm256_setzero_ps();
2377
2378             /**************************
2379              * CALCULATE INTERACTIONS *
2380              **************************/
2381
2382             /* LENNARD-JONES DISPERSION/REPULSION */
2383
2384             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2385             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
2386
2387             fscal            = fvdw;
2388
2389             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2390
2391             /* Calculate temporary vectorial force */
2392             tx               = _mm256_mul_ps(fscal,dx00);
2393             ty               = _mm256_mul_ps(fscal,dy00);
2394             tz               = _mm256_mul_ps(fscal,dz00);
2395
2396             /* Update vectorial force */
2397             fix0             = _mm256_add_ps(fix0,tx);
2398             fiy0             = _mm256_add_ps(fiy0,ty);
2399             fiz0             = _mm256_add_ps(fiz0,tz);
2400
2401             fjx0             = _mm256_add_ps(fjx0,tx);
2402             fjy0             = _mm256_add_ps(fjy0,ty);
2403             fjz0             = _mm256_add_ps(fjz0,tz);
2404
2405             /**************************
2406              * CALCULATE INTERACTIONS *
2407              **************************/
2408
2409             r11              = _mm256_mul_ps(rsq11,rinv11);
2410             r11              = _mm256_andnot_ps(dummy_mask,r11);
2411
2412             /* Calculate table index by multiplying r with table scale and truncate to integer */
2413             rt               = _mm256_mul_ps(r11,vftabscale);
2414             vfitab           = _mm256_cvttps_epi32(rt);
2415             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2416             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2417             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2418             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2419             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2420             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2421
2422             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2423             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2424                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2425             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2426                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2427             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2428                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2429             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2430                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2431             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2432             Heps             = _mm256_mul_ps(vfeps,H);
2433             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2434             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2435             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2436
2437             fscal            = felec;
2438
2439             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2440
2441             /* Calculate temporary vectorial force */
2442             tx               = _mm256_mul_ps(fscal,dx11);
2443             ty               = _mm256_mul_ps(fscal,dy11);
2444             tz               = _mm256_mul_ps(fscal,dz11);
2445
2446             /* Update vectorial force */
2447             fix1             = _mm256_add_ps(fix1,tx);
2448             fiy1             = _mm256_add_ps(fiy1,ty);
2449             fiz1             = _mm256_add_ps(fiz1,tz);
2450
2451             fjx1             = _mm256_add_ps(fjx1,tx);
2452             fjy1             = _mm256_add_ps(fjy1,ty);
2453             fjz1             = _mm256_add_ps(fjz1,tz);
2454
2455             /**************************
2456              * CALCULATE INTERACTIONS *
2457              **************************/
2458
2459             r12              = _mm256_mul_ps(rsq12,rinv12);
2460             r12              = _mm256_andnot_ps(dummy_mask,r12);
2461
2462             /* Calculate table index by multiplying r with table scale and truncate to integer */
2463             rt               = _mm256_mul_ps(r12,vftabscale);
2464             vfitab           = _mm256_cvttps_epi32(rt);
2465             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2466             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2467             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2468             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2469             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2470             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2471
2472             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2473             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2474                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2475             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2476                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2477             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2478                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2479             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2480                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2481             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2482             Heps             = _mm256_mul_ps(vfeps,H);
2483             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2484             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2485             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2486
2487             fscal            = felec;
2488
2489             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2490
2491             /* Calculate temporary vectorial force */
2492             tx               = _mm256_mul_ps(fscal,dx12);
2493             ty               = _mm256_mul_ps(fscal,dy12);
2494             tz               = _mm256_mul_ps(fscal,dz12);
2495
2496             /* Update vectorial force */
2497             fix1             = _mm256_add_ps(fix1,tx);
2498             fiy1             = _mm256_add_ps(fiy1,ty);
2499             fiz1             = _mm256_add_ps(fiz1,tz);
2500
2501             fjx2             = _mm256_add_ps(fjx2,tx);
2502             fjy2             = _mm256_add_ps(fjy2,ty);
2503             fjz2             = _mm256_add_ps(fjz2,tz);
2504
2505             /**************************
2506              * CALCULATE INTERACTIONS *
2507              **************************/
2508
2509             r13              = _mm256_mul_ps(rsq13,rinv13);
2510             r13              = _mm256_andnot_ps(dummy_mask,r13);
2511
2512             /* Calculate table index by multiplying r with table scale and truncate to integer */
2513             rt               = _mm256_mul_ps(r13,vftabscale);
2514             vfitab           = _mm256_cvttps_epi32(rt);
2515             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2516             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2517             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2518             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2519             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2520             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2521
2522             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2523             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2524                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2525             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2526                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2527             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2528                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2529             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2530                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2531             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2532             Heps             = _mm256_mul_ps(vfeps,H);
2533             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2534             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2535             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
2536
2537             fscal            = felec;
2538
2539             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2540
2541             /* Calculate temporary vectorial force */
2542             tx               = _mm256_mul_ps(fscal,dx13);
2543             ty               = _mm256_mul_ps(fscal,dy13);
2544             tz               = _mm256_mul_ps(fscal,dz13);
2545
2546             /* Update vectorial force */
2547             fix1             = _mm256_add_ps(fix1,tx);
2548             fiy1             = _mm256_add_ps(fiy1,ty);
2549             fiz1             = _mm256_add_ps(fiz1,tz);
2550
2551             fjx3             = _mm256_add_ps(fjx3,tx);
2552             fjy3             = _mm256_add_ps(fjy3,ty);
2553             fjz3             = _mm256_add_ps(fjz3,tz);
2554
2555             /**************************
2556              * CALCULATE INTERACTIONS *
2557              **************************/
2558
2559             r21              = _mm256_mul_ps(rsq21,rinv21);
2560             r21              = _mm256_andnot_ps(dummy_mask,r21);
2561
2562             /* Calculate table index by multiplying r with table scale and truncate to integer */
2563             rt               = _mm256_mul_ps(r21,vftabscale);
2564             vfitab           = _mm256_cvttps_epi32(rt);
2565             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2566             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2567             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2568             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2569             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2570             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2571
2572             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2573             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2574                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2575             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2576                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2577             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2578                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2579             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2580                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2581             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2582             Heps             = _mm256_mul_ps(vfeps,H);
2583             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2584             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2585             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2586
2587             fscal            = felec;
2588
2589             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2590
2591             /* Calculate temporary vectorial force */
2592             tx               = _mm256_mul_ps(fscal,dx21);
2593             ty               = _mm256_mul_ps(fscal,dy21);
2594             tz               = _mm256_mul_ps(fscal,dz21);
2595
2596             /* Update vectorial force */
2597             fix2             = _mm256_add_ps(fix2,tx);
2598             fiy2             = _mm256_add_ps(fiy2,ty);
2599             fiz2             = _mm256_add_ps(fiz2,tz);
2600
2601             fjx1             = _mm256_add_ps(fjx1,tx);
2602             fjy1             = _mm256_add_ps(fjy1,ty);
2603             fjz1             = _mm256_add_ps(fjz1,tz);
2604
2605             /**************************
2606              * CALCULATE INTERACTIONS *
2607              **************************/
2608
2609             r22              = _mm256_mul_ps(rsq22,rinv22);
2610             r22              = _mm256_andnot_ps(dummy_mask,r22);
2611
2612             /* Calculate table index by multiplying r with table scale and truncate to integer */
2613             rt               = _mm256_mul_ps(r22,vftabscale);
2614             vfitab           = _mm256_cvttps_epi32(rt);
2615             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2616             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2617             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2618             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2619             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2620             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2621
2622             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2623             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2624                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2625             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2626                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2627             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2628                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2629             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2630                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2631             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2632             Heps             = _mm256_mul_ps(vfeps,H);
2633             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2634             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2635             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2636
2637             fscal            = felec;
2638
2639             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2640
2641             /* Calculate temporary vectorial force */
2642             tx               = _mm256_mul_ps(fscal,dx22);
2643             ty               = _mm256_mul_ps(fscal,dy22);
2644             tz               = _mm256_mul_ps(fscal,dz22);
2645
2646             /* Update vectorial force */
2647             fix2             = _mm256_add_ps(fix2,tx);
2648             fiy2             = _mm256_add_ps(fiy2,ty);
2649             fiz2             = _mm256_add_ps(fiz2,tz);
2650
2651             fjx2             = _mm256_add_ps(fjx2,tx);
2652             fjy2             = _mm256_add_ps(fjy2,ty);
2653             fjz2             = _mm256_add_ps(fjz2,tz);
2654
2655             /**************************
2656              * CALCULATE INTERACTIONS *
2657              **************************/
2658
2659             r23              = _mm256_mul_ps(rsq23,rinv23);
2660             r23              = _mm256_andnot_ps(dummy_mask,r23);
2661
2662             /* Calculate table index by multiplying r with table scale and truncate to integer */
2663             rt               = _mm256_mul_ps(r23,vftabscale);
2664             vfitab           = _mm256_cvttps_epi32(rt);
2665             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2666             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2667             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2668             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2669             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2670             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2671
2672             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2673             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2674                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2675             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2676                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2677             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2678                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2679             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2680                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2681             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2682             Heps             = _mm256_mul_ps(vfeps,H);
2683             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2684             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2685             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
2686
2687             fscal            = felec;
2688
2689             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2690
2691             /* Calculate temporary vectorial force */
2692             tx               = _mm256_mul_ps(fscal,dx23);
2693             ty               = _mm256_mul_ps(fscal,dy23);
2694             tz               = _mm256_mul_ps(fscal,dz23);
2695
2696             /* Update vectorial force */
2697             fix2             = _mm256_add_ps(fix2,tx);
2698             fiy2             = _mm256_add_ps(fiy2,ty);
2699             fiz2             = _mm256_add_ps(fiz2,tz);
2700
2701             fjx3             = _mm256_add_ps(fjx3,tx);
2702             fjy3             = _mm256_add_ps(fjy3,ty);
2703             fjz3             = _mm256_add_ps(fjz3,tz);
2704
2705             /**************************
2706              * CALCULATE INTERACTIONS *
2707              **************************/
2708
2709             r31              = _mm256_mul_ps(rsq31,rinv31);
2710             r31              = _mm256_andnot_ps(dummy_mask,r31);
2711
2712             /* Calculate table index by multiplying r with table scale and truncate to integer */
2713             rt               = _mm256_mul_ps(r31,vftabscale);
2714             vfitab           = _mm256_cvttps_epi32(rt);
2715             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2716             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2717             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2718             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2719             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2720             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2721
2722             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2723             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2724                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2725             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2726                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2727             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2728                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2729             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2730                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2731             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2732             Heps             = _mm256_mul_ps(vfeps,H);
2733             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2734             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2735             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
2736
2737             fscal            = felec;
2738
2739             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2740
2741             /* Calculate temporary vectorial force */
2742             tx               = _mm256_mul_ps(fscal,dx31);
2743             ty               = _mm256_mul_ps(fscal,dy31);
2744             tz               = _mm256_mul_ps(fscal,dz31);
2745
2746             /* Update vectorial force */
2747             fix3             = _mm256_add_ps(fix3,tx);
2748             fiy3             = _mm256_add_ps(fiy3,ty);
2749             fiz3             = _mm256_add_ps(fiz3,tz);
2750
2751             fjx1             = _mm256_add_ps(fjx1,tx);
2752             fjy1             = _mm256_add_ps(fjy1,ty);
2753             fjz1             = _mm256_add_ps(fjz1,tz);
2754
2755             /**************************
2756              * CALCULATE INTERACTIONS *
2757              **************************/
2758
2759             r32              = _mm256_mul_ps(rsq32,rinv32);
2760             r32              = _mm256_andnot_ps(dummy_mask,r32);
2761
2762             /* Calculate table index by multiplying r with table scale and truncate to integer */
2763             rt               = _mm256_mul_ps(r32,vftabscale);
2764             vfitab           = _mm256_cvttps_epi32(rt);
2765             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2766             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2767             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2768             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2769             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2770             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2771
2772             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2773             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2774                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2775             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2776                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2777             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2778                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2779             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2780                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2781             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2782             Heps             = _mm256_mul_ps(vfeps,H);
2783             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2784             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2785             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
2786
2787             fscal            = felec;
2788
2789             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2790
2791             /* Calculate temporary vectorial force */
2792             tx               = _mm256_mul_ps(fscal,dx32);
2793             ty               = _mm256_mul_ps(fscal,dy32);
2794             tz               = _mm256_mul_ps(fscal,dz32);
2795
2796             /* Update vectorial force */
2797             fix3             = _mm256_add_ps(fix3,tx);
2798             fiy3             = _mm256_add_ps(fiy3,ty);
2799             fiz3             = _mm256_add_ps(fiz3,tz);
2800
2801             fjx2             = _mm256_add_ps(fjx2,tx);
2802             fjy2             = _mm256_add_ps(fjy2,ty);
2803             fjz2             = _mm256_add_ps(fjz2,tz);
2804
2805             /**************************
2806              * CALCULATE INTERACTIONS *
2807              **************************/
2808
2809             r33              = _mm256_mul_ps(rsq33,rinv33);
2810             r33              = _mm256_andnot_ps(dummy_mask,r33);
2811
2812             /* Calculate table index by multiplying r with table scale and truncate to integer */
2813             rt               = _mm256_mul_ps(r33,vftabscale);
2814             vfitab           = _mm256_cvttps_epi32(rt);
2815             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2816             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2817             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2818             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2819             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2820             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2821
2822             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2823             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2824                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2825             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2826                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2827             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2828                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2829             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2830                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2831             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2832             Heps             = _mm256_mul_ps(vfeps,H);
2833             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2834             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2835             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
2836
2837             fscal            = felec;
2838
2839             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2840
2841             /* Calculate temporary vectorial force */
2842             tx               = _mm256_mul_ps(fscal,dx33);
2843             ty               = _mm256_mul_ps(fscal,dy33);
2844             tz               = _mm256_mul_ps(fscal,dz33);
2845
2846             /* Update vectorial force */
2847             fix3             = _mm256_add_ps(fix3,tx);
2848             fiy3             = _mm256_add_ps(fiy3,ty);
2849             fiz3             = _mm256_add_ps(fiz3,tz);
2850
2851             fjx3             = _mm256_add_ps(fjx3,tx);
2852             fjy3             = _mm256_add_ps(fjy3,ty);
2853             fjz3             = _mm256_add_ps(fjz3,tz);
2854
2855             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2856             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2857             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2858             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2859             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2860             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2861             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2862             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2863
2864             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2865                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2866                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2867
2868             /* Inner loop uses 390 flops */
2869         }
2870
2871         /* End of innermost loop */
2872
2873         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2874                                                  f+i_coord_offset,fshift+i_shift_offset);
2875
2876         /* Increment number of inner iterations */
2877         inneriter                  += j_index_end - j_index_start;
2878
2879         /* Outer loop uses 24 flops */
2880     }
2881
2882     /* Increment number of outer iterations */
2883     outeriter        += nri;
2884
2885     /* Update outer/inner flops */
2886
2887     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*390);
2888 }