Version bumps after new release
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_single
54  * Electrostatics interaction: CubicSplineTable
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72      * jnr indices corresponding to data put in the eight positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrE,jnrF,jnrG,jnrH;
78     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
83     real             rcutoff_scalar;
84     real             *shiftvec,*fshift,*x,*f;
85     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
86     real             scratch[4*DIM];
87     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88     real *           vdwioffsetptr0;
89     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90     real *           vdwioffsetptr1;
91     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92     real *           vdwioffsetptr2;
93     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
95     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
97     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
99     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
101     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
102     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
103     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
104     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
105     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
106     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
107     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
108     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
109     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
110     real             *charge;
111     int              nvdwtype;
112     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113     int              *vdwtype;
114     real             *vdwparam;
115     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
116     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
117     __m256i          vfitab;
118     __m128i          vfitab_lo,vfitab_hi;
119     __m128i          ifour       = _mm_set1_epi32(4);
120     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
121     real             *vftab;
122     __m256           dummy_mask,cutoff_mask;
123     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
124     __m256           one     = _mm256_set1_ps(1.0);
125     __m256           two     = _mm256_set1_ps(2.0);
126     x                = xx[0];
127     f                = ff[0];
128
129     nri              = nlist->nri;
130     iinr             = nlist->iinr;
131     jindex           = nlist->jindex;
132     jjnr             = nlist->jjnr;
133     shiftidx         = nlist->shift;
134     gid              = nlist->gid;
135     shiftvec         = fr->shift_vec[0];
136     fshift           = fr->fshift[0];
137     facel            = _mm256_set1_ps(fr->epsfac);
138     charge           = mdatoms->chargeA;
139     nvdwtype         = fr->ntype;
140     vdwparam         = fr->nbfp;
141     vdwtype          = mdatoms->typeA;
142
143     vftab            = kernel_data->table_elec->data;
144     vftabscale       = _mm256_set1_ps(kernel_data->table_elec->scale);
145
146     /* Setup water-specific parameters */
147     inr              = nlist->iinr[0];
148     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
149     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
150     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
151     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
152
153     jq0              = _mm256_set1_ps(charge[inr+0]);
154     jq1              = _mm256_set1_ps(charge[inr+1]);
155     jq2              = _mm256_set1_ps(charge[inr+2]);
156     vdwjidx0A        = 2*vdwtype[inr+0];
157     qq00             = _mm256_mul_ps(iq0,jq0);
158     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
159     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
160     qq01             = _mm256_mul_ps(iq0,jq1);
161     qq02             = _mm256_mul_ps(iq0,jq2);
162     qq10             = _mm256_mul_ps(iq1,jq0);
163     qq11             = _mm256_mul_ps(iq1,jq1);
164     qq12             = _mm256_mul_ps(iq1,jq2);
165     qq20             = _mm256_mul_ps(iq2,jq0);
166     qq21             = _mm256_mul_ps(iq2,jq1);
167     qq22             = _mm256_mul_ps(iq2,jq2);
168
169     /* Avoid stupid compiler warnings */
170     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
171     j_coord_offsetA = 0;
172     j_coord_offsetB = 0;
173     j_coord_offsetC = 0;
174     j_coord_offsetD = 0;
175     j_coord_offsetE = 0;
176     j_coord_offsetF = 0;
177     j_coord_offsetG = 0;
178     j_coord_offsetH = 0;
179
180     outeriter        = 0;
181     inneriter        = 0;
182
183     for(iidx=0;iidx<4*DIM;iidx++)
184     {
185         scratch[iidx] = 0.0;
186     }
187
188     /* Start outer loop over neighborlists */
189     for(iidx=0; iidx<nri; iidx++)
190     {
191         /* Load shift vector for this list */
192         i_shift_offset   = DIM*shiftidx[iidx];
193
194         /* Load limits for loop over neighbors */
195         j_index_start    = jindex[iidx];
196         j_index_end      = jindex[iidx+1];
197
198         /* Get outer coordinate index */
199         inr              = iinr[iidx];
200         i_coord_offset   = DIM*inr;
201
202         /* Load i particle coords and add shift vector */
203         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
204                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
205
206         fix0             = _mm256_setzero_ps();
207         fiy0             = _mm256_setzero_ps();
208         fiz0             = _mm256_setzero_ps();
209         fix1             = _mm256_setzero_ps();
210         fiy1             = _mm256_setzero_ps();
211         fiz1             = _mm256_setzero_ps();
212         fix2             = _mm256_setzero_ps();
213         fiy2             = _mm256_setzero_ps();
214         fiz2             = _mm256_setzero_ps();
215
216         /* Reset potential sums */
217         velecsum         = _mm256_setzero_ps();
218         vvdwsum          = _mm256_setzero_ps();
219
220         /* Start inner kernel loop */
221         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
222         {
223
224             /* Get j neighbor index, and coordinate index */
225             jnrA             = jjnr[jidx];
226             jnrB             = jjnr[jidx+1];
227             jnrC             = jjnr[jidx+2];
228             jnrD             = jjnr[jidx+3];
229             jnrE             = jjnr[jidx+4];
230             jnrF             = jjnr[jidx+5];
231             jnrG             = jjnr[jidx+6];
232             jnrH             = jjnr[jidx+7];
233             j_coord_offsetA  = DIM*jnrA;
234             j_coord_offsetB  = DIM*jnrB;
235             j_coord_offsetC  = DIM*jnrC;
236             j_coord_offsetD  = DIM*jnrD;
237             j_coord_offsetE  = DIM*jnrE;
238             j_coord_offsetF  = DIM*jnrF;
239             j_coord_offsetG  = DIM*jnrG;
240             j_coord_offsetH  = DIM*jnrH;
241
242             /* load j atom coordinates */
243             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
244                                                  x+j_coord_offsetC,x+j_coord_offsetD,
245                                                  x+j_coord_offsetE,x+j_coord_offsetF,
246                                                  x+j_coord_offsetG,x+j_coord_offsetH,
247                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
248
249             /* Calculate displacement vector */
250             dx00             = _mm256_sub_ps(ix0,jx0);
251             dy00             = _mm256_sub_ps(iy0,jy0);
252             dz00             = _mm256_sub_ps(iz0,jz0);
253             dx01             = _mm256_sub_ps(ix0,jx1);
254             dy01             = _mm256_sub_ps(iy0,jy1);
255             dz01             = _mm256_sub_ps(iz0,jz1);
256             dx02             = _mm256_sub_ps(ix0,jx2);
257             dy02             = _mm256_sub_ps(iy0,jy2);
258             dz02             = _mm256_sub_ps(iz0,jz2);
259             dx10             = _mm256_sub_ps(ix1,jx0);
260             dy10             = _mm256_sub_ps(iy1,jy0);
261             dz10             = _mm256_sub_ps(iz1,jz0);
262             dx11             = _mm256_sub_ps(ix1,jx1);
263             dy11             = _mm256_sub_ps(iy1,jy1);
264             dz11             = _mm256_sub_ps(iz1,jz1);
265             dx12             = _mm256_sub_ps(ix1,jx2);
266             dy12             = _mm256_sub_ps(iy1,jy2);
267             dz12             = _mm256_sub_ps(iz1,jz2);
268             dx20             = _mm256_sub_ps(ix2,jx0);
269             dy20             = _mm256_sub_ps(iy2,jy0);
270             dz20             = _mm256_sub_ps(iz2,jz0);
271             dx21             = _mm256_sub_ps(ix2,jx1);
272             dy21             = _mm256_sub_ps(iy2,jy1);
273             dz21             = _mm256_sub_ps(iz2,jz1);
274             dx22             = _mm256_sub_ps(ix2,jx2);
275             dy22             = _mm256_sub_ps(iy2,jy2);
276             dz22             = _mm256_sub_ps(iz2,jz2);
277
278             /* Calculate squared distance and things based on it */
279             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
280             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
281             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
282             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
283             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
284             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
285             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
286             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
287             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
288
289             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
290             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
291             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
292             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
293             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
294             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
295             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
296             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
297             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
298
299             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
300
301             fjx0             = _mm256_setzero_ps();
302             fjy0             = _mm256_setzero_ps();
303             fjz0             = _mm256_setzero_ps();
304             fjx1             = _mm256_setzero_ps();
305             fjy1             = _mm256_setzero_ps();
306             fjz1             = _mm256_setzero_ps();
307             fjx2             = _mm256_setzero_ps();
308             fjy2             = _mm256_setzero_ps();
309             fjz2             = _mm256_setzero_ps();
310
311             /**************************
312              * CALCULATE INTERACTIONS *
313              **************************/
314
315             r00              = _mm256_mul_ps(rsq00,rinv00);
316
317             /* Calculate table index by multiplying r with table scale and truncate to integer */
318             rt               = _mm256_mul_ps(r00,vftabscale);
319             vfitab           = _mm256_cvttps_epi32(rt);
320             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
321             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
322             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
323             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
324             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
325             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
326
327             /* CUBIC SPLINE TABLE ELECTROSTATICS */
328             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
329                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
330             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
331                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
332             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
333                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
334             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
335                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
336             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
337             Heps             = _mm256_mul_ps(vfeps,H);
338             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
339             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
340             velec            = _mm256_mul_ps(qq00,VV);
341             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
342             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
343
344             /* LENNARD-JONES DISPERSION/REPULSION */
345
346             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
347             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
348             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
349             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
350             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
351
352             /* Update potential sum for this i atom from the interaction with this j atom. */
353             velecsum         = _mm256_add_ps(velecsum,velec);
354             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
355
356             fscal            = _mm256_add_ps(felec,fvdw);
357
358             /* Calculate temporary vectorial force */
359             tx               = _mm256_mul_ps(fscal,dx00);
360             ty               = _mm256_mul_ps(fscal,dy00);
361             tz               = _mm256_mul_ps(fscal,dz00);
362
363             /* Update vectorial force */
364             fix0             = _mm256_add_ps(fix0,tx);
365             fiy0             = _mm256_add_ps(fiy0,ty);
366             fiz0             = _mm256_add_ps(fiz0,tz);
367
368             fjx0             = _mm256_add_ps(fjx0,tx);
369             fjy0             = _mm256_add_ps(fjy0,ty);
370             fjz0             = _mm256_add_ps(fjz0,tz);
371
372             /**************************
373              * CALCULATE INTERACTIONS *
374              **************************/
375
376             r01              = _mm256_mul_ps(rsq01,rinv01);
377
378             /* Calculate table index by multiplying r with table scale and truncate to integer */
379             rt               = _mm256_mul_ps(r01,vftabscale);
380             vfitab           = _mm256_cvttps_epi32(rt);
381             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
382             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
383             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
384             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
385             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
386             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
387
388             /* CUBIC SPLINE TABLE ELECTROSTATICS */
389             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
390                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
391             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
392                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
393             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
394                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
395             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
396                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
397             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
398             Heps             = _mm256_mul_ps(vfeps,H);
399             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
400             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
401             velec            = _mm256_mul_ps(qq01,VV);
402             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
403             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
404
405             /* Update potential sum for this i atom from the interaction with this j atom. */
406             velecsum         = _mm256_add_ps(velecsum,velec);
407
408             fscal            = felec;
409
410             /* Calculate temporary vectorial force */
411             tx               = _mm256_mul_ps(fscal,dx01);
412             ty               = _mm256_mul_ps(fscal,dy01);
413             tz               = _mm256_mul_ps(fscal,dz01);
414
415             /* Update vectorial force */
416             fix0             = _mm256_add_ps(fix0,tx);
417             fiy0             = _mm256_add_ps(fiy0,ty);
418             fiz0             = _mm256_add_ps(fiz0,tz);
419
420             fjx1             = _mm256_add_ps(fjx1,tx);
421             fjy1             = _mm256_add_ps(fjy1,ty);
422             fjz1             = _mm256_add_ps(fjz1,tz);
423
424             /**************************
425              * CALCULATE INTERACTIONS *
426              **************************/
427
428             r02              = _mm256_mul_ps(rsq02,rinv02);
429
430             /* Calculate table index by multiplying r with table scale and truncate to integer */
431             rt               = _mm256_mul_ps(r02,vftabscale);
432             vfitab           = _mm256_cvttps_epi32(rt);
433             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
434             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
435             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
436             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
437             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
438             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
439
440             /* CUBIC SPLINE TABLE ELECTROSTATICS */
441             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
442                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
443             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
444                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
445             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
446                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
447             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
448                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
449             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
450             Heps             = _mm256_mul_ps(vfeps,H);
451             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
452             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
453             velec            = _mm256_mul_ps(qq02,VV);
454             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
455             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
456
457             /* Update potential sum for this i atom from the interaction with this j atom. */
458             velecsum         = _mm256_add_ps(velecsum,velec);
459
460             fscal            = felec;
461
462             /* Calculate temporary vectorial force */
463             tx               = _mm256_mul_ps(fscal,dx02);
464             ty               = _mm256_mul_ps(fscal,dy02);
465             tz               = _mm256_mul_ps(fscal,dz02);
466
467             /* Update vectorial force */
468             fix0             = _mm256_add_ps(fix0,tx);
469             fiy0             = _mm256_add_ps(fiy0,ty);
470             fiz0             = _mm256_add_ps(fiz0,tz);
471
472             fjx2             = _mm256_add_ps(fjx2,tx);
473             fjy2             = _mm256_add_ps(fjy2,ty);
474             fjz2             = _mm256_add_ps(fjz2,tz);
475
476             /**************************
477              * CALCULATE INTERACTIONS *
478              **************************/
479
480             r10              = _mm256_mul_ps(rsq10,rinv10);
481
482             /* Calculate table index by multiplying r with table scale and truncate to integer */
483             rt               = _mm256_mul_ps(r10,vftabscale);
484             vfitab           = _mm256_cvttps_epi32(rt);
485             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
486             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
487             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
488             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
489             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
490             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
491
492             /* CUBIC SPLINE TABLE ELECTROSTATICS */
493             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
494                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
495             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
496                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
497             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
498                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
499             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
500                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
501             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
502             Heps             = _mm256_mul_ps(vfeps,H);
503             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
504             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
505             velec            = _mm256_mul_ps(qq10,VV);
506             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
507             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
508
509             /* Update potential sum for this i atom from the interaction with this j atom. */
510             velecsum         = _mm256_add_ps(velecsum,velec);
511
512             fscal            = felec;
513
514             /* Calculate temporary vectorial force */
515             tx               = _mm256_mul_ps(fscal,dx10);
516             ty               = _mm256_mul_ps(fscal,dy10);
517             tz               = _mm256_mul_ps(fscal,dz10);
518
519             /* Update vectorial force */
520             fix1             = _mm256_add_ps(fix1,tx);
521             fiy1             = _mm256_add_ps(fiy1,ty);
522             fiz1             = _mm256_add_ps(fiz1,tz);
523
524             fjx0             = _mm256_add_ps(fjx0,tx);
525             fjy0             = _mm256_add_ps(fjy0,ty);
526             fjz0             = _mm256_add_ps(fjz0,tz);
527
528             /**************************
529              * CALCULATE INTERACTIONS *
530              **************************/
531
532             r11              = _mm256_mul_ps(rsq11,rinv11);
533
534             /* Calculate table index by multiplying r with table scale and truncate to integer */
535             rt               = _mm256_mul_ps(r11,vftabscale);
536             vfitab           = _mm256_cvttps_epi32(rt);
537             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
538             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
539             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
540             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
541             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
542             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
543
544             /* CUBIC SPLINE TABLE ELECTROSTATICS */
545             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
546                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
547             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
548                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
549             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
550                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
551             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
552                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
553             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
554             Heps             = _mm256_mul_ps(vfeps,H);
555             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
556             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
557             velec            = _mm256_mul_ps(qq11,VV);
558             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
559             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
560
561             /* Update potential sum for this i atom from the interaction with this j atom. */
562             velecsum         = _mm256_add_ps(velecsum,velec);
563
564             fscal            = felec;
565
566             /* Calculate temporary vectorial force */
567             tx               = _mm256_mul_ps(fscal,dx11);
568             ty               = _mm256_mul_ps(fscal,dy11);
569             tz               = _mm256_mul_ps(fscal,dz11);
570
571             /* Update vectorial force */
572             fix1             = _mm256_add_ps(fix1,tx);
573             fiy1             = _mm256_add_ps(fiy1,ty);
574             fiz1             = _mm256_add_ps(fiz1,tz);
575
576             fjx1             = _mm256_add_ps(fjx1,tx);
577             fjy1             = _mm256_add_ps(fjy1,ty);
578             fjz1             = _mm256_add_ps(fjz1,tz);
579
580             /**************************
581              * CALCULATE INTERACTIONS *
582              **************************/
583
584             r12              = _mm256_mul_ps(rsq12,rinv12);
585
586             /* Calculate table index by multiplying r with table scale and truncate to integer */
587             rt               = _mm256_mul_ps(r12,vftabscale);
588             vfitab           = _mm256_cvttps_epi32(rt);
589             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
590             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
591             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
592             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
593             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
594             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
595
596             /* CUBIC SPLINE TABLE ELECTROSTATICS */
597             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
598                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
599             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
600                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
601             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
602                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
603             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
604                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
605             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
606             Heps             = _mm256_mul_ps(vfeps,H);
607             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
608             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
609             velec            = _mm256_mul_ps(qq12,VV);
610             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
611             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
612
613             /* Update potential sum for this i atom from the interaction with this j atom. */
614             velecsum         = _mm256_add_ps(velecsum,velec);
615
616             fscal            = felec;
617
618             /* Calculate temporary vectorial force */
619             tx               = _mm256_mul_ps(fscal,dx12);
620             ty               = _mm256_mul_ps(fscal,dy12);
621             tz               = _mm256_mul_ps(fscal,dz12);
622
623             /* Update vectorial force */
624             fix1             = _mm256_add_ps(fix1,tx);
625             fiy1             = _mm256_add_ps(fiy1,ty);
626             fiz1             = _mm256_add_ps(fiz1,tz);
627
628             fjx2             = _mm256_add_ps(fjx2,tx);
629             fjy2             = _mm256_add_ps(fjy2,ty);
630             fjz2             = _mm256_add_ps(fjz2,tz);
631
632             /**************************
633              * CALCULATE INTERACTIONS *
634              **************************/
635
636             r20              = _mm256_mul_ps(rsq20,rinv20);
637
638             /* Calculate table index by multiplying r with table scale and truncate to integer */
639             rt               = _mm256_mul_ps(r20,vftabscale);
640             vfitab           = _mm256_cvttps_epi32(rt);
641             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
642             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
643             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
644             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
645             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
646             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
647
648             /* CUBIC SPLINE TABLE ELECTROSTATICS */
649             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
650                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
651             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
652                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
653             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
654                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
655             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
656                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
657             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
658             Heps             = _mm256_mul_ps(vfeps,H);
659             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
660             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
661             velec            = _mm256_mul_ps(qq20,VV);
662             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
663             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
664
665             /* Update potential sum for this i atom from the interaction with this j atom. */
666             velecsum         = _mm256_add_ps(velecsum,velec);
667
668             fscal            = felec;
669
670             /* Calculate temporary vectorial force */
671             tx               = _mm256_mul_ps(fscal,dx20);
672             ty               = _mm256_mul_ps(fscal,dy20);
673             tz               = _mm256_mul_ps(fscal,dz20);
674
675             /* Update vectorial force */
676             fix2             = _mm256_add_ps(fix2,tx);
677             fiy2             = _mm256_add_ps(fiy2,ty);
678             fiz2             = _mm256_add_ps(fiz2,tz);
679
680             fjx0             = _mm256_add_ps(fjx0,tx);
681             fjy0             = _mm256_add_ps(fjy0,ty);
682             fjz0             = _mm256_add_ps(fjz0,tz);
683
684             /**************************
685              * CALCULATE INTERACTIONS *
686              **************************/
687
688             r21              = _mm256_mul_ps(rsq21,rinv21);
689
690             /* Calculate table index by multiplying r with table scale and truncate to integer */
691             rt               = _mm256_mul_ps(r21,vftabscale);
692             vfitab           = _mm256_cvttps_epi32(rt);
693             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
694             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
695             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
696             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
697             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
698             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
699
700             /* CUBIC SPLINE TABLE ELECTROSTATICS */
701             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
702                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
703             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
704                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
705             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
706                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
707             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
708                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
709             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
710             Heps             = _mm256_mul_ps(vfeps,H);
711             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
712             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
713             velec            = _mm256_mul_ps(qq21,VV);
714             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
715             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
716
717             /* Update potential sum for this i atom from the interaction with this j atom. */
718             velecsum         = _mm256_add_ps(velecsum,velec);
719
720             fscal            = felec;
721
722             /* Calculate temporary vectorial force */
723             tx               = _mm256_mul_ps(fscal,dx21);
724             ty               = _mm256_mul_ps(fscal,dy21);
725             tz               = _mm256_mul_ps(fscal,dz21);
726
727             /* Update vectorial force */
728             fix2             = _mm256_add_ps(fix2,tx);
729             fiy2             = _mm256_add_ps(fiy2,ty);
730             fiz2             = _mm256_add_ps(fiz2,tz);
731
732             fjx1             = _mm256_add_ps(fjx1,tx);
733             fjy1             = _mm256_add_ps(fjy1,ty);
734             fjz1             = _mm256_add_ps(fjz1,tz);
735
736             /**************************
737              * CALCULATE INTERACTIONS *
738              **************************/
739
740             r22              = _mm256_mul_ps(rsq22,rinv22);
741
742             /* Calculate table index by multiplying r with table scale and truncate to integer */
743             rt               = _mm256_mul_ps(r22,vftabscale);
744             vfitab           = _mm256_cvttps_epi32(rt);
745             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
746             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
747             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
748             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
749             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
750             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
751
752             /* CUBIC SPLINE TABLE ELECTROSTATICS */
753             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
754                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
755             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
756                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
757             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
758                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
759             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
760                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
761             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
762             Heps             = _mm256_mul_ps(vfeps,H);
763             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
764             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
765             velec            = _mm256_mul_ps(qq22,VV);
766             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
767             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
768
769             /* Update potential sum for this i atom from the interaction with this j atom. */
770             velecsum         = _mm256_add_ps(velecsum,velec);
771
772             fscal            = felec;
773
774             /* Calculate temporary vectorial force */
775             tx               = _mm256_mul_ps(fscal,dx22);
776             ty               = _mm256_mul_ps(fscal,dy22);
777             tz               = _mm256_mul_ps(fscal,dz22);
778
779             /* Update vectorial force */
780             fix2             = _mm256_add_ps(fix2,tx);
781             fiy2             = _mm256_add_ps(fiy2,ty);
782             fiz2             = _mm256_add_ps(fiz2,tz);
783
784             fjx2             = _mm256_add_ps(fjx2,tx);
785             fjy2             = _mm256_add_ps(fjy2,ty);
786             fjz2             = _mm256_add_ps(fjz2,tz);
787
788             fjptrA             = f+j_coord_offsetA;
789             fjptrB             = f+j_coord_offsetB;
790             fjptrC             = f+j_coord_offsetC;
791             fjptrD             = f+j_coord_offsetD;
792             fjptrE             = f+j_coord_offsetE;
793             fjptrF             = f+j_coord_offsetF;
794             fjptrG             = f+j_coord_offsetG;
795             fjptrH             = f+j_coord_offsetH;
796
797             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
798                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
799
800             /* Inner loop uses 400 flops */
801         }
802
803         if(jidx<j_index_end)
804         {
805
806             /* Get j neighbor index, and coordinate index */
807             jnrlistA         = jjnr[jidx];
808             jnrlistB         = jjnr[jidx+1];
809             jnrlistC         = jjnr[jidx+2];
810             jnrlistD         = jjnr[jidx+3];
811             jnrlistE         = jjnr[jidx+4];
812             jnrlistF         = jjnr[jidx+5];
813             jnrlistG         = jjnr[jidx+6];
814             jnrlistH         = jjnr[jidx+7];
815             /* Sign of each element will be negative for non-real atoms.
816              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
817              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
818              */
819             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
820                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
821                                             
822             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
823             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
824             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
825             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
826             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
827             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
828             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
829             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
830             j_coord_offsetA  = DIM*jnrA;
831             j_coord_offsetB  = DIM*jnrB;
832             j_coord_offsetC  = DIM*jnrC;
833             j_coord_offsetD  = DIM*jnrD;
834             j_coord_offsetE  = DIM*jnrE;
835             j_coord_offsetF  = DIM*jnrF;
836             j_coord_offsetG  = DIM*jnrG;
837             j_coord_offsetH  = DIM*jnrH;
838
839             /* load j atom coordinates */
840             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
841                                                  x+j_coord_offsetC,x+j_coord_offsetD,
842                                                  x+j_coord_offsetE,x+j_coord_offsetF,
843                                                  x+j_coord_offsetG,x+j_coord_offsetH,
844                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
845
846             /* Calculate displacement vector */
847             dx00             = _mm256_sub_ps(ix0,jx0);
848             dy00             = _mm256_sub_ps(iy0,jy0);
849             dz00             = _mm256_sub_ps(iz0,jz0);
850             dx01             = _mm256_sub_ps(ix0,jx1);
851             dy01             = _mm256_sub_ps(iy0,jy1);
852             dz01             = _mm256_sub_ps(iz0,jz1);
853             dx02             = _mm256_sub_ps(ix0,jx2);
854             dy02             = _mm256_sub_ps(iy0,jy2);
855             dz02             = _mm256_sub_ps(iz0,jz2);
856             dx10             = _mm256_sub_ps(ix1,jx0);
857             dy10             = _mm256_sub_ps(iy1,jy0);
858             dz10             = _mm256_sub_ps(iz1,jz0);
859             dx11             = _mm256_sub_ps(ix1,jx1);
860             dy11             = _mm256_sub_ps(iy1,jy1);
861             dz11             = _mm256_sub_ps(iz1,jz1);
862             dx12             = _mm256_sub_ps(ix1,jx2);
863             dy12             = _mm256_sub_ps(iy1,jy2);
864             dz12             = _mm256_sub_ps(iz1,jz2);
865             dx20             = _mm256_sub_ps(ix2,jx0);
866             dy20             = _mm256_sub_ps(iy2,jy0);
867             dz20             = _mm256_sub_ps(iz2,jz0);
868             dx21             = _mm256_sub_ps(ix2,jx1);
869             dy21             = _mm256_sub_ps(iy2,jy1);
870             dz21             = _mm256_sub_ps(iz2,jz1);
871             dx22             = _mm256_sub_ps(ix2,jx2);
872             dy22             = _mm256_sub_ps(iy2,jy2);
873             dz22             = _mm256_sub_ps(iz2,jz2);
874
875             /* Calculate squared distance and things based on it */
876             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
877             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
878             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
879             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
880             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
881             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
882             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
883             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
884             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
885
886             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
887             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
888             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
889             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
890             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
891             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
892             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
893             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
894             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
895
896             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
897
898             fjx0             = _mm256_setzero_ps();
899             fjy0             = _mm256_setzero_ps();
900             fjz0             = _mm256_setzero_ps();
901             fjx1             = _mm256_setzero_ps();
902             fjy1             = _mm256_setzero_ps();
903             fjz1             = _mm256_setzero_ps();
904             fjx2             = _mm256_setzero_ps();
905             fjy2             = _mm256_setzero_ps();
906             fjz2             = _mm256_setzero_ps();
907
908             /**************************
909              * CALCULATE INTERACTIONS *
910              **************************/
911
912             r00              = _mm256_mul_ps(rsq00,rinv00);
913             r00              = _mm256_andnot_ps(dummy_mask,r00);
914
915             /* Calculate table index by multiplying r with table scale and truncate to integer */
916             rt               = _mm256_mul_ps(r00,vftabscale);
917             vfitab           = _mm256_cvttps_epi32(rt);
918             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
919             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
920             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
921             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
922             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
923             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
924
925             /* CUBIC SPLINE TABLE ELECTROSTATICS */
926             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
927                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
928             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
929                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
930             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
931                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
932             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
933                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
934             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
935             Heps             = _mm256_mul_ps(vfeps,H);
936             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
937             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
938             velec            = _mm256_mul_ps(qq00,VV);
939             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
940             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
941
942             /* LENNARD-JONES DISPERSION/REPULSION */
943
944             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
945             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
946             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
947             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
948             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
949
950             /* Update potential sum for this i atom from the interaction with this j atom. */
951             velec            = _mm256_andnot_ps(dummy_mask,velec);
952             velecsum         = _mm256_add_ps(velecsum,velec);
953             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
954             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
955
956             fscal            = _mm256_add_ps(felec,fvdw);
957
958             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
959
960             /* Calculate temporary vectorial force */
961             tx               = _mm256_mul_ps(fscal,dx00);
962             ty               = _mm256_mul_ps(fscal,dy00);
963             tz               = _mm256_mul_ps(fscal,dz00);
964
965             /* Update vectorial force */
966             fix0             = _mm256_add_ps(fix0,tx);
967             fiy0             = _mm256_add_ps(fiy0,ty);
968             fiz0             = _mm256_add_ps(fiz0,tz);
969
970             fjx0             = _mm256_add_ps(fjx0,tx);
971             fjy0             = _mm256_add_ps(fjy0,ty);
972             fjz0             = _mm256_add_ps(fjz0,tz);
973
974             /**************************
975              * CALCULATE INTERACTIONS *
976              **************************/
977
978             r01              = _mm256_mul_ps(rsq01,rinv01);
979             r01              = _mm256_andnot_ps(dummy_mask,r01);
980
981             /* Calculate table index by multiplying r with table scale and truncate to integer */
982             rt               = _mm256_mul_ps(r01,vftabscale);
983             vfitab           = _mm256_cvttps_epi32(rt);
984             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
985             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
986             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
987             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
988             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
989             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
990
991             /* CUBIC SPLINE TABLE ELECTROSTATICS */
992             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
993                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
994             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
995                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
996             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
997                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
998             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
999                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1000             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1001             Heps             = _mm256_mul_ps(vfeps,H);
1002             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1003             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1004             velec            = _mm256_mul_ps(qq01,VV);
1005             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1006             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1007
1008             /* Update potential sum for this i atom from the interaction with this j atom. */
1009             velec            = _mm256_andnot_ps(dummy_mask,velec);
1010             velecsum         = _mm256_add_ps(velecsum,velec);
1011
1012             fscal            = felec;
1013
1014             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1015
1016             /* Calculate temporary vectorial force */
1017             tx               = _mm256_mul_ps(fscal,dx01);
1018             ty               = _mm256_mul_ps(fscal,dy01);
1019             tz               = _mm256_mul_ps(fscal,dz01);
1020
1021             /* Update vectorial force */
1022             fix0             = _mm256_add_ps(fix0,tx);
1023             fiy0             = _mm256_add_ps(fiy0,ty);
1024             fiz0             = _mm256_add_ps(fiz0,tz);
1025
1026             fjx1             = _mm256_add_ps(fjx1,tx);
1027             fjy1             = _mm256_add_ps(fjy1,ty);
1028             fjz1             = _mm256_add_ps(fjz1,tz);
1029
1030             /**************************
1031              * CALCULATE INTERACTIONS *
1032              **************************/
1033
1034             r02              = _mm256_mul_ps(rsq02,rinv02);
1035             r02              = _mm256_andnot_ps(dummy_mask,r02);
1036
1037             /* Calculate table index by multiplying r with table scale and truncate to integer */
1038             rt               = _mm256_mul_ps(r02,vftabscale);
1039             vfitab           = _mm256_cvttps_epi32(rt);
1040             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1041             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1042             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1043             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1044             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1045             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1046
1047             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1048             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1049                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1050             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1051                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1052             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1053                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1054             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1055                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1056             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1057             Heps             = _mm256_mul_ps(vfeps,H);
1058             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1059             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1060             velec            = _mm256_mul_ps(qq02,VV);
1061             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1062             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1063
1064             /* Update potential sum for this i atom from the interaction with this j atom. */
1065             velec            = _mm256_andnot_ps(dummy_mask,velec);
1066             velecsum         = _mm256_add_ps(velecsum,velec);
1067
1068             fscal            = felec;
1069
1070             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1071
1072             /* Calculate temporary vectorial force */
1073             tx               = _mm256_mul_ps(fscal,dx02);
1074             ty               = _mm256_mul_ps(fscal,dy02);
1075             tz               = _mm256_mul_ps(fscal,dz02);
1076
1077             /* Update vectorial force */
1078             fix0             = _mm256_add_ps(fix0,tx);
1079             fiy0             = _mm256_add_ps(fiy0,ty);
1080             fiz0             = _mm256_add_ps(fiz0,tz);
1081
1082             fjx2             = _mm256_add_ps(fjx2,tx);
1083             fjy2             = _mm256_add_ps(fjy2,ty);
1084             fjz2             = _mm256_add_ps(fjz2,tz);
1085
1086             /**************************
1087              * CALCULATE INTERACTIONS *
1088              **************************/
1089
1090             r10              = _mm256_mul_ps(rsq10,rinv10);
1091             r10              = _mm256_andnot_ps(dummy_mask,r10);
1092
1093             /* Calculate table index by multiplying r with table scale and truncate to integer */
1094             rt               = _mm256_mul_ps(r10,vftabscale);
1095             vfitab           = _mm256_cvttps_epi32(rt);
1096             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1097             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1098             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1099             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1100             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1101             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1102
1103             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1104             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1105                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1106             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1107                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1108             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1109                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1110             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1111                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1112             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1113             Heps             = _mm256_mul_ps(vfeps,H);
1114             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1115             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1116             velec            = _mm256_mul_ps(qq10,VV);
1117             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1118             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1119
1120             /* Update potential sum for this i atom from the interaction with this j atom. */
1121             velec            = _mm256_andnot_ps(dummy_mask,velec);
1122             velecsum         = _mm256_add_ps(velecsum,velec);
1123
1124             fscal            = felec;
1125
1126             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1127
1128             /* Calculate temporary vectorial force */
1129             tx               = _mm256_mul_ps(fscal,dx10);
1130             ty               = _mm256_mul_ps(fscal,dy10);
1131             tz               = _mm256_mul_ps(fscal,dz10);
1132
1133             /* Update vectorial force */
1134             fix1             = _mm256_add_ps(fix1,tx);
1135             fiy1             = _mm256_add_ps(fiy1,ty);
1136             fiz1             = _mm256_add_ps(fiz1,tz);
1137
1138             fjx0             = _mm256_add_ps(fjx0,tx);
1139             fjy0             = _mm256_add_ps(fjy0,ty);
1140             fjz0             = _mm256_add_ps(fjz0,tz);
1141
1142             /**************************
1143              * CALCULATE INTERACTIONS *
1144              **************************/
1145
1146             r11              = _mm256_mul_ps(rsq11,rinv11);
1147             r11              = _mm256_andnot_ps(dummy_mask,r11);
1148
1149             /* Calculate table index by multiplying r with table scale and truncate to integer */
1150             rt               = _mm256_mul_ps(r11,vftabscale);
1151             vfitab           = _mm256_cvttps_epi32(rt);
1152             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1153             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1154             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1155             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1156             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1157             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1158
1159             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1160             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1161                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1162             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1163                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1164             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1165                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1166             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1167                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1168             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1169             Heps             = _mm256_mul_ps(vfeps,H);
1170             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1171             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1172             velec            = _mm256_mul_ps(qq11,VV);
1173             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1174             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1175
1176             /* Update potential sum for this i atom from the interaction with this j atom. */
1177             velec            = _mm256_andnot_ps(dummy_mask,velec);
1178             velecsum         = _mm256_add_ps(velecsum,velec);
1179
1180             fscal            = felec;
1181
1182             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1183
1184             /* Calculate temporary vectorial force */
1185             tx               = _mm256_mul_ps(fscal,dx11);
1186             ty               = _mm256_mul_ps(fscal,dy11);
1187             tz               = _mm256_mul_ps(fscal,dz11);
1188
1189             /* Update vectorial force */
1190             fix1             = _mm256_add_ps(fix1,tx);
1191             fiy1             = _mm256_add_ps(fiy1,ty);
1192             fiz1             = _mm256_add_ps(fiz1,tz);
1193
1194             fjx1             = _mm256_add_ps(fjx1,tx);
1195             fjy1             = _mm256_add_ps(fjy1,ty);
1196             fjz1             = _mm256_add_ps(fjz1,tz);
1197
1198             /**************************
1199              * CALCULATE INTERACTIONS *
1200              **************************/
1201
1202             r12              = _mm256_mul_ps(rsq12,rinv12);
1203             r12              = _mm256_andnot_ps(dummy_mask,r12);
1204
1205             /* Calculate table index by multiplying r with table scale and truncate to integer */
1206             rt               = _mm256_mul_ps(r12,vftabscale);
1207             vfitab           = _mm256_cvttps_epi32(rt);
1208             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1209             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1210             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1211             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1212             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1213             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1214
1215             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1216             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1217                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1218             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1219                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1220             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1221                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1222             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1223                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1224             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1225             Heps             = _mm256_mul_ps(vfeps,H);
1226             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1227             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1228             velec            = _mm256_mul_ps(qq12,VV);
1229             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1230             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1231
1232             /* Update potential sum for this i atom from the interaction with this j atom. */
1233             velec            = _mm256_andnot_ps(dummy_mask,velec);
1234             velecsum         = _mm256_add_ps(velecsum,velec);
1235
1236             fscal            = felec;
1237
1238             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1239
1240             /* Calculate temporary vectorial force */
1241             tx               = _mm256_mul_ps(fscal,dx12);
1242             ty               = _mm256_mul_ps(fscal,dy12);
1243             tz               = _mm256_mul_ps(fscal,dz12);
1244
1245             /* Update vectorial force */
1246             fix1             = _mm256_add_ps(fix1,tx);
1247             fiy1             = _mm256_add_ps(fiy1,ty);
1248             fiz1             = _mm256_add_ps(fiz1,tz);
1249
1250             fjx2             = _mm256_add_ps(fjx2,tx);
1251             fjy2             = _mm256_add_ps(fjy2,ty);
1252             fjz2             = _mm256_add_ps(fjz2,tz);
1253
1254             /**************************
1255              * CALCULATE INTERACTIONS *
1256              **************************/
1257
1258             r20              = _mm256_mul_ps(rsq20,rinv20);
1259             r20              = _mm256_andnot_ps(dummy_mask,r20);
1260
1261             /* Calculate table index by multiplying r with table scale and truncate to integer */
1262             rt               = _mm256_mul_ps(r20,vftabscale);
1263             vfitab           = _mm256_cvttps_epi32(rt);
1264             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1265             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1266             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1267             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1268             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1269             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1270
1271             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1272             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1273                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1274             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1275                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1276             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1277                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1278             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1279                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1280             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1281             Heps             = _mm256_mul_ps(vfeps,H);
1282             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1283             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1284             velec            = _mm256_mul_ps(qq20,VV);
1285             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1286             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1287
1288             /* Update potential sum for this i atom from the interaction with this j atom. */
1289             velec            = _mm256_andnot_ps(dummy_mask,velec);
1290             velecsum         = _mm256_add_ps(velecsum,velec);
1291
1292             fscal            = felec;
1293
1294             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1295
1296             /* Calculate temporary vectorial force */
1297             tx               = _mm256_mul_ps(fscal,dx20);
1298             ty               = _mm256_mul_ps(fscal,dy20);
1299             tz               = _mm256_mul_ps(fscal,dz20);
1300
1301             /* Update vectorial force */
1302             fix2             = _mm256_add_ps(fix2,tx);
1303             fiy2             = _mm256_add_ps(fiy2,ty);
1304             fiz2             = _mm256_add_ps(fiz2,tz);
1305
1306             fjx0             = _mm256_add_ps(fjx0,tx);
1307             fjy0             = _mm256_add_ps(fjy0,ty);
1308             fjz0             = _mm256_add_ps(fjz0,tz);
1309
1310             /**************************
1311              * CALCULATE INTERACTIONS *
1312              **************************/
1313
1314             r21              = _mm256_mul_ps(rsq21,rinv21);
1315             r21              = _mm256_andnot_ps(dummy_mask,r21);
1316
1317             /* Calculate table index by multiplying r with table scale and truncate to integer */
1318             rt               = _mm256_mul_ps(r21,vftabscale);
1319             vfitab           = _mm256_cvttps_epi32(rt);
1320             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1321             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1322             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1323             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1324             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1325             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1326
1327             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1328             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1329                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1330             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1331                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1332             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1333                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1334             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1335                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1336             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1337             Heps             = _mm256_mul_ps(vfeps,H);
1338             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1339             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1340             velec            = _mm256_mul_ps(qq21,VV);
1341             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1342             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1343
1344             /* Update potential sum for this i atom from the interaction with this j atom. */
1345             velec            = _mm256_andnot_ps(dummy_mask,velec);
1346             velecsum         = _mm256_add_ps(velecsum,velec);
1347
1348             fscal            = felec;
1349
1350             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1351
1352             /* Calculate temporary vectorial force */
1353             tx               = _mm256_mul_ps(fscal,dx21);
1354             ty               = _mm256_mul_ps(fscal,dy21);
1355             tz               = _mm256_mul_ps(fscal,dz21);
1356
1357             /* Update vectorial force */
1358             fix2             = _mm256_add_ps(fix2,tx);
1359             fiy2             = _mm256_add_ps(fiy2,ty);
1360             fiz2             = _mm256_add_ps(fiz2,tz);
1361
1362             fjx1             = _mm256_add_ps(fjx1,tx);
1363             fjy1             = _mm256_add_ps(fjy1,ty);
1364             fjz1             = _mm256_add_ps(fjz1,tz);
1365
1366             /**************************
1367              * CALCULATE INTERACTIONS *
1368              **************************/
1369
1370             r22              = _mm256_mul_ps(rsq22,rinv22);
1371             r22              = _mm256_andnot_ps(dummy_mask,r22);
1372
1373             /* Calculate table index by multiplying r with table scale and truncate to integer */
1374             rt               = _mm256_mul_ps(r22,vftabscale);
1375             vfitab           = _mm256_cvttps_epi32(rt);
1376             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1377             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1378             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1379             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1380             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1381             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1382
1383             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1384             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1385                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1386             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1387                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1388             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1389                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1390             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1391                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1392             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1393             Heps             = _mm256_mul_ps(vfeps,H);
1394             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1395             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1396             velec            = _mm256_mul_ps(qq22,VV);
1397             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1398             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1399
1400             /* Update potential sum for this i atom from the interaction with this j atom. */
1401             velec            = _mm256_andnot_ps(dummy_mask,velec);
1402             velecsum         = _mm256_add_ps(velecsum,velec);
1403
1404             fscal            = felec;
1405
1406             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1407
1408             /* Calculate temporary vectorial force */
1409             tx               = _mm256_mul_ps(fscal,dx22);
1410             ty               = _mm256_mul_ps(fscal,dy22);
1411             tz               = _mm256_mul_ps(fscal,dz22);
1412
1413             /* Update vectorial force */
1414             fix2             = _mm256_add_ps(fix2,tx);
1415             fiy2             = _mm256_add_ps(fiy2,ty);
1416             fiz2             = _mm256_add_ps(fiz2,tz);
1417
1418             fjx2             = _mm256_add_ps(fjx2,tx);
1419             fjy2             = _mm256_add_ps(fjy2,ty);
1420             fjz2             = _mm256_add_ps(fjz2,tz);
1421
1422             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1423             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1424             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1425             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1426             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1427             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1428             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1429             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1430
1431             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1432                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1433
1434             /* Inner loop uses 409 flops */
1435         }
1436
1437         /* End of innermost loop */
1438
1439         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1440                                                  f+i_coord_offset,fshift+i_shift_offset);
1441
1442         ggid                        = gid[iidx];
1443         /* Update potential energies */
1444         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1445         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1446
1447         /* Increment number of inner iterations */
1448         inneriter                  += j_index_end - j_index_start;
1449
1450         /* Outer loop uses 20 flops */
1451     }
1452
1453     /* Increment number of outer iterations */
1454     outeriter        += nri;
1455
1456     /* Update outer/inner flops */
1457
1458     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1459 }
1460 /*
1461  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_single
1462  * Electrostatics interaction: CubicSplineTable
1463  * VdW interaction:            LennardJones
1464  * Geometry:                   Water3-Water3
1465  * Calculate force/pot:        Force
1466  */
1467 void
1468 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_single
1469                     (t_nblist                    * gmx_restrict       nlist,
1470                      rvec                        * gmx_restrict          xx,
1471                      rvec                        * gmx_restrict          ff,
1472                      t_forcerec                  * gmx_restrict          fr,
1473                      t_mdatoms                   * gmx_restrict     mdatoms,
1474                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1475                      t_nrnb                      * gmx_restrict        nrnb)
1476 {
1477     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1478      * just 0 for non-waters.
1479      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. the eight different
1480      * jnr indices corresponding to data put in the eight positions in the 256-bit SIMD register.
1481      */
1482     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1483     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1484     int              jnrA,jnrB,jnrC,jnrD;
1485     int              jnrE,jnrF,jnrG,jnrH;
1486     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1487     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1488     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1489     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1490     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1491     real             rcutoff_scalar;
1492     real             *shiftvec,*fshift,*x,*f;
1493     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1494     real             scratch[4*DIM];
1495     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1496     real *           vdwioffsetptr0;
1497     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1498     real *           vdwioffsetptr1;
1499     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1500     real *           vdwioffsetptr2;
1501     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1502     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1503     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1504     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1505     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1506     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1507     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1508     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1509     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1510     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1511     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1512     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1513     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1514     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1515     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1516     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1517     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1518     real             *charge;
1519     int              nvdwtype;
1520     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1521     int              *vdwtype;
1522     real             *vdwparam;
1523     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1524     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1525     __m256i          vfitab;
1526     __m128i          vfitab_lo,vfitab_hi;
1527     __m128i          ifour       = _mm_set1_epi32(4);
1528     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1529     real             *vftab;
1530     __m256           dummy_mask,cutoff_mask;
1531     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1532     __m256           one     = _mm256_set1_ps(1.0);
1533     __m256           two     = _mm256_set1_ps(2.0);
1534     x                = xx[0];
1535     f                = ff[0];
1536
1537     nri              = nlist->nri;
1538     iinr             = nlist->iinr;
1539     jindex           = nlist->jindex;
1540     jjnr             = nlist->jjnr;
1541     shiftidx         = nlist->shift;
1542     gid              = nlist->gid;
1543     shiftvec         = fr->shift_vec[0];
1544     fshift           = fr->fshift[0];
1545     facel            = _mm256_set1_ps(fr->epsfac);
1546     charge           = mdatoms->chargeA;
1547     nvdwtype         = fr->ntype;
1548     vdwparam         = fr->nbfp;
1549     vdwtype          = mdatoms->typeA;
1550
1551     vftab            = kernel_data->table_elec->data;
1552     vftabscale       = _mm256_set1_ps(kernel_data->table_elec->scale);
1553
1554     /* Setup water-specific parameters */
1555     inr              = nlist->iinr[0];
1556     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1557     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1558     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1559     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1560
1561     jq0              = _mm256_set1_ps(charge[inr+0]);
1562     jq1              = _mm256_set1_ps(charge[inr+1]);
1563     jq2              = _mm256_set1_ps(charge[inr+2]);
1564     vdwjidx0A        = 2*vdwtype[inr+0];
1565     qq00             = _mm256_mul_ps(iq0,jq0);
1566     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1567     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1568     qq01             = _mm256_mul_ps(iq0,jq1);
1569     qq02             = _mm256_mul_ps(iq0,jq2);
1570     qq10             = _mm256_mul_ps(iq1,jq0);
1571     qq11             = _mm256_mul_ps(iq1,jq1);
1572     qq12             = _mm256_mul_ps(iq1,jq2);
1573     qq20             = _mm256_mul_ps(iq2,jq0);
1574     qq21             = _mm256_mul_ps(iq2,jq1);
1575     qq22             = _mm256_mul_ps(iq2,jq2);
1576
1577     /* Avoid stupid compiler warnings */
1578     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1579     j_coord_offsetA = 0;
1580     j_coord_offsetB = 0;
1581     j_coord_offsetC = 0;
1582     j_coord_offsetD = 0;
1583     j_coord_offsetE = 0;
1584     j_coord_offsetF = 0;
1585     j_coord_offsetG = 0;
1586     j_coord_offsetH = 0;
1587
1588     outeriter        = 0;
1589     inneriter        = 0;
1590
1591     for(iidx=0;iidx<4*DIM;iidx++)
1592     {
1593         scratch[iidx] = 0.0;
1594     }
1595
1596     /* Start outer loop over neighborlists */
1597     for(iidx=0; iidx<nri; iidx++)
1598     {
1599         /* Load shift vector for this list */
1600         i_shift_offset   = DIM*shiftidx[iidx];
1601
1602         /* Load limits for loop over neighbors */
1603         j_index_start    = jindex[iidx];
1604         j_index_end      = jindex[iidx+1];
1605
1606         /* Get outer coordinate index */
1607         inr              = iinr[iidx];
1608         i_coord_offset   = DIM*inr;
1609
1610         /* Load i particle coords and add shift vector */
1611         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1612                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1613
1614         fix0             = _mm256_setzero_ps();
1615         fiy0             = _mm256_setzero_ps();
1616         fiz0             = _mm256_setzero_ps();
1617         fix1             = _mm256_setzero_ps();
1618         fiy1             = _mm256_setzero_ps();
1619         fiz1             = _mm256_setzero_ps();
1620         fix2             = _mm256_setzero_ps();
1621         fiy2             = _mm256_setzero_ps();
1622         fiz2             = _mm256_setzero_ps();
1623
1624         /* Start inner kernel loop */
1625         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1626         {
1627
1628             /* Get j neighbor index, and coordinate index */
1629             jnrA             = jjnr[jidx];
1630             jnrB             = jjnr[jidx+1];
1631             jnrC             = jjnr[jidx+2];
1632             jnrD             = jjnr[jidx+3];
1633             jnrE             = jjnr[jidx+4];
1634             jnrF             = jjnr[jidx+5];
1635             jnrG             = jjnr[jidx+6];
1636             jnrH             = jjnr[jidx+7];
1637             j_coord_offsetA  = DIM*jnrA;
1638             j_coord_offsetB  = DIM*jnrB;
1639             j_coord_offsetC  = DIM*jnrC;
1640             j_coord_offsetD  = DIM*jnrD;
1641             j_coord_offsetE  = DIM*jnrE;
1642             j_coord_offsetF  = DIM*jnrF;
1643             j_coord_offsetG  = DIM*jnrG;
1644             j_coord_offsetH  = DIM*jnrH;
1645
1646             /* load j atom coordinates */
1647             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1648                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1649                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1650                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1651                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1652
1653             /* Calculate displacement vector */
1654             dx00             = _mm256_sub_ps(ix0,jx0);
1655             dy00             = _mm256_sub_ps(iy0,jy0);
1656             dz00             = _mm256_sub_ps(iz0,jz0);
1657             dx01             = _mm256_sub_ps(ix0,jx1);
1658             dy01             = _mm256_sub_ps(iy0,jy1);
1659             dz01             = _mm256_sub_ps(iz0,jz1);
1660             dx02             = _mm256_sub_ps(ix0,jx2);
1661             dy02             = _mm256_sub_ps(iy0,jy2);
1662             dz02             = _mm256_sub_ps(iz0,jz2);
1663             dx10             = _mm256_sub_ps(ix1,jx0);
1664             dy10             = _mm256_sub_ps(iy1,jy0);
1665             dz10             = _mm256_sub_ps(iz1,jz0);
1666             dx11             = _mm256_sub_ps(ix1,jx1);
1667             dy11             = _mm256_sub_ps(iy1,jy1);
1668             dz11             = _mm256_sub_ps(iz1,jz1);
1669             dx12             = _mm256_sub_ps(ix1,jx2);
1670             dy12             = _mm256_sub_ps(iy1,jy2);
1671             dz12             = _mm256_sub_ps(iz1,jz2);
1672             dx20             = _mm256_sub_ps(ix2,jx0);
1673             dy20             = _mm256_sub_ps(iy2,jy0);
1674             dz20             = _mm256_sub_ps(iz2,jz0);
1675             dx21             = _mm256_sub_ps(ix2,jx1);
1676             dy21             = _mm256_sub_ps(iy2,jy1);
1677             dz21             = _mm256_sub_ps(iz2,jz1);
1678             dx22             = _mm256_sub_ps(ix2,jx2);
1679             dy22             = _mm256_sub_ps(iy2,jy2);
1680             dz22             = _mm256_sub_ps(iz2,jz2);
1681
1682             /* Calculate squared distance and things based on it */
1683             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1684             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1685             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1686             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1687             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1688             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1689             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1690             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1691             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1692
1693             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1694             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1695             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1696             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1697             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1698             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1699             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1700             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1701             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1702
1703             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1704
1705             fjx0             = _mm256_setzero_ps();
1706             fjy0             = _mm256_setzero_ps();
1707             fjz0             = _mm256_setzero_ps();
1708             fjx1             = _mm256_setzero_ps();
1709             fjy1             = _mm256_setzero_ps();
1710             fjz1             = _mm256_setzero_ps();
1711             fjx2             = _mm256_setzero_ps();
1712             fjy2             = _mm256_setzero_ps();
1713             fjz2             = _mm256_setzero_ps();
1714
1715             /**************************
1716              * CALCULATE INTERACTIONS *
1717              **************************/
1718
1719             r00              = _mm256_mul_ps(rsq00,rinv00);
1720
1721             /* Calculate table index by multiplying r with table scale and truncate to integer */
1722             rt               = _mm256_mul_ps(r00,vftabscale);
1723             vfitab           = _mm256_cvttps_epi32(rt);
1724             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1725             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1726             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1727             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1728             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1729             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1730
1731             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1732             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1733                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1734             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1735                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1736             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1737                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1738             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1739                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1740             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1741             Heps             = _mm256_mul_ps(vfeps,H);
1742             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1743             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1744             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
1745
1746             /* LENNARD-JONES DISPERSION/REPULSION */
1747
1748             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1749             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1750
1751             fscal            = _mm256_add_ps(felec,fvdw);
1752
1753             /* Calculate temporary vectorial force */
1754             tx               = _mm256_mul_ps(fscal,dx00);
1755             ty               = _mm256_mul_ps(fscal,dy00);
1756             tz               = _mm256_mul_ps(fscal,dz00);
1757
1758             /* Update vectorial force */
1759             fix0             = _mm256_add_ps(fix0,tx);
1760             fiy0             = _mm256_add_ps(fiy0,ty);
1761             fiz0             = _mm256_add_ps(fiz0,tz);
1762
1763             fjx0             = _mm256_add_ps(fjx0,tx);
1764             fjy0             = _mm256_add_ps(fjy0,ty);
1765             fjz0             = _mm256_add_ps(fjz0,tz);
1766
1767             /**************************
1768              * CALCULATE INTERACTIONS *
1769              **************************/
1770
1771             r01              = _mm256_mul_ps(rsq01,rinv01);
1772
1773             /* Calculate table index by multiplying r with table scale and truncate to integer */
1774             rt               = _mm256_mul_ps(r01,vftabscale);
1775             vfitab           = _mm256_cvttps_epi32(rt);
1776             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1777             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1778             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1779             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1780             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1781             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1782
1783             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1784             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1785                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1786             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1787                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1788             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1789                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1790             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1791                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1792             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1793             Heps             = _mm256_mul_ps(vfeps,H);
1794             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1795             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1796             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1797
1798             fscal            = felec;
1799
1800             /* Calculate temporary vectorial force */
1801             tx               = _mm256_mul_ps(fscal,dx01);
1802             ty               = _mm256_mul_ps(fscal,dy01);
1803             tz               = _mm256_mul_ps(fscal,dz01);
1804
1805             /* Update vectorial force */
1806             fix0             = _mm256_add_ps(fix0,tx);
1807             fiy0             = _mm256_add_ps(fiy0,ty);
1808             fiz0             = _mm256_add_ps(fiz0,tz);
1809
1810             fjx1             = _mm256_add_ps(fjx1,tx);
1811             fjy1             = _mm256_add_ps(fjy1,ty);
1812             fjz1             = _mm256_add_ps(fjz1,tz);
1813
1814             /**************************
1815              * CALCULATE INTERACTIONS *
1816              **************************/
1817
1818             r02              = _mm256_mul_ps(rsq02,rinv02);
1819
1820             /* Calculate table index by multiplying r with table scale and truncate to integer */
1821             rt               = _mm256_mul_ps(r02,vftabscale);
1822             vfitab           = _mm256_cvttps_epi32(rt);
1823             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1824             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1825             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1826             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1827             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1828             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1829
1830             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1831             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1832                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1833             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1834                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1835             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1836                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1837             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1838                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1839             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1840             Heps             = _mm256_mul_ps(vfeps,H);
1841             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1842             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1843             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1844
1845             fscal            = felec;
1846
1847             /* Calculate temporary vectorial force */
1848             tx               = _mm256_mul_ps(fscal,dx02);
1849             ty               = _mm256_mul_ps(fscal,dy02);
1850             tz               = _mm256_mul_ps(fscal,dz02);
1851
1852             /* Update vectorial force */
1853             fix0             = _mm256_add_ps(fix0,tx);
1854             fiy0             = _mm256_add_ps(fiy0,ty);
1855             fiz0             = _mm256_add_ps(fiz0,tz);
1856
1857             fjx2             = _mm256_add_ps(fjx2,tx);
1858             fjy2             = _mm256_add_ps(fjy2,ty);
1859             fjz2             = _mm256_add_ps(fjz2,tz);
1860
1861             /**************************
1862              * CALCULATE INTERACTIONS *
1863              **************************/
1864
1865             r10              = _mm256_mul_ps(rsq10,rinv10);
1866
1867             /* Calculate table index by multiplying r with table scale and truncate to integer */
1868             rt               = _mm256_mul_ps(r10,vftabscale);
1869             vfitab           = _mm256_cvttps_epi32(rt);
1870             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1871             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1872             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1873             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1874             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1875             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1876
1877             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1878             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1879                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1880             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1881                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1882             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1883                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1884             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1885                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1886             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1887             Heps             = _mm256_mul_ps(vfeps,H);
1888             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1889             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1890             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1891
1892             fscal            = felec;
1893
1894             /* Calculate temporary vectorial force */
1895             tx               = _mm256_mul_ps(fscal,dx10);
1896             ty               = _mm256_mul_ps(fscal,dy10);
1897             tz               = _mm256_mul_ps(fscal,dz10);
1898
1899             /* Update vectorial force */
1900             fix1             = _mm256_add_ps(fix1,tx);
1901             fiy1             = _mm256_add_ps(fiy1,ty);
1902             fiz1             = _mm256_add_ps(fiz1,tz);
1903
1904             fjx0             = _mm256_add_ps(fjx0,tx);
1905             fjy0             = _mm256_add_ps(fjy0,ty);
1906             fjz0             = _mm256_add_ps(fjz0,tz);
1907
1908             /**************************
1909              * CALCULATE INTERACTIONS *
1910              **************************/
1911
1912             r11              = _mm256_mul_ps(rsq11,rinv11);
1913
1914             /* Calculate table index by multiplying r with table scale and truncate to integer */
1915             rt               = _mm256_mul_ps(r11,vftabscale);
1916             vfitab           = _mm256_cvttps_epi32(rt);
1917             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1918             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1919             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1920             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1921             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1922             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1923
1924             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1925             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1926                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1927             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1928                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1929             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1930                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1931             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1932                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1933             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1934             Heps             = _mm256_mul_ps(vfeps,H);
1935             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1936             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1937             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1938
1939             fscal            = felec;
1940
1941             /* Calculate temporary vectorial force */
1942             tx               = _mm256_mul_ps(fscal,dx11);
1943             ty               = _mm256_mul_ps(fscal,dy11);
1944             tz               = _mm256_mul_ps(fscal,dz11);
1945
1946             /* Update vectorial force */
1947             fix1             = _mm256_add_ps(fix1,tx);
1948             fiy1             = _mm256_add_ps(fiy1,ty);
1949             fiz1             = _mm256_add_ps(fiz1,tz);
1950
1951             fjx1             = _mm256_add_ps(fjx1,tx);
1952             fjy1             = _mm256_add_ps(fjy1,ty);
1953             fjz1             = _mm256_add_ps(fjz1,tz);
1954
1955             /**************************
1956              * CALCULATE INTERACTIONS *
1957              **************************/
1958
1959             r12              = _mm256_mul_ps(rsq12,rinv12);
1960
1961             /* Calculate table index by multiplying r with table scale and truncate to integer */
1962             rt               = _mm256_mul_ps(r12,vftabscale);
1963             vfitab           = _mm256_cvttps_epi32(rt);
1964             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1965             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1966             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1967             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1968             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1969             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1970
1971             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1972             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1973                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1974             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1975                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1976             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1977                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1978             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1979                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1980             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1981             Heps             = _mm256_mul_ps(vfeps,H);
1982             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1983             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1984             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1985
1986             fscal            = felec;
1987
1988             /* Calculate temporary vectorial force */
1989             tx               = _mm256_mul_ps(fscal,dx12);
1990             ty               = _mm256_mul_ps(fscal,dy12);
1991             tz               = _mm256_mul_ps(fscal,dz12);
1992
1993             /* Update vectorial force */
1994             fix1             = _mm256_add_ps(fix1,tx);
1995             fiy1             = _mm256_add_ps(fiy1,ty);
1996             fiz1             = _mm256_add_ps(fiz1,tz);
1997
1998             fjx2             = _mm256_add_ps(fjx2,tx);
1999             fjy2             = _mm256_add_ps(fjy2,ty);
2000             fjz2             = _mm256_add_ps(fjz2,tz);
2001
2002             /**************************
2003              * CALCULATE INTERACTIONS *
2004              **************************/
2005
2006             r20              = _mm256_mul_ps(rsq20,rinv20);
2007
2008             /* Calculate table index by multiplying r with table scale and truncate to integer */
2009             rt               = _mm256_mul_ps(r20,vftabscale);
2010             vfitab           = _mm256_cvttps_epi32(rt);
2011             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2012             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2013             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2014             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2015             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2016             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2017
2018             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2019             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2020                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2021             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2022                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2023             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2024                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2025             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2026                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2027             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2028             Heps             = _mm256_mul_ps(vfeps,H);
2029             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2030             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2031             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2032
2033             fscal            = felec;
2034
2035             /* Calculate temporary vectorial force */
2036             tx               = _mm256_mul_ps(fscal,dx20);
2037             ty               = _mm256_mul_ps(fscal,dy20);
2038             tz               = _mm256_mul_ps(fscal,dz20);
2039
2040             /* Update vectorial force */
2041             fix2             = _mm256_add_ps(fix2,tx);
2042             fiy2             = _mm256_add_ps(fiy2,ty);
2043             fiz2             = _mm256_add_ps(fiz2,tz);
2044
2045             fjx0             = _mm256_add_ps(fjx0,tx);
2046             fjy0             = _mm256_add_ps(fjy0,ty);
2047             fjz0             = _mm256_add_ps(fjz0,tz);
2048
2049             /**************************
2050              * CALCULATE INTERACTIONS *
2051              **************************/
2052
2053             r21              = _mm256_mul_ps(rsq21,rinv21);
2054
2055             /* Calculate table index by multiplying r with table scale and truncate to integer */
2056             rt               = _mm256_mul_ps(r21,vftabscale);
2057             vfitab           = _mm256_cvttps_epi32(rt);
2058             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2059             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2060             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2061             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2062             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2063             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2064
2065             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2066             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2067                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2068             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2069                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2070             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2071                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2072             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2073                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2074             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2075             Heps             = _mm256_mul_ps(vfeps,H);
2076             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2077             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2078             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2079
2080             fscal            = felec;
2081
2082             /* Calculate temporary vectorial force */
2083             tx               = _mm256_mul_ps(fscal,dx21);
2084             ty               = _mm256_mul_ps(fscal,dy21);
2085             tz               = _mm256_mul_ps(fscal,dz21);
2086
2087             /* Update vectorial force */
2088             fix2             = _mm256_add_ps(fix2,tx);
2089             fiy2             = _mm256_add_ps(fiy2,ty);
2090             fiz2             = _mm256_add_ps(fiz2,tz);
2091
2092             fjx1             = _mm256_add_ps(fjx1,tx);
2093             fjy1             = _mm256_add_ps(fjy1,ty);
2094             fjz1             = _mm256_add_ps(fjz1,tz);
2095
2096             /**************************
2097              * CALCULATE INTERACTIONS *
2098              **************************/
2099
2100             r22              = _mm256_mul_ps(rsq22,rinv22);
2101
2102             /* Calculate table index by multiplying r with table scale and truncate to integer */
2103             rt               = _mm256_mul_ps(r22,vftabscale);
2104             vfitab           = _mm256_cvttps_epi32(rt);
2105             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2106             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2107             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2108             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2109             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2110             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2111
2112             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2113             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2114                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2115             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2116                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2117             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2118                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2119             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2120                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2121             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2122             Heps             = _mm256_mul_ps(vfeps,H);
2123             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2124             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2125             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2126
2127             fscal            = felec;
2128
2129             /* Calculate temporary vectorial force */
2130             tx               = _mm256_mul_ps(fscal,dx22);
2131             ty               = _mm256_mul_ps(fscal,dy22);
2132             tz               = _mm256_mul_ps(fscal,dz22);
2133
2134             /* Update vectorial force */
2135             fix2             = _mm256_add_ps(fix2,tx);
2136             fiy2             = _mm256_add_ps(fiy2,ty);
2137             fiz2             = _mm256_add_ps(fiz2,tz);
2138
2139             fjx2             = _mm256_add_ps(fjx2,tx);
2140             fjy2             = _mm256_add_ps(fjy2,ty);
2141             fjz2             = _mm256_add_ps(fjz2,tz);
2142
2143             fjptrA             = f+j_coord_offsetA;
2144             fjptrB             = f+j_coord_offsetB;
2145             fjptrC             = f+j_coord_offsetC;
2146             fjptrD             = f+j_coord_offsetD;
2147             fjptrE             = f+j_coord_offsetE;
2148             fjptrF             = f+j_coord_offsetF;
2149             fjptrG             = f+j_coord_offsetG;
2150             fjptrH             = f+j_coord_offsetH;
2151
2152             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2153                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2154
2155             /* Inner loop uses 359 flops */
2156         }
2157
2158         if(jidx<j_index_end)
2159         {
2160
2161             /* Get j neighbor index, and coordinate index */
2162             jnrlistA         = jjnr[jidx];
2163             jnrlistB         = jjnr[jidx+1];
2164             jnrlistC         = jjnr[jidx+2];
2165             jnrlistD         = jjnr[jidx+3];
2166             jnrlistE         = jjnr[jidx+4];
2167             jnrlistF         = jjnr[jidx+5];
2168             jnrlistG         = jjnr[jidx+6];
2169             jnrlistH         = jjnr[jidx+7];
2170             /* Sign of each element will be negative for non-real atoms.
2171              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2172              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2173              */
2174             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2175                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2176                                             
2177             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2178             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2179             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2180             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2181             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
2182             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
2183             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
2184             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
2185             j_coord_offsetA  = DIM*jnrA;
2186             j_coord_offsetB  = DIM*jnrB;
2187             j_coord_offsetC  = DIM*jnrC;
2188             j_coord_offsetD  = DIM*jnrD;
2189             j_coord_offsetE  = DIM*jnrE;
2190             j_coord_offsetF  = DIM*jnrF;
2191             j_coord_offsetG  = DIM*jnrG;
2192             j_coord_offsetH  = DIM*jnrH;
2193
2194             /* load j atom coordinates */
2195             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2196                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2197                                                  x+j_coord_offsetE,x+j_coord_offsetF,
2198                                                  x+j_coord_offsetG,x+j_coord_offsetH,
2199                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2200
2201             /* Calculate displacement vector */
2202             dx00             = _mm256_sub_ps(ix0,jx0);
2203             dy00             = _mm256_sub_ps(iy0,jy0);
2204             dz00             = _mm256_sub_ps(iz0,jz0);
2205             dx01             = _mm256_sub_ps(ix0,jx1);
2206             dy01             = _mm256_sub_ps(iy0,jy1);
2207             dz01             = _mm256_sub_ps(iz0,jz1);
2208             dx02             = _mm256_sub_ps(ix0,jx2);
2209             dy02             = _mm256_sub_ps(iy0,jy2);
2210             dz02             = _mm256_sub_ps(iz0,jz2);
2211             dx10             = _mm256_sub_ps(ix1,jx0);
2212             dy10             = _mm256_sub_ps(iy1,jy0);
2213             dz10             = _mm256_sub_ps(iz1,jz0);
2214             dx11             = _mm256_sub_ps(ix1,jx1);
2215             dy11             = _mm256_sub_ps(iy1,jy1);
2216             dz11             = _mm256_sub_ps(iz1,jz1);
2217             dx12             = _mm256_sub_ps(ix1,jx2);
2218             dy12             = _mm256_sub_ps(iy1,jy2);
2219             dz12             = _mm256_sub_ps(iz1,jz2);
2220             dx20             = _mm256_sub_ps(ix2,jx0);
2221             dy20             = _mm256_sub_ps(iy2,jy0);
2222             dz20             = _mm256_sub_ps(iz2,jz0);
2223             dx21             = _mm256_sub_ps(ix2,jx1);
2224             dy21             = _mm256_sub_ps(iy2,jy1);
2225             dz21             = _mm256_sub_ps(iz2,jz1);
2226             dx22             = _mm256_sub_ps(ix2,jx2);
2227             dy22             = _mm256_sub_ps(iy2,jy2);
2228             dz22             = _mm256_sub_ps(iz2,jz2);
2229
2230             /* Calculate squared distance and things based on it */
2231             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2232             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2233             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2234             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2235             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2236             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2237             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2238             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2239             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2240
2241             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
2242             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
2243             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
2244             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
2245             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
2246             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
2247             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
2248             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
2249             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
2250
2251             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
2252
2253             fjx0             = _mm256_setzero_ps();
2254             fjy0             = _mm256_setzero_ps();
2255             fjz0             = _mm256_setzero_ps();
2256             fjx1             = _mm256_setzero_ps();
2257             fjy1             = _mm256_setzero_ps();
2258             fjz1             = _mm256_setzero_ps();
2259             fjx2             = _mm256_setzero_ps();
2260             fjy2             = _mm256_setzero_ps();
2261             fjz2             = _mm256_setzero_ps();
2262
2263             /**************************
2264              * CALCULATE INTERACTIONS *
2265              **************************/
2266
2267             r00              = _mm256_mul_ps(rsq00,rinv00);
2268             r00              = _mm256_andnot_ps(dummy_mask,r00);
2269
2270             /* Calculate table index by multiplying r with table scale and truncate to integer */
2271             rt               = _mm256_mul_ps(r00,vftabscale);
2272             vfitab           = _mm256_cvttps_epi32(rt);
2273             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2274             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2275             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2276             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2277             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2278             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2279
2280             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2281             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2282                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2283             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2284                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2285             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2286                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2287             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2288                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2289             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2290             Heps             = _mm256_mul_ps(vfeps,H);
2291             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2292             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2293             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
2294
2295             /* LENNARD-JONES DISPERSION/REPULSION */
2296
2297             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2298             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
2299
2300             fscal            = _mm256_add_ps(felec,fvdw);
2301
2302             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2303
2304             /* Calculate temporary vectorial force */
2305             tx               = _mm256_mul_ps(fscal,dx00);
2306             ty               = _mm256_mul_ps(fscal,dy00);
2307             tz               = _mm256_mul_ps(fscal,dz00);
2308
2309             /* Update vectorial force */
2310             fix0             = _mm256_add_ps(fix0,tx);
2311             fiy0             = _mm256_add_ps(fiy0,ty);
2312             fiz0             = _mm256_add_ps(fiz0,tz);
2313
2314             fjx0             = _mm256_add_ps(fjx0,tx);
2315             fjy0             = _mm256_add_ps(fjy0,ty);
2316             fjz0             = _mm256_add_ps(fjz0,tz);
2317
2318             /**************************
2319              * CALCULATE INTERACTIONS *
2320              **************************/
2321
2322             r01              = _mm256_mul_ps(rsq01,rinv01);
2323             r01              = _mm256_andnot_ps(dummy_mask,r01);
2324
2325             /* Calculate table index by multiplying r with table scale and truncate to integer */
2326             rt               = _mm256_mul_ps(r01,vftabscale);
2327             vfitab           = _mm256_cvttps_epi32(rt);
2328             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2329             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2330             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2331             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2332             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2333             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2334
2335             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2336             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2337                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2338             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2339                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2340             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2341                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2342             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2343                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2344             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2345             Heps             = _mm256_mul_ps(vfeps,H);
2346             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2347             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2348             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
2349
2350             fscal            = felec;
2351
2352             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2353
2354             /* Calculate temporary vectorial force */
2355             tx               = _mm256_mul_ps(fscal,dx01);
2356             ty               = _mm256_mul_ps(fscal,dy01);
2357             tz               = _mm256_mul_ps(fscal,dz01);
2358
2359             /* Update vectorial force */
2360             fix0             = _mm256_add_ps(fix0,tx);
2361             fiy0             = _mm256_add_ps(fiy0,ty);
2362             fiz0             = _mm256_add_ps(fiz0,tz);
2363
2364             fjx1             = _mm256_add_ps(fjx1,tx);
2365             fjy1             = _mm256_add_ps(fjy1,ty);
2366             fjz1             = _mm256_add_ps(fjz1,tz);
2367
2368             /**************************
2369              * CALCULATE INTERACTIONS *
2370              **************************/
2371
2372             r02              = _mm256_mul_ps(rsq02,rinv02);
2373             r02              = _mm256_andnot_ps(dummy_mask,r02);
2374
2375             /* Calculate table index by multiplying r with table scale and truncate to integer */
2376             rt               = _mm256_mul_ps(r02,vftabscale);
2377             vfitab           = _mm256_cvttps_epi32(rt);
2378             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2379             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2380             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2381             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2382             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2383             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2384
2385             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2386             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2387                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2388             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2389                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2390             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2391                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2392             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2393                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2394             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2395             Heps             = _mm256_mul_ps(vfeps,H);
2396             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2397             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2398             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
2399
2400             fscal            = felec;
2401
2402             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2403
2404             /* Calculate temporary vectorial force */
2405             tx               = _mm256_mul_ps(fscal,dx02);
2406             ty               = _mm256_mul_ps(fscal,dy02);
2407             tz               = _mm256_mul_ps(fscal,dz02);
2408
2409             /* Update vectorial force */
2410             fix0             = _mm256_add_ps(fix0,tx);
2411             fiy0             = _mm256_add_ps(fiy0,ty);
2412             fiz0             = _mm256_add_ps(fiz0,tz);
2413
2414             fjx2             = _mm256_add_ps(fjx2,tx);
2415             fjy2             = _mm256_add_ps(fjy2,ty);
2416             fjz2             = _mm256_add_ps(fjz2,tz);
2417
2418             /**************************
2419              * CALCULATE INTERACTIONS *
2420              **************************/
2421
2422             r10              = _mm256_mul_ps(rsq10,rinv10);
2423             r10              = _mm256_andnot_ps(dummy_mask,r10);
2424
2425             /* Calculate table index by multiplying r with table scale and truncate to integer */
2426             rt               = _mm256_mul_ps(r10,vftabscale);
2427             vfitab           = _mm256_cvttps_epi32(rt);
2428             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2429             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2430             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2431             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2432             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2433             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2434
2435             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2436             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2437                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2438             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2439                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2440             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2441                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2442             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2443                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2444             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2445             Heps             = _mm256_mul_ps(vfeps,H);
2446             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2447             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2448             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
2449
2450             fscal            = felec;
2451
2452             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2453
2454             /* Calculate temporary vectorial force */
2455             tx               = _mm256_mul_ps(fscal,dx10);
2456             ty               = _mm256_mul_ps(fscal,dy10);
2457             tz               = _mm256_mul_ps(fscal,dz10);
2458
2459             /* Update vectorial force */
2460             fix1             = _mm256_add_ps(fix1,tx);
2461             fiy1             = _mm256_add_ps(fiy1,ty);
2462             fiz1             = _mm256_add_ps(fiz1,tz);
2463
2464             fjx0             = _mm256_add_ps(fjx0,tx);
2465             fjy0             = _mm256_add_ps(fjy0,ty);
2466             fjz0             = _mm256_add_ps(fjz0,tz);
2467
2468             /**************************
2469              * CALCULATE INTERACTIONS *
2470              **************************/
2471
2472             r11              = _mm256_mul_ps(rsq11,rinv11);
2473             r11              = _mm256_andnot_ps(dummy_mask,r11);
2474
2475             /* Calculate table index by multiplying r with table scale and truncate to integer */
2476             rt               = _mm256_mul_ps(r11,vftabscale);
2477             vfitab           = _mm256_cvttps_epi32(rt);
2478             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2479             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2480             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2481             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2482             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2483             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2484
2485             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2486             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2487                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2488             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2489                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2490             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2491                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2492             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2493                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2494             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2495             Heps             = _mm256_mul_ps(vfeps,H);
2496             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2497             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2498             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2499
2500             fscal            = felec;
2501
2502             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2503
2504             /* Calculate temporary vectorial force */
2505             tx               = _mm256_mul_ps(fscal,dx11);
2506             ty               = _mm256_mul_ps(fscal,dy11);
2507             tz               = _mm256_mul_ps(fscal,dz11);
2508
2509             /* Update vectorial force */
2510             fix1             = _mm256_add_ps(fix1,tx);
2511             fiy1             = _mm256_add_ps(fiy1,ty);
2512             fiz1             = _mm256_add_ps(fiz1,tz);
2513
2514             fjx1             = _mm256_add_ps(fjx1,tx);
2515             fjy1             = _mm256_add_ps(fjy1,ty);
2516             fjz1             = _mm256_add_ps(fjz1,tz);
2517
2518             /**************************
2519              * CALCULATE INTERACTIONS *
2520              **************************/
2521
2522             r12              = _mm256_mul_ps(rsq12,rinv12);
2523             r12              = _mm256_andnot_ps(dummy_mask,r12);
2524
2525             /* Calculate table index by multiplying r with table scale and truncate to integer */
2526             rt               = _mm256_mul_ps(r12,vftabscale);
2527             vfitab           = _mm256_cvttps_epi32(rt);
2528             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2529             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2530             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2531             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2532             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2533             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2534
2535             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2536             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2537                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2538             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2539                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2540             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2541                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2542             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2543                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2544             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2545             Heps             = _mm256_mul_ps(vfeps,H);
2546             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2547             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2548             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2549
2550             fscal            = felec;
2551
2552             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2553
2554             /* Calculate temporary vectorial force */
2555             tx               = _mm256_mul_ps(fscal,dx12);
2556             ty               = _mm256_mul_ps(fscal,dy12);
2557             tz               = _mm256_mul_ps(fscal,dz12);
2558
2559             /* Update vectorial force */
2560             fix1             = _mm256_add_ps(fix1,tx);
2561             fiy1             = _mm256_add_ps(fiy1,ty);
2562             fiz1             = _mm256_add_ps(fiz1,tz);
2563
2564             fjx2             = _mm256_add_ps(fjx2,tx);
2565             fjy2             = _mm256_add_ps(fjy2,ty);
2566             fjz2             = _mm256_add_ps(fjz2,tz);
2567
2568             /**************************
2569              * CALCULATE INTERACTIONS *
2570              **************************/
2571
2572             r20              = _mm256_mul_ps(rsq20,rinv20);
2573             r20              = _mm256_andnot_ps(dummy_mask,r20);
2574
2575             /* Calculate table index by multiplying r with table scale and truncate to integer */
2576             rt               = _mm256_mul_ps(r20,vftabscale);
2577             vfitab           = _mm256_cvttps_epi32(rt);
2578             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2579             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2580             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2581             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2582             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2583             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2584
2585             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2586             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2587                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2588             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2589                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2590             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2591                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2592             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2593                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2594             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2595             Heps             = _mm256_mul_ps(vfeps,H);
2596             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2597             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2598             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2599
2600             fscal            = felec;
2601
2602             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2603
2604             /* Calculate temporary vectorial force */
2605             tx               = _mm256_mul_ps(fscal,dx20);
2606             ty               = _mm256_mul_ps(fscal,dy20);
2607             tz               = _mm256_mul_ps(fscal,dz20);
2608
2609             /* Update vectorial force */
2610             fix2             = _mm256_add_ps(fix2,tx);
2611             fiy2             = _mm256_add_ps(fiy2,ty);
2612             fiz2             = _mm256_add_ps(fiz2,tz);
2613
2614             fjx0             = _mm256_add_ps(fjx0,tx);
2615             fjy0             = _mm256_add_ps(fjy0,ty);
2616             fjz0             = _mm256_add_ps(fjz0,tz);
2617
2618             /**************************
2619              * CALCULATE INTERACTIONS *
2620              **************************/
2621
2622             r21              = _mm256_mul_ps(rsq21,rinv21);
2623             r21              = _mm256_andnot_ps(dummy_mask,r21);
2624
2625             /* Calculate table index by multiplying r with table scale and truncate to integer */
2626             rt               = _mm256_mul_ps(r21,vftabscale);
2627             vfitab           = _mm256_cvttps_epi32(rt);
2628             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2629             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2630             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2631             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2632             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2633             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2634
2635             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2636             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2637                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2638             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2639                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2640             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2641                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2642             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2643                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2644             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2645             Heps             = _mm256_mul_ps(vfeps,H);
2646             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2647             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2648             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2649
2650             fscal            = felec;
2651
2652             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2653
2654             /* Calculate temporary vectorial force */
2655             tx               = _mm256_mul_ps(fscal,dx21);
2656             ty               = _mm256_mul_ps(fscal,dy21);
2657             tz               = _mm256_mul_ps(fscal,dz21);
2658
2659             /* Update vectorial force */
2660             fix2             = _mm256_add_ps(fix2,tx);
2661             fiy2             = _mm256_add_ps(fiy2,ty);
2662             fiz2             = _mm256_add_ps(fiz2,tz);
2663
2664             fjx1             = _mm256_add_ps(fjx1,tx);
2665             fjy1             = _mm256_add_ps(fjy1,ty);
2666             fjz1             = _mm256_add_ps(fjz1,tz);
2667
2668             /**************************
2669              * CALCULATE INTERACTIONS *
2670              **************************/
2671
2672             r22              = _mm256_mul_ps(rsq22,rinv22);
2673             r22              = _mm256_andnot_ps(dummy_mask,r22);
2674
2675             /* Calculate table index by multiplying r with table scale and truncate to integer */
2676             rt               = _mm256_mul_ps(r22,vftabscale);
2677             vfitab           = _mm256_cvttps_epi32(rt);
2678             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2679             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2680             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2681             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2682             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2683             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2684
2685             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2686             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2687                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2688             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2689                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2690             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2691                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2692             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2693                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2694             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2695             Heps             = _mm256_mul_ps(vfeps,H);
2696             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2697             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2698             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2699
2700             fscal            = felec;
2701
2702             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2703
2704             /* Calculate temporary vectorial force */
2705             tx               = _mm256_mul_ps(fscal,dx22);
2706             ty               = _mm256_mul_ps(fscal,dy22);
2707             tz               = _mm256_mul_ps(fscal,dz22);
2708
2709             /* Update vectorial force */
2710             fix2             = _mm256_add_ps(fix2,tx);
2711             fiy2             = _mm256_add_ps(fiy2,ty);
2712             fiz2             = _mm256_add_ps(fiz2,tz);
2713
2714             fjx2             = _mm256_add_ps(fjx2,tx);
2715             fjy2             = _mm256_add_ps(fjy2,ty);
2716             fjz2             = _mm256_add_ps(fjz2,tz);
2717
2718             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2719             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2720             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2721             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2722             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2723             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2724             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2725             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2726
2727             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2728                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2729
2730             /* Inner loop uses 368 flops */
2731         }
2732
2733         /* End of innermost loop */
2734
2735         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2736                                                  f+i_coord_offset,fshift+i_shift_offset);
2737
2738         /* Increment number of inner iterations */
2739         inneriter                  += j_index_end - j_index_start;
2740
2741         /* Outer loop uses 18 flops */
2742     }
2743
2744     /* Increment number of outer iterations */
2745     outeriter        += nri;
2746
2747     /* Update outer/inner flops */
2748
2749     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);
2750 }