Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_avx_256_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_single
38  * Electrostatics interaction: CubicSplineTable
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. for the eight different
56      * jnr indices corresponding to data put in the eight positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrE,jnrF,jnrG,jnrH;
62     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
67     real             rcutoff_scalar;
68     real             *shiftvec,*fshift,*x,*f;
69     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
70     real             scratch[4*DIM];
71     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72     real *           vdwioffsetptr0;
73     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74     real *           vdwioffsetptr1;
75     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76     real *           vdwioffsetptr2;
77     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
81     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
83     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
85     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
86     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
87     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
88     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
89     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
90     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
92     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
93     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
94     real             *charge;
95     int              nvdwtype;
96     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
97     int              *vdwtype;
98     real             *vdwparam;
99     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
100     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
101     __m256i          vfitab;
102     __m128i          vfitab_lo,vfitab_hi;
103     __m128i          ifour       = _mm_set1_epi32(4);
104     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
105     real             *vftab;
106     __m256           dummy_mask,cutoff_mask;
107     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
108     __m256           one     = _mm256_set1_ps(1.0);
109     __m256           two     = _mm256_set1_ps(2.0);
110     x                = xx[0];
111     f                = ff[0];
112
113     nri              = nlist->nri;
114     iinr             = nlist->iinr;
115     jindex           = nlist->jindex;
116     jjnr             = nlist->jjnr;
117     shiftidx         = nlist->shift;
118     gid              = nlist->gid;
119     shiftvec         = fr->shift_vec[0];
120     fshift           = fr->fshift[0];
121     facel            = _mm256_set1_ps(fr->epsfac);
122     charge           = mdatoms->chargeA;
123     nvdwtype         = fr->ntype;
124     vdwparam         = fr->nbfp;
125     vdwtype          = mdatoms->typeA;
126
127     vftab            = kernel_data->table_elec->data;
128     vftabscale       = _mm256_set1_ps(kernel_data->table_elec->scale);
129
130     /* Setup water-specific parameters */
131     inr              = nlist->iinr[0];
132     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
133     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
134     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
135     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
136
137     jq0              = _mm256_set1_ps(charge[inr+0]);
138     jq1              = _mm256_set1_ps(charge[inr+1]);
139     jq2              = _mm256_set1_ps(charge[inr+2]);
140     vdwjidx0A        = 2*vdwtype[inr+0];
141     qq00             = _mm256_mul_ps(iq0,jq0);
142     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
143     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
144     qq01             = _mm256_mul_ps(iq0,jq1);
145     qq02             = _mm256_mul_ps(iq0,jq2);
146     qq10             = _mm256_mul_ps(iq1,jq0);
147     qq11             = _mm256_mul_ps(iq1,jq1);
148     qq12             = _mm256_mul_ps(iq1,jq2);
149     qq20             = _mm256_mul_ps(iq2,jq0);
150     qq21             = _mm256_mul_ps(iq2,jq1);
151     qq22             = _mm256_mul_ps(iq2,jq2);
152
153     /* Avoid stupid compiler warnings */
154     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
155     j_coord_offsetA = 0;
156     j_coord_offsetB = 0;
157     j_coord_offsetC = 0;
158     j_coord_offsetD = 0;
159     j_coord_offsetE = 0;
160     j_coord_offsetF = 0;
161     j_coord_offsetG = 0;
162     j_coord_offsetH = 0;
163
164     outeriter        = 0;
165     inneriter        = 0;
166
167     for(iidx=0;iidx<4*DIM;iidx++)
168     {
169         scratch[iidx] = 0.0;
170     }
171
172     /* Start outer loop over neighborlists */
173     for(iidx=0; iidx<nri; iidx++)
174     {
175         /* Load shift vector for this list */
176         i_shift_offset   = DIM*shiftidx[iidx];
177
178         /* Load limits for loop over neighbors */
179         j_index_start    = jindex[iidx];
180         j_index_end      = jindex[iidx+1];
181
182         /* Get outer coordinate index */
183         inr              = iinr[iidx];
184         i_coord_offset   = DIM*inr;
185
186         /* Load i particle coords and add shift vector */
187         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
188                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
189
190         fix0             = _mm256_setzero_ps();
191         fiy0             = _mm256_setzero_ps();
192         fiz0             = _mm256_setzero_ps();
193         fix1             = _mm256_setzero_ps();
194         fiy1             = _mm256_setzero_ps();
195         fiz1             = _mm256_setzero_ps();
196         fix2             = _mm256_setzero_ps();
197         fiy2             = _mm256_setzero_ps();
198         fiz2             = _mm256_setzero_ps();
199
200         /* Reset potential sums */
201         velecsum         = _mm256_setzero_ps();
202         vvdwsum          = _mm256_setzero_ps();
203
204         /* Start inner kernel loop */
205         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
206         {
207
208             /* Get j neighbor index, and coordinate index */
209             jnrA             = jjnr[jidx];
210             jnrB             = jjnr[jidx+1];
211             jnrC             = jjnr[jidx+2];
212             jnrD             = jjnr[jidx+3];
213             jnrE             = jjnr[jidx+4];
214             jnrF             = jjnr[jidx+5];
215             jnrG             = jjnr[jidx+6];
216             jnrH             = jjnr[jidx+7];
217             j_coord_offsetA  = DIM*jnrA;
218             j_coord_offsetB  = DIM*jnrB;
219             j_coord_offsetC  = DIM*jnrC;
220             j_coord_offsetD  = DIM*jnrD;
221             j_coord_offsetE  = DIM*jnrE;
222             j_coord_offsetF  = DIM*jnrF;
223             j_coord_offsetG  = DIM*jnrG;
224             j_coord_offsetH  = DIM*jnrH;
225
226             /* load j atom coordinates */
227             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
228                                                  x+j_coord_offsetC,x+j_coord_offsetD,
229                                                  x+j_coord_offsetE,x+j_coord_offsetF,
230                                                  x+j_coord_offsetG,x+j_coord_offsetH,
231                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
232
233             /* Calculate displacement vector */
234             dx00             = _mm256_sub_ps(ix0,jx0);
235             dy00             = _mm256_sub_ps(iy0,jy0);
236             dz00             = _mm256_sub_ps(iz0,jz0);
237             dx01             = _mm256_sub_ps(ix0,jx1);
238             dy01             = _mm256_sub_ps(iy0,jy1);
239             dz01             = _mm256_sub_ps(iz0,jz1);
240             dx02             = _mm256_sub_ps(ix0,jx2);
241             dy02             = _mm256_sub_ps(iy0,jy2);
242             dz02             = _mm256_sub_ps(iz0,jz2);
243             dx10             = _mm256_sub_ps(ix1,jx0);
244             dy10             = _mm256_sub_ps(iy1,jy0);
245             dz10             = _mm256_sub_ps(iz1,jz0);
246             dx11             = _mm256_sub_ps(ix1,jx1);
247             dy11             = _mm256_sub_ps(iy1,jy1);
248             dz11             = _mm256_sub_ps(iz1,jz1);
249             dx12             = _mm256_sub_ps(ix1,jx2);
250             dy12             = _mm256_sub_ps(iy1,jy2);
251             dz12             = _mm256_sub_ps(iz1,jz2);
252             dx20             = _mm256_sub_ps(ix2,jx0);
253             dy20             = _mm256_sub_ps(iy2,jy0);
254             dz20             = _mm256_sub_ps(iz2,jz0);
255             dx21             = _mm256_sub_ps(ix2,jx1);
256             dy21             = _mm256_sub_ps(iy2,jy1);
257             dz21             = _mm256_sub_ps(iz2,jz1);
258             dx22             = _mm256_sub_ps(ix2,jx2);
259             dy22             = _mm256_sub_ps(iy2,jy2);
260             dz22             = _mm256_sub_ps(iz2,jz2);
261
262             /* Calculate squared distance and things based on it */
263             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
264             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
265             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
266             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
267             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
268             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
269             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
270             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
271             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
272
273             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
274             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
275             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
276             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
277             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
278             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
279             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
280             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
281             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
282
283             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
284
285             fjx0             = _mm256_setzero_ps();
286             fjy0             = _mm256_setzero_ps();
287             fjz0             = _mm256_setzero_ps();
288             fjx1             = _mm256_setzero_ps();
289             fjy1             = _mm256_setzero_ps();
290             fjz1             = _mm256_setzero_ps();
291             fjx2             = _mm256_setzero_ps();
292             fjy2             = _mm256_setzero_ps();
293             fjz2             = _mm256_setzero_ps();
294
295             /**************************
296              * CALCULATE INTERACTIONS *
297              **************************/
298
299             r00              = _mm256_mul_ps(rsq00,rinv00);
300
301             /* Calculate table index by multiplying r with table scale and truncate to integer */
302             rt               = _mm256_mul_ps(r00,vftabscale);
303             vfitab           = _mm256_cvttps_epi32(rt);
304             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
305             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
306             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
307             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
308             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
309             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
310
311             /* CUBIC SPLINE TABLE ELECTROSTATICS */
312             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
313                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
314             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
315                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
316             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
317                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
318             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
319                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
320             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
321             Heps             = _mm256_mul_ps(vfeps,H);
322             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
323             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
324             velec            = _mm256_mul_ps(qq00,VV);
325             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
326             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
327
328             /* LENNARD-JONES DISPERSION/REPULSION */
329
330             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
331             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
332             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
333             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
334             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
335
336             /* Update potential sum for this i atom from the interaction with this j atom. */
337             velecsum         = _mm256_add_ps(velecsum,velec);
338             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
339
340             fscal            = _mm256_add_ps(felec,fvdw);
341
342             /* Calculate temporary vectorial force */
343             tx               = _mm256_mul_ps(fscal,dx00);
344             ty               = _mm256_mul_ps(fscal,dy00);
345             tz               = _mm256_mul_ps(fscal,dz00);
346
347             /* Update vectorial force */
348             fix0             = _mm256_add_ps(fix0,tx);
349             fiy0             = _mm256_add_ps(fiy0,ty);
350             fiz0             = _mm256_add_ps(fiz0,tz);
351
352             fjx0             = _mm256_add_ps(fjx0,tx);
353             fjy0             = _mm256_add_ps(fjy0,ty);
354             fjz0             = _mm256_add_ps(fjz0,tz);
355
356             /**************************
357              * CALCULATE INTERACTIONS *
358              **************************/
359
360             r01              = _mm256_mul_ps(rsq01,rinv01);
361
362             /* Calculate table index by multiplying r with table scale and truncate to integer */
363             rt               = _mm256_mul_ps(r01,vftabscale);
364             vfitab           = _mm256_cvttps_epi32(rt);
365             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
366             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
367             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
368             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
369             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
370             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
371
372             /* CUBIC SPLINE TABLE ELECTROSTATICS */
373             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
374                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
375             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
376                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
377             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
378                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
379             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
380                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
381             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
382             Heps             = _mm256_mul_ps(vfeps,H);
383             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
384             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
385             velec            = _mm256_mul_ps(qq01,VV);
386             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
387             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
388
389             /* Update potential sum for this i atom from the interaction with this j atom. */
390             velecsum         = _mm256_add_ps(velecsum,velec);
391
392             fscal            = felec;
393
394             /* Calculate temporary vectorial force */
395             tx               = _mm256_mul_ps(fscal,dx01);
396             ty               = _mm256_mul_ps(fscal,dy01);
397             tz               = _mm256_mul_ps(fscal,dz01);
398
399             /* Update vectorial force */
400             fix0             = _mm256_add_ps(fix0,tx);
401             fiy0             = _mm256_add_ps(fiy0,ty);
402             fiz0             = _mm256_add_ps(fiz0,tz);
403
404             fjx1             = _mm256_add_ps(fjx1,tx);
405             fjy1             = _mm256_add_ps(fjy1,ty);
406             fjz1             = _mm256_add_ps(fjz1,tz);
407
408             /**************************
409              * CALCULATE INTERACTIONS *
410              **************************/
411
412             r02              = _mm256_mul_ps(rsq02,rinv02);
413
414             /* Calculate table index by multiplying r with table scale and truncate to integer */
415             rt               = _mm256_mul_ps(r02,vftabscale);
416             vfitab           = _mm256_cvttps_epi32(rt);
417             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
418             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
419             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
420             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
421             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
422             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
423
424             /* CUBIC SPLINE TABLE ELECTROSTATICS */
425             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
426                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
427             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
428                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
429             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
430                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
431             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
432                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
433             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
434             Heps             = _mm256_mul_ps(vfeps,H);
435             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
436             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
437             velec            = _mm256_mul_ps(qq02,VV);
438             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
439             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
440
441             /* Update potential sum for this i atom from the interaction with this j atom. */
442             velecsum         = _mm256_add_ps(velecsum,velec);
443
444             fscal            = felec;
445
446             /* Calculate temporary vectorial force */
447             tx               = _mm256_mul_ps(fscal,dx02);
448             ty               = _mm256_mul_ps(fscal,dy02);
449             tz               = _mm256_mul_ps(fscal,dz02);
450
451             /* Update vectorial force */
452             fix0             = _mm256_add_ps(fix0,tx);
453             fiy0             = _mm256_add_ps(fiy0,ty);
454             fiz0             = _mm256_add_ps(fiz0,tz);
455
456             fjx2             = _mm256_add_ps(fjx2,tx);
457             fjy2             = _mm256_add_ps(fjy2,ty);
458             fjz2             = _mm256_add_ps(fjz2,tz);
459
460             /**************************
461              * CALCULATE INTERACTIONS *
462              **************************/
463
464             r10              = _mm256_mul_ps(rsq10,rinv10);
465
466             /* Calculate table index by multiplying r with table scale and truncate to integer */
467             rt               = _mm256_mul_ps(r10,vftabscale);
468             vfitab           = _mm256_cvttps_epi32(rt);
469             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
470             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
471             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
472             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
473             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
474             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
475
476             /* CUBIC SPLINE TABLE ELECTROSTATICS */
477             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
478                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
479             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
480                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
481             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
482                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
483             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
484                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
485             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
486             Heps             = _mm256_mul_ps(vfeps,H);
487             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
488             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
489             velec            = _mm256_mul_ps(qq10,VV);
490             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
491             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
492
493             /* Update potential sum for this i atom from the interaction with this j atom. */
494             velecsum         = _mm256_add_ps(velecsum,velec);
495
496             fscal            = felec;
497
498             /* Calculate temporary vectorial force */
499             tx               = _mm256_mul_ps(fscal,dx10);
500             ty               = _mm256_mul_ps(fscal,dy10);
501             tz               = _mm256_mul_ps(fscal,dz10);
502
503             /* Update vectorial force */
504             fix1             = _mm256_add_ps(fix1,tx);
505             fiy1             = _mm256_add_ps(fiy1,ty);
506             fiz1             = _mm256_add_ps(fiz1,tz);
507
508             fjx0             = _mm256_add_ps(fjx0,tx);
509             fjy0             = _mm256_add_ps(fjy0,ty);
510             fjz0             = _mm256_add_ps(fjz0,tz);
511
512             /**************************
513              * CALCULATE INTERACTIONS *
514              **************************/
515
516             r11              = _mm256_mul_ps(rsq11,rinv11);
517
518             /* Calculate table index by multiplying r with table scale and truncate to integer */
519             rt               = _mm256_mul_ps(r11,vftabscale);
520             vfitab           = _mm256_cvttps_epi32(rt);
521             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
522             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
523             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
524             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
525             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
526             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
527
528             /* CUBIC SPLINE TABLE ELECTROSTATICS */
529             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
530                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
531             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
532                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
533             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
534                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
535             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
536                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
537             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
538             Heps             = _mm256_mul_ps(vfeps,H);
539             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
540             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
541             velec            = _mm256_mul_ps(qq11,VV);
542             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
543             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
544
545             /* Update potential sum for this i atom from the interaction with this j atom. */
546             velecsum         = _mm256_add_ps(velecsum,velec);
547
548             fscal            = felec;
549
550             /* Calculate temporary vectorial force */
551             tx               = _mm256_mul_ps(fscal,dx11);
552             ty               = _mm256_mul_ps(fscal,dy11);
553             tz               = _mm256_mul_ps(fscal,dz11);
554
555             /* Update vectorial force */
556             fix1             = _mm256_add_ps(fix1,tx);
557             fiy1             = _mm256_add_ps(fiy1,ty);
558             fiz1             = _mm256_add_ps(fiz1,tz);
559
560             fjx1             = _mm256_add_ps(fjx1,tx);
561             fjy1             = _mm256_add_ps(fjy1,ty);
562             fjz1             = _mm256_add_ps(fjz1,tz);
563
564             /**************************
565              * CALCULATE INTERACTIONS *
566              **************************/
567
568             r12              = _mm256_mul_ps(rsq12,rinv12);
569
570             /* Calculate table index by multiplying r with table scale and truncate to integer */
571             rt               = _mm256_mul_ps(r12,vftabscale);
572             vfitab           = _mm256_cvttps_epi32(rt);
573             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
574             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
575             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
576             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
577             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
578             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
579
580             /* CUBIC SPLINE TABLE ELECTROSTATICS */
581             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
582                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
583             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
584                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
585             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
586                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
587             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
588                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
589             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
590             Heps             = _mm256_mul_ps(vfeps,H);
591             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
592             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
593             velec            = _mm256_mul_ps(qq12,VV);
594             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
595             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
596
597             /* Update potential sum for this i atom from the interaction with this j atom. */
598             velecsum         = _mm256_add_ps(velecsum,velec);
599
600             fscal            = felec;
601
602             /* Calculate temporary vectorial force */
603             tx               = _mm256_mul_ps(fscal,dx12);
604             ty               = _mm256_mul_ps(fscal,dy12);
605             tz               = _mm256_mul_ps(fscal,dz12);
606
607             /* Update vectorial force */
608             fix1             = _mm256_add_ps(fix1,tx);
609             fiy1             = _mm256_add_ps(fiy1,ty);
610             fiz1             = _mm256_add_ps(fiz1,tz);
611
612             fjx2             = _mm256_add_ps(fjx2,tx);
613             fjy2             = _mm256_add_ps(fjy2,ty);
614             fjz2             = _mm256_add_ps(fjz2,tz);
615
616             /**************************
617              * CALCULATE INTERACTIONS *
618              **************************/
619
620             r20              = _mm256_mul_ps(rsq20,rinv20);
621
622             /* Calculate table index by multiplying r with table scale and truncate to integer */
623             rt               = _mm256_mul_ps(r20,vftabscale);
624             vfitab           = _mm256_cvttps_epi32(rt);
625             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
626             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
627             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
628             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
629             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
630             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
631
632             /* CUBIC SPLINE TABLE ELECTROSTATICS */
633             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
634                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
635             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
636                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
637             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
638                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
639             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
640                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
641             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
642             Heps             = _mm256_mul_ps(vfeps,H);
643             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
644             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
645             velec            = _mm256_mul_ps(qq20,VV);
646             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
647             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
648
649             /* Update potential sum for this i atom from the interaction with this j atom. */
650             velecsum         = _mm256_add_ps(velecsum,velec);
651
652             fscal            = felec;
653
654             /* Calculate temporary vectorial force */
655             tx               = _mm256_mul_ps(fscal,dx20);
656             ty               = _mm256_mul_ps(fscal,dy20);
657             tz               = _mm256_mul_ps(fscal,dz20);
658
659             /* Update vectorial force */
660             fix2             = _mm256_add_ps(fix2,tx);
661             fiy2             = _mm256_add_ps(fiy2,ty);
662             fiz2             = _mm256_add_ps(fiz2,tz);
663
664             fjx0             = _mm256_add_ps(fjx0,tx);
665             fjy0             = _mm256_add_ps(fjy0,ty);
666             fjz0             = _mm256_add_ps(fjz0,tz);
667
668             /**************************
669              * CALCULATE INTERACTIONS *
670              **************************/
671
672             r21              = _mm256_mul_ps(rsq21,rinv21);
673
674             /* Calculate table index by multiplying r with table scale and truncate to integer */
675             rt               = _mm256_mul_ps(r21,vftabscale);
676             vfitab           = _mm256_cvttps_epi32(rt);
677             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
678             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
679             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
680             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
681             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
682             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
683
684             /* CUBIC SPLINE TABLE ELECTROSTATICS */
685             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
686                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
687             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
688                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
689             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
690                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
691             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
692                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
693             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
694             Heps             = _mm256_mul_ps(vfeps,H);
695             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
696             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
697             velec            = _mm256_mul_ps(qq21,VV);
698             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
699             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
700
701             /* Update potential sum for this i atom from the interaction with this j atom. */
702             velecsum         = _mm256_add_ps(velecsum,velec);
703
704             fscal            = felec;
705
706             /* Calculate temporary vectorial force */
707             tx               = _mm256_mul_ps(fscal,dx21);
708             ty               = _mm256_mul_ps(fscal,dy21);
709             tz               = _mm256_mul_ps(fscal,dz21);
710
711             /* Update vectorial force */
712             fix2             = _mm256_add_ps(fix2,tx);
713             fiy2             = _mm256_add_ps(fiy2,ty);
714             fiz2             = _mm256_add_ps(fiz2,tz);
715
716             fjx1             = _mm256_add_ps(fjx1,tx);
717             fjy1             = _mm256_add_ps(fjy1,ty);
718             fjz1             = _mm256_add_ps(fjz1,tz);
719
720             /**************************
721              * CALCULATE INTERACTIONS *
722              **************************/
723
724             r22              = _mm256_mul_ps(rsq22,rinv22);
725
726             /* Calculate table index by multiplying r with table scale and truncate to integer */
727             rt               = _mm256_mul_ps(r22,vftabscale);
728             vfitab           = _mm256_cvttps_epi32(rt);
729             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
730             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
731             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
732             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
733             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
734             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
735
736             /* CUBIC SPLINE TABLE ELECTROSTATICS */
737             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
738                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
739             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
740                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
741             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
742                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
743             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
744                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
745             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
746             Heps             = _mm256_mul_ps(vfeps,H);
747             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
748             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
749             velec            = _mm256_mul_ps(qq22,VV);
750             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
751             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
752
753             /* Update potential sum for this i atom from the interaction with this j atom. */
754             velecsum         = _mm256_add_ps(velecsum,velec);
755
756             fscal            = felec;
757
758             /* Calculate temporary vectorial force */
759             tx               = _mm256_mul_ps(fscal,dx22);
760             ty               = _mm256_mul_ps(fscal,dy22);
761             tz               = _mm256_mul_ps(fscal,dz22);
762
763             /* Update vectorial force */
764             fix2             = _mm256_add_ps(fix2,tx);
765             fiy2             = _mm256_add_ps(fiy2,ty);
766             fiz2             = _mm256_add_ps(fiz2,tz);
767
768             fjx2             = _mm256_add_ps(fjx2,tx);
769             fjy2             = _mm256_add_ps(fjy2,ty);
770             fjz2             = _mm256_add_ps(fjz2,tz);
771
772             fjptrA             = f+j_coord_offsetA;
773             fjptrB             = f+j_coord_offsetB;
774             fjptrC             = f+j_coord_offsetC;
775             fjptrD             = f+j_coord_offsetD;
776             fjptrE             = f+j_coord_offsetE;
777             fjptrF             = f+j_coord_offsetF;
778             fjptrG             = f+j_coord_offsetG;
779             fjptrH             = f+j_coord_offsetH;
780
781             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
782                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
783
784             /* Inner loop uses 400 flops */
785         }
786
787         if(jidx<j_index_end)
788         {
789
790             /* Get j neighbor index, and coordinate index */
791             jnrlistA         = jjnr[jidx];
792             jnrlistB         = jjnr[jidx+1];
793             jnrlistC         = jjnr[jidx+2];
794             jnrlistD         = jjnr[jidx+3];
795             jnrlistE         = jjnr[jidx+4];
796             jnrlistF         = jjnr[jidx+5];
797             jnrlistG         = jjnr[jidx+6];
798             jnrlistH         = jjnr[jidx+7];
799             /* Sign of each element will be negative for non-real atoms.
800              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
801              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
802              */
803             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
804                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
805                                             
806             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
807             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
808             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
809             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
810             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
811             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
812             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
813             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
814             j_coord_offsetA  = DIM*jnrA;
815             j_coord_offsetB  = DIM*jnrB;
816             j_coord_offsetC  = DIM*jnrC;
817             j_coord_offsetD  = DIM*jnrD;
818             j_coord_offsetE  = DIM*jnrE;
819             j_coord_offsetF  = DIM*jnrF;
820             j_coord_offsetG  = DIM*jnrG;
821             j_coord_offsetH  = DIM*jnrH;
822
823             /* load j atom coordinates */
824             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
825                                                  x+j_coord_offsetC,x+j_coord_offsetD,
826                                                  x+j_coord_offsetE,x+j_coord_offsetF,
827                                                  x+j_coord_offsetG,x+j_coord_offsetH,
828                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
829
830             /* Calculate displacement vector */
831             dx00             = _mm256_sub_ps(ix0,jx0);
832             dy00             = _mm256_sub_ps(iy0,jy0);
833             dz00             = _mm256_sub_ps(iz0,jz0);
834             dx01             = _mm256_sub_ps(ix0,jx1);
835             dy01             = _mm256_sub_ps(iy0,jy1);
836             dz01             = _mm256_sub_ps(iz0,jz1);
837             dx02             = _mm256_sub_ps(ix0,jx2);
838             dy02             = _mm256_sub_ps(iy0,jy2);
839             dz02             = _mm256_sub_ps(iz0,jz2);
840             dx10             = _mm256_sub_ps(ix1,jx0);
841             dy10             = _mm256_sub_ps(iy1,jy0);
842             dz10             = _mm256_sub_ps(iz1,jz0);
843             dx11             = _mm256_sub_ps(ix1,jx1);
844             dy11             = _mm256_sub_ps(iy1,jy1);
845             dz11             = _mm256_sub_ps(iz1,jz1);
846             dx12             = _mm256_sub_ps(ix1,jx2);
847             dy12             = _mm256_sub_ps(iy1,jy2);
848             dz12             = _mm256_sub_ps(iz1,jz2);
849             dx20             = _mm256_sub_ps(ix2,jx0);
850             dy20             = _mm256_sub_ps(iy2,jy0);
851             dz20             = _mm256_sub_ps(iz2,jz0);
852             dx21             = _mm256_sub_ps(ix2,jx1);
853             dy21             = _mm256_sub_ps(iy2,jy1);
854             dz21             = _mm256_sub_ps(iz2,jz1);
855             dx22             = _mm256_sub_ps(ix2,jx2);
856             dy22             = _mm256_sub_ps(iy2,jy2);
857             dz22             = _mm256_sub_ps(iz2,jz2);
858
859             /* Calculate squared distance and things based on it */
860             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
861             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
862             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
863             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
864             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
865             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
866             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
867             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
868             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
869
870             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
871             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
872             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
873             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
874             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
875             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
876             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
877             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
878             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
879
880             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
881
882             fjx0             = _mm256_setzero_ps();
883             fjy0             = _mm256_setzero_ps();
884             fjz0             = _mm256_setzero_ps();
885             fjx1             = _mm256_setzero_ps();
886             fjy1             = _mm256_setzero_ps();
887             fjz1             = _mm256_setzero_ps();
888             fjx2             = _mm256_setzero_ps();
889             fjy2             = _mm256_setzero_ps();
890             fjz2             = _mm256_setzero_ps();
891
892             /**************************
893              * CALCULATE INTERACTIONS *
894              **************************/
895
896             r00              = _mm256_mul_ps(rsq00,rinv00);
897             r00              = _mm256_andnot_ps(dummy_mask,r00);
898
899             /* Calculate table index by multiplying r with table scale and truncate to integer */
900             rt               = _mm256_mul_ps(r00,vftabscale);
901             vfitab           = _mm256_cvttps_epi32(rt);
902             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
903             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
904             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
905             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
906             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
907             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
908
909             /* CUBIC SPLINE TABLE ELECTROSTATICS */
910             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
911                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
912             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
913                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
914             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
915                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
916             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
917                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
918             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
919             Heps             = _mm256_mul_ps(vfeps,H);
920             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
921             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
922             velec            = _mm256_mul_ps(qq00,VV);
923             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
924             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
925
926             /* LENNARD-JONES DISPERSION/REPULSION */
927
928             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
929             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
930             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
931             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
932             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
933
934             /* Update potential sum for this i atom from the interaction with this j atom. */
935             velec            = _mm256_andnot_ps(dummy_mask,velec);
936             velecsum         = _mm256_add_ps(velecsum,velec);
937             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
938             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
939
940             fscal            = _mm256_add_ps(felec,fvdw);
941
942             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
943
944             /* Calculate temporary vectorial force */
945             tx               = _mm256_mul_ps(fscal,dx00);
946             ty               = _mm256_mul_ps(fscal,dy00);
947             tz               = _mm256_mul_ps(fscal,dz00);
948
949             /* Update vectorial force */
950             fix0             = _mm256_add_ps(fix0,tx);
951             fiy0             = _mm256_add_ps(fiy0,ty);
952             fiz0             = _mm256_add_ps(fiz0,tz);
953
954             fjx0             = _mm256_add_ps(fjx0,tx);
955             fjy0             = _mm256_add_ps(fjy0,ty);
956             fjz0             = _mm256_add_ps(fjz0,tz);
957
958             /**************************
959              * CALCULATE INTERACTIONS *
960              **************************/
961
962             r01              = _mm256_mul_ps(rsq01,rinv01);
963             r01              = _mm256_andnot_ps(dummy_mask,r01);
964
965             /* Calculate table index by multiplying r with table scale and truncate to integer */
966             rt               = _mm256_mul_ps(r01,vftabscale);
967             vfitab           = _mm256_cvttps_epi32(rt);
968             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
969             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
970             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
971             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
972             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
973             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
974
975             /* CUBIC SPLINE TABLE ELECTROSTATICS */
976             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
977                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
978             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
979                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
980             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
981                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
982             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
983                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
984             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
985             Heps             = _mm256_mul_ps(vfeps,H);
986             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
987             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
988             velec            = _mm256_mul_ps(qq01,VV);
989             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
990             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
991
992             /* Update potential sum for this i atom from the interaction with this j atom. */
993             velec            = _mm256_andnot_ps(dummy_mask,velec);
994             velecsum         = _mm256_add_ps(velecsum,velec);
995
996             fscal            = felec;
997
998             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
999
1000             /* Calculate temporary vectorial force */
1001             tx               = _mm256_mul_ps(fscal,dx01);
1002             ty               = _mm256_mul_ps(fscal,dy01);
1003             tz               = _mm256_mul_ps(fscal,dz01);
1004
1005             /* Update vectorial force */
1006             fix0             = _mm256_add_ps(fix0,tx);
1007             fiy0             = _mm256_add_ps(fiy0,ty);
1008             fiz0             = _mm256_add_ps(fiz0,tz);
1009
1010             fjx1             = _mm256_add_ps(fjx1,tx);
1011             fjy1             = _mm256_add_ps(fjy1,ty);
1012             fjz1             = _mm256_add_ps(fjz1,tz);
1013
1014             /**************************
1015              * CALCULATE INTERACTIONS *
1016              **************************/
1017
1018             r02              = _mm256_mul_ps(rsq02,rinv02);
1019             r02              = _mm256_andnot_ps(dummy_mask,r02);
1020
1021             /* Calculate table index by multiplying r with table scale and truncate to integer */
1022             rt               = _mm256_mul_ps(r02,vftabscale);
1023             vfitab           = _mm256_cvttps_epi32(rt);
1024             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1025             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1026             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1027             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1028             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1029             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1030
1031             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1032             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1033                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1034             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1035                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1036             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1037                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1038             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1039                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1040             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1041             Heps             = _mm256_mul_ps(vfeps,H);
1042             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1043             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1044             velec            = _mm256_mul_ps(qq02,VV);
1045             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1046             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1047
1048             /* Update potential sum for this i atom from the interaction with this j atom. */
1049             velec            = _mm256_andnot_ps(dummy_mask,velec);
1050             velecsum         = _mm256_add_ps(velecsum,velec);
1051
1052             fscal            = felec;
1053
1054             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1055
1056             /* Calculate temporary vectorial force */
1057             tx               = _mm256_mul_ps(fscal,dx02);
1058             ty               = _mm256_mul_ps(fscal,dy02);
1059             tz               = _mm256_mul_ps(fscal,dz02);
1060
1061             /* Update vectorial force */
1062             fix0             = _mm256_add_ps(fix0,tx);
1063             fiy0             = _mm256_add_ps(fiy0,ty);
1064             fiz0             = _mm256_add_ps(fiz0,tz);
1065
1066             fjx2             = _mm256_add_ps(fjx2,tx);
1067             fjy2             = _mm256_add_ps(fjy2,ty);
1068             fjz2             = _mm256_add_ps(fjz2,tz);
1069
1070             /**************************
1071              * CALCULATE INTERACTIONS *
1072              **************************/
1073
1074             r10              = _mm256_mul_ps(rsq10,rinv10);
1075             r10              = _mm256_andnot_ps(dummy_mask,r10);
1076
1077             /* Calculate table index by multiplying r with table scale and truncate to integer */
1078             rt               = _mm256_mul_ps(r10,vftabscale);
1079             vfitab           = _mm256_cvttps_epi32(rt);
1080             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1081             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1082             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1083             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1084             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1085             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1086
1087             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1088             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1089                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1090             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1091                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1092             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1093                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1094             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1095                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1096             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1097             Heps             = _mm256_mul_ps(vfeps,H);
1098             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1099             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1100             velec            = _mm256_mul_ps(qq10,VV);
1101             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1102             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1103
1104             /* Update potential sum for this i atom from the interaction with this j atom. */
1105             velec            = _mm256_andnot_ps(dummy_mask,velec);
1106             velecsum         = _mm256_add_ps(velecsum,velec);
1107
1108             fscal            = felec;
1109
1110             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1111
1112             /* Calculate temporary vectorial force */
1113             tx               = _mm256_mul_ps(fscal,dx10);
1114             ty               = _mm256_mul_ps(fscal,dy10);
1115             tz               = _mm256_mul_ps(fscal,dz10);
1116
1117             /* Update vectorial force */
1118             fix1             = _mm256_add_ps(fix1,tx);
1119             fiy1             = _mm256_add_ps(fiy1,ty);
1120             fiz1             = _mm256_add_ps(fiz1,tz);
1121
1122             fjx0             = _mm256_add_ps(fjx0,tx);
1123             fjy0             = _mm256_add_ps(fjy0,ty);
1124             fjz0             = _mm256_add_ps(fjz0,tz);
1125
1126             /**************************
1127              * CALCULATE INTERACTIONS *
1128              **************************/
1129
1130             r11              = _mm256_mul_ps(rsq11,rinv11);
1131             r11              = _mm256_andnot_ps(dummy_mask,r11);
1132
1133             /* Calculate table index by multiplying r with table scale and truncate to integer */
1134             rt               = _mm256_mul_ps(r11,vftabscale);
1135             vfitab           = _mm256_cvttps_epi32(rt);
1136             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1137             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1138             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1139             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1140             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1141             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1142
1143             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1144             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1145                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1146             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1147                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1148             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1149                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1150             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1151                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1152             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1153             Heps             = _mm256_mul_ps(vfeps,H);
1154             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1155             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1156             velec            = _mm256_mul_ps(qq11,VV);
1157             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1158             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1159
1160             /* Update potential sum for this i atom from the interaction with this j atom. */
1161             velec            = _mm256_andnot_ps(dummy_mask,velec);
1162             velecsum         = _mm256_add_ps(velecsum,velec);
1163
1164             fscal            = felec;
1165
1166             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1167
1168             /* Calculate temporary vectorial force */
1169             tx               = _mm256_mul_ps(fscal,dx11);
1170             ty               = _mm256_mul_ps(fscal,dy11);
1171             tz               = _mm256_mul_ps(fscal,dz11);
1172
1173             /* Update vectorial force */
1174             fix1             = _mm256_add_ps(fix1,tx);
1175             fiy1             = _mm256_add_ps(fiy1,ty);
1176             fiz1             = _mm256_add_ps(fiz1,tz);
1177
1178             fjx1             = _mm256_add_ps(fjx1,tx);
1179             fjy1             = _mm256_add_ps(fjy1,ty);
1180             fjz1             = _mm256_add_ps(fjz1,tz);
1181
1182             /**************************
1183              * CALCULATE INTERACTIONS *
1184              **************************/
1185
1186             r12              = _mm256_mul_ps(rsq12,rinv12);
1187             r12              = _mm256_andnot_ps(dummy_mask,r12);
1188
1189             /* Calculate table index by multiplying r with table scale and truncate to integer */
1190             rt               = _mm256_mul_ps(r12,vftabscale);
1191             vfitab           = _mm256_cvttps_epi32(rt);
1192             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1193             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1194             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1195             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1196             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1197             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1198
1199             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1200             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1201                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1202             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1203                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1204             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1205                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1206             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1207                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1208             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1209             Heps             = _mm256_mul_ps(vfeps,H);
1210             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1211             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1212             velec            = _mm256_mul_ps(qq12,VV);
1213             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1214             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1215
1216             /* Update potential sum for this i atom from the interaction with this j atom. */
1217             velec            = _mm256_andnot_ps(dummy_mask,velec);
1218             velecsum         = _mm256_add_ps(velecsum,velec);
1219
1220             fscal            = felec;
1221
1222             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1223
1224             /* Calculate temporary vectorial force */
1225             tx               = _mm256_mul_ps(fscal,dx12);
1226             ty               = _mm256_mul_ps(fscal,dy12);
1227             tz               = _mm256_mul_ps(fscal,dz12);
1228
1229             /* Update vectorial force */
1230             fix1             = _mm256_add_ps(fix1,tx);
1231             fiy1             = _mm256_add_ps(fiy1,ty);
1232             fiz1             = _mm256_add_ps(fiz1,tz);
1233
1234             fjx2             = _mm256_add_ps(fjx2,tx);
1235             fjy2             = _mm256_add_ps(fjy2,ty);
1236             fjz2             = _mm256_add_ps(fjz2,tz);
1237
1238             /**************************
1239              * CALCULATE INTERACTIONS *
1240              **************************/
1241
1242             r20              = _mm256_mul_ps(rsq20,rinv20);
1243             r20              = _mm256_andnot_ps(dummy_mask,r20);
1244
1245             /* Calculate table index by multiplying r with table scale and truncate to integer */
1246             rt               = _mm256_mul_ps(r20,vftabscale);
1247             vfitab           = _mm256_cvttps_epi32(rt);
1248             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1249             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1250             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1251             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1252             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1253             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1254
1255             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1256             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1257                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1258             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1259                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1260             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1261                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1262             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1263                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1264             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1265             Heps             = _mm256_mul_ps(vfeps,H);
1266             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1267             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1268             velec            = _mm256_mul_ps(qq20,VV);
1269             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1270             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1271
1272             /* Update potential sum for this i atom from the interaction with this j atom. */
1273             velec            = _mm256_andnot_ps(dummy_mask,velec);
1274             velecsum         = _mm256_add_ps(velecsum,velec);
1275
1276             fscal            = felec;
1277
1278             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1279
1280             /* Calculate temporary vectorial force */
1281             tx               = _mm256_mul_ps(fscal,dx20);
1282             ty               = _mm256_mul_ps(fscal,dy20);
1283             tz               = _mm256_mul_ps(fscal,dz20);
1284
1285             /* Update vectorial force */
1286             fix2             = _mm256_add_ps(fix2,tx);
1287             fiy2             = _mm256_add_ps(fiy2,ty);
1288             fiz2             = _mm256_add_ps(fiz2,tz);
1289
1290             fjx0             = _mm256_add_ps(fjx0,tx);
1291             fjy0             = _mm256_add_ps(fjy0,ty);
1292             fjz0             = _mm256_add_ps(fjz0,tz);
1293
1294             /**************************
1295              * CALCULATE INTERACTIONS *
1296              **************************/
1297
1298             r21              = _mm256_mul_ps(rsq21,rinv21);
1299             r21              = _mm256_andnot_ps(dummy_mask,r21);
1300
1301             /* Calculate table index by multiplying r with table scale and truncate to integer */
1302             rt               = _mm256_mul_ps(r21,vftabscale);
1303             vfitab           = _mm256_cvttps_epi32(rt);
1304             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1305             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1306             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1307             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1308             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1309             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1310
1311             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1312             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1313                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1314             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1315                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1316             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1317                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1318             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1319                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1320             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1321             Heps             = _mm256_mul_ps(vfeps,H);
1322             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1323             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1324             velec            = _mm256_mul_ps(qq21,VV);
1325             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1326             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1327
1328             /* Update potential sum for this i atom from the interaction with this j atom. */
1329             velec            = _mm256_andnot_ps(dummy_mask,velec);
1330             velecsum         = _mm256_add_ps(velecsum,velec);
1331
1332             fscal            = felec;
1333
1334             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1335
1336             /* Calculate temporary vectorial force */
1337             tx               = _mm256_mul_ps(fscal,dx21);
1338             ty               = _mm256_mul_ps(fscal,dy21);
1339             tz               = _mm256_mul_ps(fscal,dz21);
1340
1341             /* Update vectorial force */
1342             fix2             = _mm256_add_ps(fix2,tx);
1343             fiy2             = _mm256_add_ps(fiy2,ty);
1344             fiz2             = _mm256_add_ps(fiz2,tz);
1345
1346             fjx1             = _mm256_add_ps(fjx1,tx);
1347             fjy1             = _mm256_add_ps(fjy1,ty);
1348             fjz1             = _mm256_add_ps(fjz1,tz);
1349
1350             /**************************
1351              * CALCULATE INTERACTIONS *
1352              **************************/
1353
1354             r22              = _mm256_mul_ps(rsq22,rinv22);
1355             r22              = _mm256_andnot_ps(dummy_mask,r22);
1356
1357             /* Calculate table index by multiplying r with table scale and truncate to integer */
1358             rt               = _mm256_mul_ps(r22,vftabscale);
1359             vfitab           = _mm256_cvttps_epi32(rt);
1360             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1361             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1362             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1363             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1364             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1365             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1366
1367             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1368             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1369                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1370             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1371                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1372             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1373                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1374             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1375                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1376             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1377             Heps             = _mm256_mul_ps(vfeps,H);
1378             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1379             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1380             velec            = _mm256_mul_ps(qq22,VV);
1381             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1382             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1383
1384             /* Update potential sum for this i atom from the interaction with this j atom. */
1385             velec            = _mm256_andnot_ps(dummy_mask,velec);
1386             velecsum         = _mm256_add_ps(velecsum,velec);
1387
1388             fscal            = felec;
1389
1390             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1391
1392             /* Calculate temporary vectorial force */
1393             tx               = _mm256_mul_ps(fscal,dx22);
1394             ty               = _mm256_mul_ps(fscal,dy22);
1395             tz               = _mm256_mul_ps(fscal,dz22);
1396
1397             /* Update vectorial force */
1398             fix2             = _mm256_add_ps(fix2,tx);
1399             fiy2             = _mm256_add_ps(fiy2,ty);
1400             fiz2             = _mm256_add_ps(fiz2,tz);
1401
1402             fjx2             = _mm256_add_ps(fjx2,tx);
1403             fjy2             = _mm256_add_ps(fjy2,ty);
1404             fjz2             = _mm256_add_ps(fjz2,tz);
1405
1406             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1407             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1408             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1409             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1410             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1411             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1412             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1413             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1414
1415             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1416                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1417
1418             /* Inner loop uses 409 flops */
1419         }
1420
1421         /* End of innermost loop */
1422
1423         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1424                                                  f+i_coord_offset,fshift+i_shift_offset);
1425
1426         ggid                        = gid[iidx];
1427         /* Update potential energies */
1428         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1429         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1430
1431         /* Increment number of inner iterations */
1432         inneriter                  += j_index_end - j_index_start;
1433
1434         /* Outer loop uses 20 flops */
1435     }
1436
1437     /* Increment number of outer iterations */
1438     outeriter        += nri;
1439
1440     /* Update outer/inner flops */
1441
1442     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1443 }
1444 /*
1445  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_single
1446  * Electrostatics interaction: CubicSplineTable
1447  * VdW interaction:            LennardJones
1448  * Geometry:                   Water3-Water3
1449  * Calculate force/pot:        Force
1450  */
1451 void
1452 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_single
1453                     (t_nblist * gmx_restrict                nlist,
1454                      rvec * gmx_restrict                    xx,
1455                      rvec * gmx_restrict                    ff,
1456                      t_forcerec * gmx_restrict              fr,
1457                      t_mdatoms * gmx_restrict               mdatoms,
1458                      nb_kernel_data_t * gmx_restrict        kernel_data,
1459                      t_nrnb * gmx_restrict                  nrnb)
1460 {
1461     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1462      * just 0 for non-waters.
1463      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1464      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1465      */
1466     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1467     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1468     int              jnrA,jnrB,jnrC,jnrD;
1469     int              jnrE,jnrF,jnrG,jnrH;
1470     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1471     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1472     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1473     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1474     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1475     real             rcutoff_scalar;
1476     real             *shiftvec,*fshift,*x,*f;
1477     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1478     real             scratch[4*DIM];
1479     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1480     real *           vdwioffsetptr0;
1481     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1482     real *           vdwioffsetptr1;
1483     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1484     real *           vdwioffsetptr2;
1485     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1486     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1487     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1488     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1489     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1490     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1491     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1492     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1493     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1494     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1495     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1496     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1497     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1498     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1499     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1500     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1501     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1502     real             *charge;
1503     int              nvdwtype;
1504     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1505     int              *vdwtype;
1506     real             *vdwparam;
1507     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1508     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1509     __m256i          vfitab;
1510     __m128i          vfitab_lo,vfitab_hi;
1511     __m128i          ifour       = _mm_set1_epi32(4);
1512     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1513     real             *vftab;
1514     __m256           dummy_mask,cutoff_mask;
1515     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1516     __m256           one     = _mm256_set1_ps(1.0);
1517     __m256           two     = _mm256_set1_ps(2.0);
1518     x                = xx[0];
1519     f                = ff[0];
1520
1521     nri              = nlist->nri;
1522     iinr             = nlist->iinr;
1523     jindex           = nlist->jindex;
1524     jjnr             = nlist->jjnr;
1525     shiftidx         = nlist->shift;
1526     gid              = nlist->gid;
1527     shiftvec         = fr->shift_vec[0];
1528     fshift           = fr->fshift[0];
1529     facel            = _mm256_set1_ps(fr->epsfac);
1530     charge           = mdatoms->chargeA;
1531     nvdwtype         = fr->ntype;
1532     vdwparam         = fr->nbfp;
1533     vdwtype          = mdatoms->typeA;
1534
1535     vftab            = kernel_data->table_elec->data;
1536     vftabscale       = _mm256_set1_ps(kernel_data->table_elec->scale);
1537
1538     /* Setup water-specific parameters */
1539     inr              = nlist->iinr[0];
1540     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1541     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1542     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1543     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1544
1545     jq0              = _mm256_set1_ps(charge[inr+0]);
1546     jq1              = _mm256_set1_ps(charge[inr+1]);
1547     jq2              = _mm256_set1_ps(charge[inr+2]);
1548     vdwjidx0A        = 2*vdwtype[inr+0];
1549     qq00             = _mm256_mul_ps(iq0,jq0);
1550     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1551     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1552     qq01             = _mm256_mul_ps(iq0,jq1);
1553     qq02             = _mm256_mul_ps(iq0,jq2);
1554     qq10             = _mm256_mul_ps(iq1,jq0);
1555     qq11             = _mm256_mul_ps(iq1,jq1);
1556     qq12             = _mm256_mul_ps(iq1,jq2);
1557     qq20             = _mm256_mul_ps(iq2,jq0);
1558     qq21             = _mm256_mul_ps(iq2,jq1);
1559     qq22             = _mm256_mul_ps(iq2,jq2);
1560
1561     /* Avoid stupid compiler warnings */
1562     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1563     j_coord_offsetA = 0;
1564     j_coord_offsetB = 0;
1565     j_coord_offsetC = 0;
1566     j_coord_offsetD = 0;
1567     j_coord_offsetE = 0;
1568     j_coord_offsetF = 0;
1569     j_coord_offsetG = 0;
1570     j_coord_offsetH = 0;
1571
1572     outeriter        = 0;
1573     inneriter        = 0;
1574
1575     for(iidx=0;iidx<4*DIM;iidx++)
1576     {
1577         scratch[iidx] = 0.0;
1578     }
1579
1580     /* Start outer loop over neighborlists */
1581     for(iidx=0; iidx<nri; iidx++)
1582     {
1583         /* Load shift vector for this list */
1584         i_shift_offset   = DIM*shiftidx[iidx];
1585
1586         /* Load limits for loop over neighbors */
1587         j_index_start    = jindex[iidx];
1588         j_index_end      = jindex[iidx+1];
1589
1590         /* Get outer coordinate index */
1591         inr              = iinr[iidx];
1592         i_coord_offset   = DIM*inr;
1593
1594         /* Load i particle coords and add shift vector */
1595         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1596                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1597
1598         fix0             = _mm256_setzero_ps();
1599         fiy0             = _mm256_setzero_ps();
1600         fiz0             = _mm256_setzero_ps();
1601         fix1             = _mm256_setzero_ps();
1602         fiy1             = _mm256_setzero_ps();
1603         fiz1             = _mm256_setzero_ps();
1604         fix2             = _mm256_setzero_ps();
1605         fiy2             = _mm256_setzero_ps();
1606         fiz2             = _mm256_setzero_ps();
1607
1608         /* Start inner kernel loop */
1609         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1610         {
1611
1612             /* Get j neighbor index, and coordinate index */
1613             jnrA             = jjnr[jidx];
1614             jnrB             = jjnr[jidx+1];
1615             jnrC             = jjnr[jidx+2];
1616             jnrD             = jjnr[jidx+3];
1617             jnrE             = jjnr[jidx+4];
1618             jnrF             = jjnr[jidx+5];
1619             jnrG             = jjnr[jidx+6];
1620             jnrH             = jjnr[jidx+7];
1621             j_coord_offsetA  = DIM*jnrA;
1622             j_coord_offsetB  = DIM*jnrB;
1623             j_coord_offsetC  = DIM*jnrC;
1624             j_coord_offsetD  = DIM*jnrD;
1625             j_coord_offsetE  = DIM*jnrE;
1626             j_coord_offsetF  = DIM*jnrF;
1627             j_coord_offsetG  = DIM*jnrG;
1628             j_coord_offsetH  = DIM*jnrH;
1629
1630             /* load j atom coordinates */
1631             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1632                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1633                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1634                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1635                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1636
1637             /* Calculate displacement vector */
1638             dx00             = _mm256_sub_ps(ix0,jx0);
1639             dy00             = _mm256_sub_ps(iy0,jy0);
1640             dz00             = _mm256_sub_ps(iz0,jz0);
1641             dx01             = _mm256_sub_ps(ix0,jx1);
1642             dy01             = _mm256_sub_ps(iy0,jy1);
1643             dz01             = _mm256_sub_ps(iz0,jz1);
1644             dx02             = _mm256_sub_ps(ix0,jx2);
1645             dy02             = _mm256_sub_ps(iy0,jy2);
1646             dz02             = _mm256_sub_ps(iz0,jz2);
1647             dx10             = _mm256_sub_ps(ix1,jx0);
1648             dy10             = _mm256_sub_ps(iy1,jy0);
1649             dz10             = _mm256_sub_ps(iz1,jz0);
1650             dx11             = _mm256_sub_ps(ix1,jx1);
1651             dy11             = _mm256_sub_ps(iy1,jy1);
1652             dz11             = _mm256_sub_ps(iz1,jz1);
1653             dx12             = _mm256_sub_ps(ix1,jx2);
1654             dy12             = _mm256_sub_ps(iy1,jy2);
1655             dz12             = _mm256_sub_ps(iz1,jz2);
1656             dx20             = _mm256_sub_ps(ix2,jx0);
1657             dy20             = _mm256_sub_ps(iy2,jy0);
1658             dz20             = _mm256_sub_ps(iz2,jz0);
1659             dx21             = _mm256_sub_ps(ix2,jx1);
1660             dy21             = _mm256_sub_ps(iy2,jy1);
1661             dz21             = _mm256_sub_ps(iz2,jz1);
1662             dx22             = _mm256_sub_ps(ix2,jx2);
1663             dy22             = _mm256_sub_ps(iy2,jy2);
1664             dz22             = _mm256_sub_ps(iz2,jz2);
1665
1666             /* Calculate squared distance and things based on it */
1667             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1668             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1669             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1670             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1671             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1672             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1673             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1674             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1675             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1676
1677             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1678             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1679             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1680             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1681             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1682             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1683             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1684             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1685             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1686
1687             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1688
1689             fjx0             = _mm256_setzero_ps();
1690             fjy0             = _mm256_setzero_ps();
1691             fjz0             = _mm256_setzero_ps();
1692             fjx1             = _mm256_setzero_ps();
1693             fjy1             = _mm256_setzero_ps();
1694             fjz1             = _mm256_setzero_ps();
1695             fjx2             = _mm256_setzero_ps();
1696             fjy2             = _mm256_setzero_ps();
1697             fjz2             = _mm256_setzero_ps();
1698
1699             /**************************
1700              * CALCULATE INTERACTIONS *
1701              **************************/
1702
1703             r00              = _mm256_mul_ps(rsq00,rinv00);
1704
1705             /* Calculate table index by multiplying r with table scale and truncate to integer */
1706             rt               = _mm256_mul_ps(r00,vftabscale);
1707             vfitab           = _mm256_cvttps_epi32(rt);
1708             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1709             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1710             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1711             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1712             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1713             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1714
1715             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1716             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1717                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1718             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1719                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1720             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1721                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1722             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1723                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1724             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1725             Heps             = _mm256_mul_ps(vfeps,H);
1726             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1727             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1728             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
1729
1730             /* LENNARD-JONES DISPERSION/REPULSION */
1731
1732             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1733             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1734
1735             fscal            = _mm256_add_ps(felec,fvdw);
1736
1737             /* Calculate temporary vectorial force */
1738             tx               = _mm256_mul_ps(fscal,dx00);
1739             ty               = _mm256_mul_ps(fscal,dy00);
1740             tz               = _mm256_mul_ps(fscal,dz00);
1741
1742             /* Update vectorial force */
1743             fix0             = _mm256_add_ps(fix0,tx);
1744             fiy0             = _mm256_add_ps(fiy0,ty);
1745             fiz0             = _mm256_add_ps(fiz0,tz);
1746
1747             fjx0             = _mm256_add_ps(fjx0,tx);
1748             fjy0             = _mm256_add_ps(fjy0,ty);
1749             fjz0             = _mm256_add_ps(fjz0,tz);
1750
1751             /**************************
1752              * CALCULATE INTERACTIONS *
1753              **************************/
1754
1755             r01              = _mm256_mul_ps(rsq01,rinv01);
1756
1757             /* Calculate table index by multiplying r with table scale and truncate to integer */
1758             rt               = _mm256_mul_ps(r01,vftabscale);
1759             vfitab           = _mm256_cvttps_epi32(rt);
1760             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1761             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1762             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1763             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1764             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1765             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1766
1767             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1768             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1769                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1770             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1771                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1772             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1773                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1774             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1775                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1776             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1777             Heps             = _mm256_mul_ps(vfeps,H);
1778             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1779             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1780             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1781
1782             fscal            = felec;
1783
1784             /* Calculate temporary vectorial force */
1785             tx               = _mm256_mul_ps(fscal,dx01);
1786             ty               = _mm256_mul_ps(fscal,dy01);
1787             tz               = _mm256_mul_ps(fscal,dz01);
1788
1789             /* Update vectorial force */
1790             fix0             = _mm256_add_ps(fix0,tx);
1791             fiy0             = _mm256_add_ps(fiy0,ty);
1792             fiz0             = _mm256_add_ps(fiz0,tz);
1793
1794             fjx1             = _mm256_add_ps(fjx1,tx);
1795             fjy1             = _mm256_add_ps(fjy1,ty);
1796             fjz1             = _mm256_add_ps(fjz1,tz);
1797
1798             /**************************
1799              * CALCULATE INTERACTIONS *
1800              **************************/
1801
1802             r02              = _mm256_mul_ps(rsq02,rinv02);
1803
1804             /* Calculate table index by multiplying r with table scale and truncate to integer */
1805             rt               = _mm256_mul_ps(r02,vftabscale);
1806             vfitab           = _mm256_cvttps_epi32(rt);
1807             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1808             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1809             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1810             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1811             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1812             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1813
1814             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1815             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1816                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1817             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1818                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1819             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1820                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1821             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1822                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1823             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1824             Heps             = _mm256_mul_ps(vfeps,H);
1825             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1826             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1827             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1828
1829             fscal            = felec;
1830
1831             /* Calculate temporary vectorial force */
1832             tx               = _mm256_mul_ps(fscal,dx02);
1833             ty               = _mm256_mul_ps(fscal,dy02);
1834             tz               = _mm256_mul_ps(fscal,dz02);
1835
1836             /* Update vectorial force */
1837             fix0             = _mm256_add_ps(fix0,tx);
1838             fiy0             = _mm256_add_ps(fiy0,ty);
1839             fiz0             = _mm256_add_ps(fiz0,tz);
1840
1841             fjx2             = _mm256_add_ps(fjx2,tx);
1842             fjy2             = _mm256_add_ps(fjy2,ty);
1843             fjz2             = _mm256_add_ps(fjz2,tz);
1844
1845             /**************************
1846              * CALCULATE INTERACTIONS *
1847              **************************/
1848
1849             r10              = _mm256_mul_ps(rsq10,rinv10);
1850
1851             /* Calculate table index by multiplying r with table scale and truncate to integer */
1852             rt               = _mm256_mul_ps(r10,vftabscale);
1853             vfitab           = _mm256_cvttps_epi32(rt);
1854             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1855             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1856             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1857             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1858             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1859             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1860
1861             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1862             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1863                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1864             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1865                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1866             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1867                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1868             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1869                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1870             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1871             Heps             = _mm256_mul_ps(vfeps,H);
1872             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1873             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1874             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1875
1876             fscal            = felec;
1877
1878             /* Calculate temporary vectorial force */
1879             tx               = _mm256_mul_ps(fscal,dx10);
1880             ty               = _mm256_mul_ps(fscal,dy10);
1881             tz               = _mm256_mul_ps(fscal,dz10);
1882
1883             /* Update vectorial force */
1884             fix1             = _mm256_add_ps(fix1,tx);
1885             fiy1             = _mm256_add_ps(fiy1,ty);
1886             fiz1             = _mm256_add_ps(fiz1,tz);
1887
1888             fjx0             = _mm256_add_ps(fjx0,tx);
1889             fjy0             = _mm256_add_ps(fjy0,ty);
1890             fjz0             = _mm256_add_ps(fjz0,tz);
1891
1892             /**************************
1893              * CALCULATE INTERACTIONS *
1894              **************************/
1895
1896             r11              = _mm256_mul_ps(rsq11,rinv11);
1897
1898             /* Calculate table index by multiplying r with table scale and truncate to integer */
1899             rt               = _mm256_mul_ps(r11,vftabscale);
1900             vfitab           = _mm256_cvttps_epi32(rt);
1901             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1902             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1903             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1904             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1905             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1906             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1907
1908             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1909             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1910                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1911             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1912                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1913             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1914                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1915             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1916                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1917             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1918             Heps             = _mm256_mul_ps(vfeps,H);
1919             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1920             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1921             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1922
1923             fscal            = felec;
1924
1925             /* Calculate temporary vectorial force */
1926             tx               = _mm256_mul_ps(fscal,dx11);
1927             ty               = _mm256_mul_ps(fscal,dy11);
1928             tz               = _mm256_mul_ps(fscal,dz11);
1929
1930             /* Update vectorial force */
1931             fix1             = _mm256_add_ps(fix1,tx);
1932             fiy1             = _mm256_add_ps(fiy1,ty);
1933             fiz1             = _mm256_add_ps(fiz1,tz);
1934
1935             fjx1             = _mm256_add_ps(fjx1,tx);
1936             fjy1             = _mm256_add_ps(fjy1,ty);
1937             fjz1             = _mm256_add_ps(fjz1,tz);
1938
1939             /**************************
1940              * CALCULATE INTERACTIONS *
1941              **************************/
1942
1943             r12              = _mm256_mul_ps(rsq12,rinv12);
1944
1945             /* Calculate table index by multiplying r with table scale and truncate to integer */
1946             rt               = _mm256_mul_ps(r12,vftabscale);
1947             vfitab           = _mm256_cvttps_epi32(rt);
1948             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1949             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1950             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1951             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1952             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1953             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1954
1955             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1956             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1957                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1958             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1959                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1960             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1961                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1962             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1963                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1964             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1965             Heps             = _mm256_mul_ps(vfeps,H);
1966             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1967             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1968             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1969
1970             fscal            = felec;
1971
1972             /* Calculate temporary vectorial force */
1973             tx               = _mm256_mul_ps(fscal,dx12);
1974             ty               = _mm256_mul_ps(fscal,dy12);
1975             tz               = _mm256_mul_ps(fscal,dz12);
1976
1977             /* Update vectorial force */
1978             fix1             = _mm256_add_ps(fix1,tx);
1979             fiy1             = _mm256_add_ps(fiy1,ty);
1980             fiz1             = _mm256_add_ps(fiz1,tz);
1981
1982             fjx2             = _mm256_add_ps(fjx2,tx);
1983             fjy2             = _mm256_add_ps(fjy2,ty);
1984             fjz2             = _mm256_add_ps(fjz2,tz);
1985
1986             /**************************
1987              * CALCULATE INTERACTIONS *
1988              **************************/
1989
1990             r20              = _mm256_mul_ps(rsq20,rinv20);
1991
1992             /* Calculate table index by multiplying r with table scale and truncate to integer */
1993             rt               = _mm256_mul_ps(r20,vftabscale);
1994             vfitab           = _mm256_cvttps_epi32(rt);
1995             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1996             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1997             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1998             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1999             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2000             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2001
2002             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2003             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2004                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2005             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2006                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2007             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2008                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2009             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2010                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2011             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2012             Heps             = _mm256_mul_ps(vfeps,H);
2013             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2014             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2015             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2016
2017             fscal            = felec;
2018
2019             /* Calculate temporary vectorial force */
2020             tx               = _mm256_mul_ps(fscal,dx20);
2021             ty               = _mm256_mul_ps(fscal,dy20);
2022             tz               = _mm256_mul_ps(fscal,dz20);
2023
2024             /* Update vectorial force */
2025             fix2             = _mm256_add_ps(fix2,tx);
2026             fiy2             = _mm256_add_ps(fiy2,ty);
2027             fiz2             = _mm256_add_ps(fiz2,tz);
2028
2029             fjx0             = _mm256_add_ps(fjx0,tx);
2030             fjy0             = _mm256_add_ps(fjy0,ty);
2031             fjz0             = _mm256_add_ps(fjz0,tz);
2032
2033             /**************************
2034              * CALCULATE INTERACTIONS *
2035              **************************/
2036
2037             r21              = _mm256_mul_ps(rsq21,rinv21);
2038
2039             /* Calculate table index by multiplying r with table scale and truncate to integer */
2040             rt               = _mm256_mul_ps(r21,vftabscale);
2041             vfitab           = _mm256_cvttps_epi32(rt);
2042             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2043             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2044             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2045             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2046             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2047             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2048
2049             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2050             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2051                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2052             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2053                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2054             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2055                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2056             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2057                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2058             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2059             Heps             = _mm256_mul_ps(vfeps,H);
2060             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2061             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2062             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2063
2064             fscal            = felec;
2065
2066             /* Calculate temporary vectorial force */
2067             tx               = _mm256_mul_ps(fscal,dx21);
2068             ty               = _mm256_mul_ps(fscal,dy21);
2069             tz               = _mm256_mul_ps(fscal,dz21);
2070
2071             /* Update vectorial force */
2072             fix2             = _mm256_add_ps(fix2,tx);
2073             fiy2             = _mm256_add_ps(fiy2,ty);
2074             fiz2             = _mm256_add_ps(fiz2,tz);
2075
2076             fjx1             = _mm256_add_ps(fjx1,tx);
2077             fjy1             = _mm256_add_ps(fjy1,ty);
2078             fjz1             = _mm256_add_ps(fjz1,tz);
2079
2080             /**************************
2081              * CALCULATE INTERACTIONS *
2082              **************************/
2083
2084             r22              = _mm256_mul_ps(rsq22,rinv22);
2085
2086             /* Calculate table index by multiplying r with table scale and truncate to integer */
2087             rt               = _mm256_mul_ps(r22,vftabscale);
2088             vfitab           = _mm256_cvttps_epi32(rt);
2089             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2090             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2091             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2092             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2093             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2094             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2095
2096             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2097             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2098                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2099             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2100                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2101             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2102                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2103             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2104                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2105             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2106             Heps             = _mm256_mul_ps(vfeps,H);
2107             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2108             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2109             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2110
2111             fscal            = felec;
2112
2113             /* Calculate temporary vectorial force */
2114             tx               = _mm256_mul_ps(fscal,dx22);
2115             ty               = _mm256_mul_ps(fscal,dy22);
2116             tz               = _mm256_mul_ps(fscal,dz22);
2117
2118             /* Update vectorial force */
2119             fix2             = _mm256_add_ps(fix2,tx);
2120             fiy2             = _mm256_add_ps(fiy2,ty);
2121             fiz2             = _mm256_add_ps(fiz2,tz);
2122
2123             fjx2             = _mm256_add_ps(fjx2,tx);
2124             fjy2             = _mm256_add_ps(fjy2,ty);
2125             fjz2             = _mm256_add_ps(fjz2,tz);
2126
2127             fjptrA             = f+j_coord_offsetA;
2128             fjptrB             = f+j_coord_offsetB;
2129             fjptrC             = f+j_coord_offsetC;
2130             fjptrD             = f+j_coord_offsetD;
2131             fjptrE             = f+j_coord_offsetE;
2132             fjptrF             = f+j_coord_offsetF;
2133             fjptrG             = f+j_coord_offsetG;
2134             fjptrH             = f+j_coord_offsetH;
2135
2136             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2137                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2138
2139             /* Inner loop uses 359 flops */
2140         }
2141
2142         if(jidx<j_index_end)
2143         {
2144
2145             /* Get j neighbor index, and coordinate index */
2146             jnrlistA         = jjnr[jidx];
2147             jnrlistB         = jjnr[jidx+1];
2148             jnrlistC         = jjnr[jidx+2];
2149             jnrlistD         = jjnr[jidx+3];
2150             jnrlistE         = jjnr[jidx+4];
2151             jnrlistF         = jjnr[jidx+5];
2152             jnrlistG         = jjnr[jidx+6];
2153             jnrlistH         = jjnr[jidx+7];
2154             /* Sign of each element will be negative for non-real atoms.
2155              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2156              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2157              */
2158             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2159                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2160                                             
2161             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2162             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2163             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2164             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2165             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
2166             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
2167             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
2168             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
2169             j_coord_offsetA  = DIM*jnrA;
2170             j_coord_offsetB  = DIM*jnrB;
2171             j_coord_offsetC  = DIM*jnrC;
2172             j_coord_offsetD  = DIM*jnrD;
2173             j_coord_offsetE  = DIM*jnrE;
2174             j_coord_offsetF  = DIM*jnrF;
2175             j_coord_offsetG  = DIM*jnrG;
2176             j_coord_offsetH  = DIM*jnrH;
2177
2178             /* load j atom coordinates */
2179             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2180                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2181                                                  x+j_coord_offsetE,x+j_coord_offsetF,
2182                                                  x+j_coord_offsetG,x+j_coord_offsetH,
2183                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2184
2185             /* Calculate displacement vector */
2186             dx00             = _mm256_sub_ps(ix0,jx0);
2187             dy00             = _mm256_sub_ps(iy0,jy0);
2188             dz00             = _mm256_sub_ps(iz0,jz0);
2189             dx01             = _mm256_sub_ps(ix0,jx1);
2190             dy01             = _mm256_sub_ps(iy0,jy1);
2191             dz01             = _mm256_sub_ps(iz0,jz1);
2192             dx02             = _mm256_sub_ps(ix0,jx2);
2193             dy02             = _mm256_sub_ps(iy0,jy2);
2194             dz02             = _mm256_sub_ps(iz0,jz2);
2195             dx10             = _mm256_sub_ps(ix1,jx0);
2196             dy10             = _mm256_sub_ps(iy1,jy0);
2197             dz10             = _mm256_sub_ps(iz1,jz0);
2198             dx11             = _mm256_sub_ps(ix1,jx1);
2199             dy11             = _mm256_sub_ps(iy1,jy1);
2200             dz11             = _mm256_sub_ps(iz1,jz1);
2201             dx12             = _mm256_sub_ps(ix1,jx2);
2202             dy12             = _mm256_sub_ps(iy1,jy2);
2203             dz12             = _mm256_sub_ps(iz1,jz2);
2204             dx20             = _mm256_sub_ps(ix2,jx0);
2205             dy20             = _mm256_sub_ps(iy2,jy0);
2206             dz20             = _mm256_sub_ps(iz2,jz0);
2207             dx21             = _mm256_sub_ps(ix2,jx1);
2208             dy21             = _mm256_sub_ps(iy2,jy1);
2209             dz21             = _mm256_sub_ps(iz2,jz1);
2210             dx22             = _mm256_sub_ps(ix2,jx2);
2211             dy22             = _mm256_sub_ps(iy2,jy2);
2212             dz22             = _mm256_sub_ps(iz2,jz2);
2213
2214             /* Calculate squared distance and things based on it */
2215             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2216             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2217             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2218             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2219             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2220             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2221             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2222             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2223             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2224
2225             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
2226             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
2227             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
2228             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
2229             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
2230             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
2231             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
2232             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
2233             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
2234
2235             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
2236
2237             fjx0             = _mm256_setzero_ps();
2238             fjy0             = _mm256_setzero_ps();
2239             fjz0             = _mm256_setzero_ps();
2240             fjx1             = _mm256_setzero_ps();
2241             fjy1             = _mm256_setzero_ps();
2242             fjz1             = _mm256_setzero_ps();
2243             fjx2             = _mm256_setzero_ps();
2244             fjy2             = _mm256_setzero_ps();
2245             fjz2             = _mm256_setzero_ps();
2246
2247             /**************************
2248              * CALCULATE INTERACTIONS *
2249              **************************/
2250
2251             r00              = _mm256_mul_ps(rsq00,rinv00);
2252             r00              = _mm256_andnot_ps(dummy_mask,r00);
2253
2254             /* Calculate table index by multiplying r with table scale and truncate to integer */
2255             rt               = _mm256_mul_ps(r00,vftabscale);
2256             vfitab           = _mm256_cvttps_epi32(rt);
2257             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2258             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2259             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2260             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2261             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2262             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2263
2264             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2265             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2266                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2267             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2268                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2269             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2270                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2271             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2272                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2273             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2274             Heps             = _mm256_mul_ps(vfeps,H);
2275             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2276             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2277             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
2278
2279             /* LENNARD-JONES DISPERSION/REPULSION */
2280
2281             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2282             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
2283
2284             fscal            = _mm256_add_ps(felec,fvdw);
2285
2286             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2287
2288             /* Calculate temporary vectorial force */
2289             tx               = _mm256_mul_ps(fscal,dx00);
2290             ty               = _mm256_mul_ps(fscal,dy00);
2291             tz               = _mm256_mul_ps(fscal,dz00);
2292
2293             /* Update vectorial force */
2294             fix0             = _mm256_add_ps(fix0,tx);
2295             fiy0             = _mm256_add_ps(fiy0,ty);
2296             fiz0             = _mm256_add_ps(fiz0,tz);
2297
2298             fjx0             = _mm256_add_ps(fjx0,tx);
2299             fjy0             = _mm256_add_ps(fjy0,ty);
2300             fjz0             = _mm256_add_ps(fjz0,tz);
2301
2302             /**************************
2303              * CALCULATE INTERACTIONS *
2304              **************************/
2305
2306             r01              = _mm256_mul_ps(rsq01,rinv01);
2307             r01              = _mm256_andnot_ps(dummy_mask,r01);
2308
2309             /* Calculate table index by multiplying r with table scale and truncate to integer */
2310             rt               = _mm256_mul_ps(r01,vftabscale);
2311             vfitab           = _mm256_cvttps_epi32(rt);
2312             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2313             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2314             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2315             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2316             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2317             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2318
2319             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2320             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2321                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2322             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2323                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2324             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2325                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2326             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2327                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2328             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2329             Heps             = _mm256_mul_ps(vfeps,H);
2330             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2331             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2332             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
2333
2334             fscal            = felec;
2335
2336             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2337
2338             /* Calculate temporary vectorial force */
2339             tx               = _mm256_mul_ps(fscal,dx01);
2340             ty               = _mm256_mul_ps(fscal,dy01);
2341             tz               = _mm256_mul_ps(fscal,dz01);
2342
2343             /* Update vectorial force */
2344             fix0             = _mm256_add_ps(fix0,tx);
2345             fiy0             = _mm256_add_ps(fiy0,ty);
2346             fiz0             = _mm256_add_ps(fiz0,tz);
2347
2348             fjx1             = _mm256_add_ps(fjx1,tx);
2349             fjy1             = _mm256_add_ps(fjy1,ty);
2350             fjz1             = _mm256_add_ps(fjz1,tz);
2351
2352             /**************************
2353              * CALCULATE INTERACTIONS *
2354              **************************/
2355
2356             r02              = _mm256_mul_ps(rsq02,rinv02);
2357             r02              = _mm256_andnot_ps(dummy_mask,r02);
2358
2359             /* Calculate table index by multiplying r with table scale and truncate to integer */
2360             rt               = _mm256_mul_ps(r02,vftabscale);
2361             vfitab           = _mm256_cvttps_epi32(rt);
2362             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2363             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2364             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2365             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2366             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2367             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2368
2369             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2370             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2371                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2372             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2373                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2374             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2375                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2376             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2377                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2378             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2379             Heps             = _mm256_mul_ps(vfeps,H);
2380             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2381             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2382             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
2383
2384             fscal            = felec;
2385
2386             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2387
2388             /* Calculate temporary vectorial force */
2389             tx               = _mm256_mul_ps(fscal,dx02);
2390             ty               = _mm256_mul_ps(fscal,dy02);
2391             tz               = _mm256_mul_ps(fscal,dz02);
2392
2393             /* Update vectorial force */
2394             fix0             = _mm256_add_ps(fix0,tx);
2395             fiy0             = _mm256_add_ps(fiy0,ty);
2396             fiz0             = _mm256_add_ps(fiz0,tz);
2397
2398             fjx2             = _mm256_add_ps(fjx2,tx);
2399             fjy2             = _mm256_add_ps(fjy2,ty);
2400             fjz2             = _mm256_add_ps(fjz2,tz);
2401
2402             /**************************
2403              * CALCULATE INTERACTIONS *
2404              **************************/
2405
2406             r10              = _mm256_mul_ps(rsq10,rinv10);
2407             r10              = _mm256_andnot_ps(dummy_mask,r10);
2408
2409             /* Calculate table index by multiplying r with table scale and truncate to integer */
2410             rt               = _mm256_mul_ps(r10,vftabscale);
2411             vfitab           = _mm256_cvttps_epi32(rt);
2412             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2413             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2414             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2415             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2416             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2417             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2418
2419             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2420             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2421                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2422             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2423                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2424             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2425                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2426             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2427                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2428             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2429             Heps             = _mm256_mul_ps(vfeps,H);
2430             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2431             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2432             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
2433
2434             fscal            = felec;
2435
2436             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2437
2438             /* Calculate temporary vectorial force */
2439             tx               = _mm256_mul_ps(fscal,dx10);
2440             ty               = _mm256_mul_ps(fscal,dy10);
2441             tz               = _mm256_mul_ps(fscal,dz10);
2442
2443             /* Update vectorial force */
2444             fix1             = _mm256_add_ps(fix1,tx);
2445             fiy1             = _mm256_add_ps(fiy1,ty);
2446             fiz1             = _mm256_add_ps(fiz1,tz);
2447
2448             fjx0             = _mm256_add_ps(fjx0,tx);
2449             fjy0             = _mm256_add_ps(fjy0,ty);
2450             fjz0             = _mm256_add_ps(fjz0,tz);
2451
2452             /**************************
2453              * CALCULATE INTERACTIONS *
2454              **************************/
2455
2456             r11              = _mm256_mul_ps(rsq11,rinv11);
2457             r11              = _mm256_andnot_ps(dummy_mask,r11);
2458
2459             /* Calculate table index by multiplying r with table scale and truncate to integer */
2460             rt               = _mm256_mul_ps(r11,vftabscale);
2461             vfitab           = _mm256_cvttps_epi32(rt);
2462             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2463             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2464             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2465             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2466             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2467             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2468
2469             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2470             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2471                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2472             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2473                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2474             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2475                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2476             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2477                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2478             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2479             Heps             = _mm256_mul_ps(vfeps,H);
2480             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2481             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2482             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2483
2484             fscal            = felec;
2485
2486             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2487
2488             /* Calculate temporary vectorial force */
2489             tx               = _mm256_mul_ps(fscal,dx11);
2490             ty               = _mm256_mul_ps(fscal,dy11);
2491             tz               = _mm256_mul_ps(fscal,dz11);
2492
2493             /* Update vectorial force */
2494             fix1             = _mm256_add_ps(fix1,tx);
2495             fiy1             = _mm256_add_ps(fiy1,ty);
2496             fiz1             = _mm256_add_ps(fiz1,tz);
2497
2498             fjx1             = _mm256_add_ps(fjx1,tx);
2499             fjy1             = _mm256_add_ps(fjy1,ty);
2500             fjz1             = _mm256_add_ps(fjz1,tz);
2501
2502             /**************************
2503              * CALCULATE INTERACTIONS *
2504              **************************/
2505
2506             r12              = _mm256_mul_ps(rsq12,rinv12);
2507             r12              = _mm256_andnot_ps(dummy_mask,r12);
2508
2509             /* Calculate table index by multiplying r with table scale and truncate to integer */
2510             rt               = _mm256_mul_ps(r12,vftabscale);
2511             vfitab           = _mm256_cvttps_epi32(rt);
2512             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2513             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2514             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2515             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2516             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2517             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2518
2519             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2520             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2521                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2522             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2523                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2524             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2525                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2526             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2527                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2528             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2529             Heps             = _mm256_mul_ps(vfeps,H);
2530             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2531             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2532             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2533
2534             fscal            = felec;
2535
2536             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2537
2538             /* Calculate temporary vectorial force */
2539             tx               = _mm256_mul_ps(fscal,dx12);
2540             ty               = _mm256_mul_ps(fscal,dy12);
2541             tz               = _mm256_mul_ps(fscal,dz12);
2542
2543             /* Update vectorial force */
2544             fix1             = _mm256_add_ps(fix1,tx);
2545             fiy1             = _mm256_add_ps(fiy1,ty);
2546             fiz1             = _mm256_add_ps(fiz1,tz);
2547
2548             fjx2             = _mm256_add_ps(fjx2,tx);
2549             fjy2             = _mm256_add_ps(fjy2,ty);
2550             fjz2             = _mm256_add_ps(fjz2,tz);
2551
2552             /**************************
2553              * CALCULATE INTERACTIONS *
2554              **************************/
2555
2556             r20              = _mm256_mul_ps(rsq20,rinv20);
2557             r20              = _mm256_andnot_ps(dummy_mask,r20);
2558
2559             /* Calculate table index by multiplying r with table scale and truncate to integer */
2560             rt               = _mm256_mul_ps(r20,vftabscale);
2561             vfitab           = _mm256_cvttps_epi32(rt);
2562             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2563             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2564             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2565             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2566             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2567             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2568
2569             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2570             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2571                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2572             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2573                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2574             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2575                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2576             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2577                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2578             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2579             Heps             = _mm256_mul_ps(vfeps,H);
2580             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2581             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2582             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2583
2584             fscal            = felec;
2585
2586             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2587
2588             /* Calculate temporary vectorial force */
2589             tx               = _mm256_mul_ps(fscal,dx20);
2590             ty               = _mm256_mul_ps(fscal,dy20);
2591             tz               = _mm256_mul_ps(fscal,dz20);
2592
2593             /* Update vectorial force */
2594             fix2             = _mm256_add_ps(fix2,tx);
2595             fiy2             = _mm256_add_ps(fiy2,ty);
2596             fiz2             = _mm256_add_ps(fiz2,tz);
2597
2598             fjx0             = _mm256_add_ps(fjx0,tx);
2599             fjy0             = _mm256_add_ps(fjy0,ty);
2600             fjz0             = _mm256_add_ps(fjz0,tz);
2601
2602             /**************************
2603              * CALCULATE INTERACTIONS *
2604              **************************/
2605
2606             r21              = _mm256_mul_ps(rsq21,rinv21);
2607             r21              = _mm256_andnot_ps(dummy_mask,r21);
2608
2609             /* Calculate table index by multiplying r with table scale and truncate to integer */
2610             rt               = _mm256_mul_ps(r21,vftabscale);
2611             vfitab           = _mm256_cvttps_epi32(rt);
2612             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2613             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2614             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2615             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2616             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2617             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2618
2619             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2620             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2621                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2622             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2623                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2624             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2625                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2626             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2627                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2628             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2629             Heps             = _mm256_mul_ps(vfeps,H);
2630             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2631             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2632             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2633
2634             fscal            = felec;
2635
2636             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2637
2638             /* Calculate temporary vectorial force */
2639             tx               = _mm256_mul_ps(fscal,dx21);
2640             ty               = _mm256_mul_ps(fscal,dy21);
2641             tz               = _mm256_mul_ps(fscal,dz21);
2642
2643             /* Update vectorial force */
2644             fix2             = _mm256_add_ps(fix2,tx);
2645             fiy2             = _mm256_add_ps(fiy2,ty);
2646             fiz2             = _mm256_add_ps(fiz2,tz);
2647
2648             fjx1             = _mm256_add_ps(fjx1,tx);
2649             fjy1             = _mm256_add_ps(fjy1,ty);
2650             fjz1             = _mm256_add_ps(fjz1,tz);
2651
2652             /**************************
2653              * CALCULATE INTERACTIONS *
2654              **************************/
2655
2656             r22              = _mm256_mul_ps(rsq22,rinv22);
2657             r22              = _mm256_andnot_ps(dummy_mask,r22);
2658
2659             /* Calculate table index by multiplying r with table scale and truncate to integer */
2660             rt               = _mm256_mul_ps(r22,vftabscale);
2661             vfitab           = _mm256_cvttps_epi32(rt);
2662             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2663             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2664             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2665             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2666             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2667             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2668
2669             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2670             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2671                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2672             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2673                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2674             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2675                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2676             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2677                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2678             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2679             Heps             = _mm256_mul_ps(vfeps,H);
2680             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2681             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2682             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2683
2684             fscal            = felec;
2685
2686             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2687
2688             /* Calculate temporary vectorial force */
2689             tx               = _mm256_mul_ps(fscal,dx22);
2690             ty               = _mm256_mul_ps(fscal,dy22);
2691             tz               = _mm256_mul_ps(fscal,dz22);
2692
2693             /* Update vectorial force */
2694             fix2             = _mm256_add_ps(fix2,tx);
2695             fiy2             = _mm256_add_ps(fiy2,ty);
2696             fiz2             = _mm256_add_ps(fiz2,tz);
2697
2698             fjx2             = _mm256_add_ps(fjx2,tx);
2699             fjy2             = _mm256_add_ps(fjy2,ty);
2700             fjz2             = _mm256_add_ps(fjz2,tz);
2701
2702             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2703             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2704             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2705             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2706             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2707             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2708             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2709             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2710
2711             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2712                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2713
2714             /* Inner loop uses 368 flops */
2715         }
2716
2717         /* End of innermost loop */
2718
2719         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2720                                                  f+i_coord_offset,fshift+i_shift_offset);
2721
2722         /* Increment number of inner iterations */
2723         inneriter                  += j_index_end - j_index_start;
2724
2725         /* Outer loop uses 18 flops */
2726     }
2727
2728     /* Increment number of outer iterations */
2729     outeriter        += nri;
2730
2731     /* Update outer/inner flops */
2732
2733     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);
2734 }