Use full path for legacyheaders
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
49
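/*
 * Illustrative scalar sketch (not produced by the kernel generator): the cubic
 * spline table evaluation that the AVX intrinsics below implement, written out
 * for a single pair distance. The helper name is hypothetical; the layout of
 * 8 reals per table point (Y,F,G,H for dispersion followed by Y,F,G,H for
 * repulsion) is inferred from the left-shift by 3 and the "+4" repulsion offset
 * used in the kernels. gmx_unused only silences unused-function warnings.
 */
static gmx_unused void
scalar_cstab_vdw(const real *vftab, real tabscale, real r, real c6, real c12,
                 real *vvdw, real *fvdw)
{
    real        rt  = r*tabscale;
    int         n   = (int)rt;        /* table interval index (r >= 0, so truncation == floor) */
    real        eps = rt - (real)n;   /* fractional position within the interval */
    const real *p   = vftab + 8*n;    /* dispersion quadruplet Y,F,G,H */
    real        Fp, VV, FF, vvdw6, fvdw6, vvdw12, fvdw12;

    Fp     = p[1] + eps*(p[2] + eps*p[3]);
    VV     = p[0] + eps*Fp;                   /* spline value            */
    FF     = Fp   + eps*(p[2] + 2*eps*p[3]);  /* derivative d(VV)/d(eps) */
    vvdw6  = c6*VV;
    fvdw6  = c6*FF;

    p      = vftab + 8*n + 4;                 /* repulsion quadruplet Y,F,G,H */
    Fp     = p[1] + eps*(p[2] + eps*p[3]);
    VV     = p[0] + eps*Fp;
    FF     = Fp   + eps*(p[2] + 2*eps*p[3]);
    vvdw12 = c12*VV;
    fvdw12 = c12*FF;

    *vvdw  = vvdw6 + vvdw12;
    /* Convert d/d(eps) to d/dr (factor tabscale), negate to obtain the force,
     * and divide by r so the result can scale the displacement vector directly.
     */
    *fvdw  = -(fvdw6 + fvdw12)*tabscale/r;
}
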
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_256_single
52  * Electrostatics interaction: Coulomb
53  * VdW interaction:            CubicSplineTable
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_256_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. the eight different
70      * jnr indices corresponding to data put in the eight positions of the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrE,jnrF,jnrG,jnrH;
76     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
81     real             rcutoff_scalar;
82     real             *shiftvec,*fshift,*x,*f;
83     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
84     real             scratch[4*DIM];
85     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86     real *           vdwioffsetptr0;
87     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88     real *           vdwioffsetptr1;
89     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
93     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
95     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
97     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
108     real             *charge;
109     int              nvdwtype;
110     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111     int              *vdwtype;
112     real             *vdwparam;
113     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
114     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
115     __m256i          vfitab;
116     __m128i          vfitab_lo,vfitab_hi;
117     __m128i          ifour       = _mm_set1_epi32(4);
118     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
119     real             *vftab;
120     __m256           dummy_mask,cutoff_mask;
121     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
122     __m256           one     = _mm256_set1_ps(1.0);
123     __m256           two     = _mm256_set1_ps(2.0);
124     x                = xx[0];
125     f                = ff[0];
126
127     nri              = nlist->nri;
128     iinr             = nlist->iinr;
129     jindex           = nlist->jindex;
130     jjnr             = nlist->jjnr;
131     shiftidx         = nlist->shift;
132     gid              = nlist->gid;
133     shiftvec         = fr->shift_vec[0];
134     fshift           = fr->fshift[0];
135     facel            = _mm256_set1_ps(fr->epsfac);
136     charge           = mdatoms->chargeA;
137     nvdwtype         = fr->ntype;
138     vdwparam         = fr->nbfp;
139     vdwtype          = mdatoms->typeA;
140
141     vftab            = kernel_data->table_vdw->data;
142     vftabscale       = _mm256_set1_ps(kernel_data->table_vdw->scale);
143
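    /* The cubic-spline VdW table stores 8 reals per table point: Y,F,G,H for the
     * dispersion spline followed by Y,F,G,H for the repulsion spline. This is why
     * the table indices below are shifted left by 3 (multiplied by 8) and why the
     * repulsion lookup adds the constant 4 held in "ifour".
     */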
144     /* Setup water-specific parameters */
145     inr              = nlist->iinr[0];
146     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
147     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
148     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
149     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
150
151     jq0              = _mm256_set1_ps(charge[inr+0]);
152     jq1              = _mm256_set1_ps(charge[inr+1]);
153     jq2              = _mm256_set1_ps(charge[inr+2]);
154     vdwjidx0A        = 2*vdwtype[inr+0];
155     qq00             = _mm256_mul_ps(iq0,jq0);
156     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
157     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
158     qq01             = _mm256_mul_ps(iq0,jq1);
159     qq02             = _mm256_mul_ps(iq0,jq2);
160     qq10             = _mm256_mul_ps(iq1,jq0);
161     qq11             = _mm256_mul_ps(iq1,jq1);
162     qq12             = _mm256_mul_ps(iq1,jq2);
163     qq20             = _mm256_mul_ps(iq2,jq0);
164     qq21             = _mm256_mul_ps(iq2,jq1);
165     qq22             = _mm256_mul_ps(iq2,jq2);
166
167     /* Avoid spurious compiler warnings about possibly uninitialized variables */
168     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
169     j_coord_offsetA = 0;
170     j_coord_offsetB = 0;
171     j_coord_offsetC = 0;
172     j_coord_offsetD = 0;
173     j_coord_offsetE = 0;
174     j_coord_offsetF = 0;
175     j_coord_offsetG = 0;
176     j_coord_offsetH = 0;
177
178     outeriter        = 0;
179     inneriter        = 0;
180
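    /* scratch is used as a harmless dump target for the j-force contributions of
     * padding entries in the final, partially filled SIMD iteration (see the
     * fjptr selection in the epilogue loop below).
     */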
181     for(iidx=0;iidx<4*DIM;iidx++)
182     {
183         scratch[iidx] = 0.0;
184     }
185
186     /* Start outer loop over neighborlists */
187     for(iidx=0; iidx<nri; iidx++)
188     {
189         /* Load shift vector for this list */
190         i_shift_offset   = DIM*shiftidx[iidx];
191
192         /* Load limits for loop over neighbors */
193         j_index_start    = jindex[iidx];
194         j_index_end      = jindex[iidx+1];
195
196         /* Get outer coordinate index */
197         inr              = iinr[iidx];
198         i_coord_offset   = DIM*inr;
199
200         /* Load i particle coords and add shift vector */
201         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
202                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
203
204         fix0             = _mm256_setzero_ps();
205         fiy0             = _mm256_setzero_ps();
206         fiz0             = _mm256_setzero_ps();
207         fix1             = _mm256_setzero_ps();
208         fiy1             = _mm256_setzero_ps();
209         fiz1             = _mm256_setzero_ps();
210         fix2             = _mm256_setzero_ps();
211         fiy2             = _mm256_setzero_ps();
212         fiz2             = _mm256_setzero_ps();
213
214         /* Reset potential sums */
215         velecsum         = _mm256_setzero_ps();
216         vvdwsum          = _mm256_setzero_ps();
217
218         /* Start inner kernel loop */
219         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
220         {
221
222             /* Get j neighbor index, and coordinate index */
223             jnrA             = jjnr[jidx];
224             jnrB             = jjnr[jidx+1];
225             jnrC             = jjnr[jidx+2];
226             jnrD             = jjnr[jidx+3];
227             jnrE             = jjnr[jidx+4];
228             jnrF             = jjnr[jidx+5];
229             jnrG             = jjnr[jidx+6];
230             jnrH             = jjnr[jidx+7];
231             j_coord_offsetA  = DIM*jnrA;
232             j_coord_offsetB  = DIM*jnrB;
233             j_coord_offsetC  = DIM*jnrC;
234             j_coord_offsetD  = DIM*jnrD;
235             j_coord_offsetE  = DIM*jnrE;
236             j_coord_offsetF  = DIM*jnrF;
237             j_coord_offsetG  = DIM*jnrG;
238             j_coord_offsetH  = DIM*jnrH;
239
240             /* load j atom coordinates */
241             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
242                                                  x+j_coord_offsetC,x+j_coord_offsetD,
243                                                  x+j_coord_offsetE,x+j_coord_offsetF,
244                                                  x+j_coord_offsetG,x+j_coord_offsetH,
245                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
246
247             /* Calculate displacement vector */
248             dx00             = _mm256_sub_ps(ix0,jx0);
249             dy00             = _mm256_sub_ps(iy0,jy0);
250             dz00             = _mm256_sub_ps(iz0,jz0);
251             dx01             = _mm256_sub_ps(ix0,jx1);
252             dy01             = _mm256_sub_ps(iy0,jy1);
253             dz01             = _mm256_sub_ps(iz0,jz1);
254             dx02             = _mm256_sub_ps(ix0,jx2);
255             dy02             = _mm256_sub_ps(iy0,jy2);
256             dz02             = _mm256_sub_ps(iz0,jz2);
257             dx10             = _mm256_sub_ps(ix1,jx0);
258             dy10             = _mm256_sub_ps(iy1,jy0);
259             dz10             = _mm256_sub_ps(iz1,jz0);
260             dx11             = _mm256_sub_ps(ix1,jx1);
261             dy11             = _mm256_sub_ps(iy1,jy1);
262             dz11             = _mm256_sub_ps(iz1,jz1);
263             dx12             = _mm256_sub_ps(ix1,jx2);
264             dy12             = _mm256_sub_ps(iy1,jy2);
265             dz12             = _mm256_sub_ps(iz1,jz2);
266             dx20             = _mm256_sub_ps(ix2,jx0);
267             dy20             = _mm256_sub_ps(iy2,jy0);
268             dz20             = _mm256_sub_ps(iz2,jz0);
269             dx21             = _mm256_sub_ps(ix2,jx1);
270             dy21             = _mm256_sub_ps(iy2,jy1);
271             dz21             = _mm256_sub_ps(iz2,jz1);
272             dx22             = _mm256_sub_ps(ix2,jx2);
273             dy22             = _mm256_sub_ps(iy2,jy2);
274             dz22             = _mm256_sub_ps(iz2,jz2);
275
276             /* Calculate squared distance and things based on it */
277             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
278             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
279             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
280             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
281             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
282             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
283             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
284             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
285             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
286
287             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
288             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
289             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
290             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
291             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
292             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
293             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
294             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
295             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
296
297             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
298             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
299             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
300             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
301             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
302             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
303             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
304             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
305             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
306
307             fjx0             = _mm256_setzero_ps();
308             fjy0             = _mm256_setzero_ps();
309             fjz0             = _mm256_setzero_ps();
310             fjx1             = _mm256_setzero_ps();
311             fjy1             = _mm256_setzero_ps();
312             fjz1             = _mm256_setzero_ps();
313             fjx2             = _mm256_setzero_ps();
314             fjy2             = _mm256_setzero_ps();
315             fjz2             = _mm256_setzero_ps();
316
317             /**************************
318              * CALCULATE INTERACTIONS *
319              **************************/
320
321             r00              = _mm256_mul_ps(rsq00,rinv00);
322
323             /* Calculate table index by multiplying r with table scale and truncate to integer */
324             rt               = _mm256_mul_ps(r00,vftabscale);
325             vfitab           = _mm256_cvttps_epi32(rt);
326             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
327             /* AVX1 does not support 256-bit integer operations, so we switch to two 128-bit halves here */
328             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
329             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
330             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
331             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
332
333             /* COULOMB ELECTROSTATICS */
334             velec            = _mm256_mul_ps(qq00,rinv00);
335             felec            = _mm256_mul_ps(velec,rinvsq00);
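            /* Scalar equivalent: velec = qq/r (the electrostatics prefactor is already
             * folded into qq) and felec = velec/r^2, i.e. the force magnitude divided
             * by r, so that multiplying by dx,dy,dz gives the Cartesian force components.
             */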
336
337             /* CUBIC SPLINE TABLE DISPERSION */
338             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
339                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
340             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
341                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
342             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
343                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
344             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
345                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
346             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
347             Heps             = _mm256_mul_ps(vfeps,H);
348             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
349             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
350             vvdw6            = _mm256_mul_ps(c6_00,VV);
351             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
352             fvdw6            = _mm256_mul_ps(c6_00,FF);
353
354             /* CUBIC SPLINE TABLE REPULSION */
355             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
356             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
357             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
358                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
359             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
360                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
361             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
362                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
363             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
364                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
365             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
366             Heps             = _mm256_mul_ps(vfeps,H);
367             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
368             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
369             vvdw12           = _mm256_mul_ps(c12_00,VV);
370             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
371             fvdw12           = _mm256_mul_ps(c12_00,FF);
372             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
373             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
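            /* FF is d(VV)/d(eps); multiplying by vftabscale converts it to d(VV)/dr,
             * the XOR with signbit flips the sign to give the force -dV/dr, and the
             * extra factor rinv00 turns it into force/r for use with the displacement
             * vector below.
             */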
374
375             /* Update potential sum for this i atom from the interaction with this j atom. */
376             velecsum         = _mm256_add_ps(velecsum,velec);
377             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
378
379             fscal            = _mm256_add_ps(felec,fvdw);
380
381             /* Calculate temporary vectorial force */
382             tx               = _mm256_mul_ps(fscal,dx00);
383             ty               = _mm256_mul_ps(fscal,dy00);
384             tz               = _mm256_mul_ps(fscal,dz00);
385
386             /* Update vectorial force */
387             fix0             = _mm256_add_ps(fix0,tx);
388             fiy0             = _mm256_add_ps(fiy0,ty);
389             fiz0             = _mm256_add_ps(fiz0,tz);
390
391             fjx0             = _mm256_add_ps(fjx0,tx);
392             fjy0             = _mm256_add_ps(fjy0,ty);
393             fjz0             = _mm256_add_ps(fjz0,tz);
394
395             /**************************
396              * CALCULATE INTERACTIONS *
397              **************************/
398
399             /* COULOMB ELECTROSTATICS */
400             velec            = _mm256_mul_ps(qq01,rinv01);
401             felec            = _mm256_mul_ps(velec,rinvsq01);
402
403             /* Update potential sum for this i atom from the interaction with this j atom. */
404             velecsum         = _mm256_add_ps(velecsum,velec);
405
406             fscal            = felec;
407
408             /* Calculate temporary vectorial force */
409             tx               = _mm256_mul_ps(fscal,dx01);
410             ty               = _mm256_mul_ps(fscal,dy01);
411             tz               = _mm256_mul_ps(fscal,dz01);
412
413             /* Update vectorial force */
414             fix0             = _mm256_add_ps(fix0,tx);
415             fiy0             = _mm256_add_ps(fiy0,ty);
416             fiz0             = _mm256_add_ps(fiz0,tz);
417
418             fjx1             = _mm256_add_ps(fjx1,tx);
419             fjy1             = _mm256_add_ps(fjy1,ty);
420             fjz1             = _mm256_add_ps(fjz1,tz);
421
422             /**************************
423              * CALCULATE INTERACTIONS *
424              **************************/
425
426             /* COULOMB ELECTROSTATICS */
427             velec            = _mm256_mul_ps(qq02,rinv02);
428             felec            = _mm256_mul_ps(velec,rinvsq02);
429
430             /* Update potential sum for this i atom from the interaction with this j atom. */
431             velecsum         = _mm256_add_ps(velecsum,velec);
432
433             fscal            = felec;
434
435             /* Calculate temporary vectorial force */
436             tx               = _mm256_mul_ps(fscal,dx02);
437             ty               = _mm256_mul_ps(fscal,dy02);
438             tz               = _mm256_mul_ps(fscal,dz02);
439
440             /* Update vectorial force */
441             fix0             = _mm256_add_ps(fix0,tx);
442             fiy0             = _mm256_add_ps(fiy0,ty);
443             fiz0             = _mm256_add_ps(fiz0,tz);
444
445             fjx2             = _mm256_add_ps(fjx2,tx);
446             fjy2             = _mm256_add_ps(fjy2,ty);
447             fjz2             = _mm256_add_ps(fjz2,tz);
448
449             /**************************
450              * CALCULATE INTERACTIONS *
451              **************************/
452
453             /* COULOMB ELECTROSTATICS */
454             velec            = _mm256_mul_ps(qq10,rinv10);
455             felec            = _mm256_mul_ps(velec,rinvsq10);
456
457             /* Update potential sum for this i atom from the interaction with this j atom. */
458             velecsum         = _mm256_add_ps(velecsum,velec);
459
460             fscal            = felec;
461
462             /* Calculate temporary vectorial force */
463             tx               = _mm256_mul_ps(fscal,dx10);
464             ty               = _mm256_mul_ps(fscal,dy10);
465             tz               = _mm256_mul_ps(fscal,dz10);
466
467             /* Update vectorial force */
468             fix1             = _mm256_add_ps(fix1,tx);
469             fiy1             = _mm256_add_ps(fiy1,ty);
470             fiz1             = _mm256_add_ps(fiz1,tz);
471
472             fjx0             = _mm256_add_ps(fjx0,tx);
473             fjy0             = _mm256_add_ps(fjy0,ty);
474             fjz0             = _mm256_add_ps(fjz0,tz);
475
476             /**************************
477              * CALCULATE INTERACTIONS *
478              **************************/
479
480             /* COULOMB ELECTROSTATICS */
481             velec            = _mm256_mul_ps(qq11,rinv11);
482             felec            = _mm256_mul_ps(velec,rinvsq11);
483
484             /* Update potential sum for this i atom from the interaction with this j atom. */
485             velecsum         = _mm256_add_ps(velecsum,velec);
486
487             fscal            = felec;
488
489             /* Calculate temporary vectorial force */
490             tx               = _mm256_mul_ps(fscal,dx11);
491             ty               = _mm256_mul_ps(fscal,dy11);
492             tz               = _mm256_mul_ps(fscal,dz11);
493
494             /* Update vectorial force */
495             fix1             = _mm256_add_ps(fix1,tx);
496             fiy1             = _mm256_add_ps(fiy1,ty);
497             fiz1             = _mm256_add_ps(fiz1,tz);
498
499             fjx1             = _mm256_add_ps(fjx1,tx);
500             fjy1             = _mm256_add_ps(fjy1,ty);
501             fjz1             = _mm256_add_ps(fjz1,tz);
502
503             /**************************
504              * CALCULATE INTERACTIONS *
505              **************************/
506
507             /* COULOMB ELECTROSTATICS */
508             velec            = _mm256_mul_ps(qq12,rinv12);
509             felec            = _mm256_mul_ps(velec,rinvsq12);
510
511             /* Update potential sum for this i atom from the interaction with this j atom. */
512             velecsum         = _mm256_add_ps(velecsum,velec);
513
514             fscal            = felec;
515
516             /* Calculate temporary vectorial force */
517             tx               = _mm256_mul_ps(fscal,dx12);
518             ty               = _mm256_mul_ps(fscal,dy12);
519             tz               = _mm256_mul_ps(fscal,dz12);
520
521             /* Update vectorial force */
522             fix1             = _mm256_add_ps(fix1,tx);
523             fiy1             = _mm256_add_ps(fiy1,ty);
524             fiz1             = _mm256_add_ps(fiz1,tz);
525
526             fjx2             = _mm256_add_ps(fjx2,tx);
527             fjy2             = _mm256_add_ps(fjy2,ty);
528             fjz2             = _mm256_add_ps(fjz2,tz);
529
530             /**************************
531              * CALCULATE INTERACTIONS *
532              **************************/
533
534             /* COULOMB ELECTROSTATICS */
535             velec            = _mm256_mul_ps(qq20,rinv20);
536             felec            = _mm256_mul_ps(velec,rinvsq20);
537
538             /* Update potential sum for this i atom from the interaction with this j atom. */
539             velecsum         = _mm256_add_ps(velecsum,velec);
540
541             fscal            = felec;
542
543             /* Calculate temporary vectorial force */
544             tx               = _mm256_mul_ps(fscal,dx20);
545             ty               = _mm256_mul_ps(fscal,dy20);
546             tz               = _mm256_mul_ps(fscal,dz20);
547
548             /* Update vectorial force */
549             fix2             = _mm256_add_ps(fix2,tx);
550             fiy2             = _mm256_add_ps(fiy2,ty);
551             fiz2             = _mm256_add_ps(fiz2,tz);
552
553             fjx0             = _mm256_add_ps(fjx0,tx);
554             fjy0             = _mm256_add_ps(fjy0,ty);
555             fjz0             = _mm256_add_ps(fjz0,tz);
556
557             /**************************
558              * CALCULATE INTERACTIONS *
559              **************************/
560
561             /* COULOMB ELECTROSTATICS */
562             velec            = _mm256_mul_ps(qq21,rinv21);
563             felec            = _mm256_mul_ps(velec,rinvsq21);
564
565             /* Update potential sum for this i atom from the interaction with this j atom. */
566             velecsum         = _mm256_add_ps(velecsum,velec);
567
568             fscal            = felec;
569
570             /* Calculate temporary vectorial force */
571             tx               = _mm256_mul_ps(fscal,dx21);
572             ty               = _mm256_mul_ps(fscal,dy21);
573             tz               = _mm256_mul_ps(fscal,dz21);
574
575             /* Update vectorial force */
576             fix2             = _mm256_add_ps(fix2,tx);
577             fiy2             = _mm256_add_ps(fiy2,ty);
578             fiz2             = _mm256_add_ps(fiz2,tz);
579
580             fjx1             = _mm256_add_ps(fjx1,tx);
581             fjy1             = _mm256_add_ps(fjy1,ty);
582             fjz1             = _mm256_add_ps(fjz1,tz);
583
584             /**************************
585              * CALCULATE INTERACTIONS *
586              **************************/
587
588             /* COULOMB ELECTROSTATICS */
589             velec            = _mm256_mul_ps(qq22,rinv22);
590             felec            = _mm256_mul_ps(velec,rinvsq22);
591
592             /* Update potential sum for this i atom from the interaction with this j atom. */
593             velecsum         = _mm256_add_ps(velecsum,velec);
594
595             fscal            = felec;
596
597             /* Calculate temporary vectorial force */
598             tx               = _mm256_mul_ps(fscal,dx22);
599             ty               = _mm256_mul_ps(fscal,dy22);
600             tz               = _mm256_mul_ps(fscal,dz22);
601
602             /* Update vectorial force */
603             fix2             = _mm256_add_ps(fix2,tx);
604             fiy2             = _mm256_add_ps(fiy2,ty);
605             fiz2             = _mm256_add_ps(fiz2,tz);
606
607             fjx2             = _mm256_add_ps(fjx2,tx);
608             fjy2             = _mm256_add_ps(fjy2,ty);
609             fjz2             = _mm256_add_ps(fjz2,tz);
610
611             fjptrA             = f+j_coord_offsetA;
612             fjptrB             = f+j_coord_offsetB;
613             fjptrC             = f+j_coord_offsetC;
614             fjptrD             = f+j_coord_offsetD;
615             fjptrE             = f+j_coord_offsetE;
616             fjptrF             = f+j_coord_offsetF;
617             fjptrG             = f+j_coord_offsetG;
618             fjptrH             = f+j_coord_offsetH;
619
620             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
621                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
622
623             /* Inner loop uses 278 flops */
624         }
625
626         if(jidx<j_index_end)
627         {
628
629             /* Get j neighbor index, and coordinate index */
630             jnrlistA         = jjnr[jidx];
631             jnrlistB         = jjnr[jidx+1];
632             jnrlistC         = jjnr[jidx+2];
633             jnrlistD         = jjnr[jidx+3];
634             jnrlistE         = jjnr[jidx+4];
635             jnrlistF         = jjnr[jidx+5];
636             jnrlistG         = jjnr[jidx+6];
637             jnrlistH         = jjnr[jidx+7];
638             /* Sign of each element will be negative for non-real atoms.
639              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
640              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
641              */
642             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
643                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
644                                             
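            /* With this mask the padded lanes (jnr < 0) are handled entirely in SIMD:
             * their r is cleared before the table lookup so the index stays in range,
             * their energy and force contributions are zeroed with andnot operations,
             * and their j forces are redirected to the scratch buffer further below.
             */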
645             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
646             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
647             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
648             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
649             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
650             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
651             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
652             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
653             j_coord_offsetA  = DIM*jnrA;
654             j_coord_offsetB  = DIM*jnrB;
655             j_coord_offsetC  = DIM*jnrC;
656             j_coord_offsetD  = DIM*jnrD;
657             j_coord_offsetE  = DIM*jnrE;
658             j_coord_offsetF  = DIM*jnrF;
659             j_coord_offsetG  = DIM*jnrG;
660             j_coord_offsetH  = DIM*jnrH;
661
662             /* load j atom coordinates */
663             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
664                                                  x+j_coord_offsetC,x+j_coord_offsetD,
665                                                  x+j_coord_offsetE,x+j_coord_offsetF,
666                                                  x+j_coord_offsetG,x+j_coord_offsetH,
667                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
668
669             /* Calculate displacement vector */
670             dx00             = _mm256_sub_ps(ix0,jx0);
671             dy00             = _mm256_sub_ps(iy0,jy0);
672             dz00             = _mm256_sub_ps(iz0,jz0);
673             dx01             = _mm256_sub_ps(ix0,jx1);
674             dy01             = _mm256_sub_ps(iy0,jy1);
675             dz01             = _mm256_sub_ps(iz0,jz1);
676             dx02             = _mm256_sub_ps(ix0,jx2);
677             dy02             = _mm256_sub_ps(iy0,jy2);
678             dz02             = _mm256_sub_ps(iz0,jz2);
679             dx10             = _mm256_sub_ps(ix1,jx0);
680             dy10             = _mm256_sub_ps(iy1,jy0);
681             dz10             = _mm256_sub_ps(iz1,jz0);
682             dx11             = _mm256_sub_ps(ix1,jx1);
683             dy11             = _mm256_sub_ps(iy1,jy1);
684             dz11             = _mm256_sub_ps(iz1,jz1);
685             dx12             = _mm256_sub_ps(ix1,jx2);
686             dy12             = _mm256_sub_ps(iy1,jy2);
687             dz12             = _mm256_sub_ps(iz1,jz2);
688             dx20             = _mm256_sub_ps(ix2,jx0);
689             dy20             = _mm256_sub_ps(iy2,jy0);
690             dz20             = _mm256_sub_ps(iz2,jz0);
691             dx21             = _mm256_sub_ps(ix2,jx1);
692             dy21             = _mm256_sub_ps(iy2,jy1);
693             dz21             = _mm256_sub_ps(iz2,jz1);
694             dx22             = _mm256_sub_ps(ix2,jx2);
695             dy22             = _mm256_sub_ps(iy2,jy2);
696             dz22             = _mm256_sub_ps(iz2,jz2);
697
698             /* Calculate squared distance and things based on it */
699             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
700             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
701             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
702             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
703             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
704             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
705             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
706             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
707             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
708
709             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
710             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
711             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
712             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
713             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
714             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
715             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
716             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
717             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
718
719             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
720             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
721             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
722             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
723             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
724             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
725             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
726             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
727             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
728
729             fjx0             = _mm256_setzero_ps();
730             fjy0             = _mm256_setzero_ps();
731             fjz0             = _mm256_setzero_ps();
732             fjx1             = _mm256_setzero_ps();
733             fjy1             = _mm256_setzero_ps();
734             fjz1             = _mm256_setzero_ps();
735             fjx2             = _mm256_setzero_ps();
736             fjy2             = _mm256_setzero_ps();
737             fjz2             = _mm256_setzero_ps();
738
739             /**************************
740              * CALCULATE INTERACTIONS *
741              **************************/
742
743             r00              = _mm256_mul_ps(rsq00,rinv00);
744             r00              = _mm256_andnot_ps(dummy_mask,r00);
745
746             /* Calculate table index by multiplying r with table scale and truncate to integer */
747             rt               = _mm256_mul_ps(r00,vftabscale);
748             vfitab           = _mm256_cvttps_epi32(rt);
749             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
750             /* AVX1 does not support 256-bit integer operations, so we switch to two 128-bit halves here */
751             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
752             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
753             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
754             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
755
756             /* COULOMB ELECTROSTATICS */
757             velec            = _mm256_mul_ps(qq00,rinv00);
758             felec            = _mm256_mul_ps(velec,rinvsq00);
759
760             /* CUBIC SPLINE TABLE DISPERSION */
761             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
762                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
763             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
764                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
765             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
766                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
767             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
768                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
769             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
770             Heps             = _mm256_mul_ps(vfeps,H);
771             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
772             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
773             vvdw6            = _mm256_mul_ps(c6_00,VV);
774             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
775             fvdw6            = _mm256_mul_ps(c6_00,FF);
776
777             /* CUBIC SPLINE TABLE REPULSION */
778             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
779             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
780             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
781                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
782             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
783                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
784             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
785                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
786             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
787                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
788             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
789             Heps             = _mm256_mul_ps(vfeps,H);
790             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
791             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
792             vvdw12           = _mm256_mul_ps(c12_00,VV);
793             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
794             fvdw12           = _mm256_mul_ps(c12_00,FF);
795             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
796             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
797
798             /* Update potential sum for this i atom from the interaction with this j atom. */
799             velec            = _mm256_andnot_ps(dummy_mask,velec);
800             velecsum         = _mm256_add_ps(velecsum,velec);
801             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
802             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
803
804             fscal            = _mm256_add_ps(felec,fvdw);
805
806             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
807
808             /* Calculate temporary vectorial force */
809             tx               = _mm256_mul_ps(fscal,dx00);
810             ty               = _mm256_mul_ps(fscal,dy00);
811             tz               = _mm256_mul_ps(fscal,dz00);
812
813             /* Update vectorial force */
814             fix0             = _mm256_add_ps(fix0,tx);
815             fiy0             = _mm256_add_ps(fiy0,ty);
816             fiz0             = _mm256_add_ps(fiz0,tz);
817
818             fjx0             = _mm256_add_ps(fjx0,tx);
819             fjy0             = _mm256_add_ps(fjy0,ty);
820             fjz0             = _mm256_add_ps(fjz0,tz);
821
822             /**************************
823              * CALCULATE INTERACTIONS *
824              **************************/
825
826             /* COULOMB ELECTROSTATICS */
827             velec            = _mm256_mul_ps(qq01,rinv01);
828             felec            = _mm256_mul_ps(velec,rinvsq01);
829
830             /* Update potential sum for this i atom from the interaction with this j atom. */
831             velec            = _mm256_andnot_ps(dummy_mask,velec);
832             velecsum         = _mm256_add_ps(velecsum,velec);
833
834             fscal            = felec;
835
836             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
837
838             /* Calculate temporary vectorial force */
839             tx               = _mm256_mul_ps(fscal,dx01);
840             ty               = _mm256_mul_ps(fscal,dy01);
841             tz               = _mm256_mul_ps(fscal,dz01);
842
843             /* Update vectorial force */
844             fix0             = _mm256_add_ps(fix0,tx);
845             fiy0             = _mm256_add_ps(fiy0,ty);
846             fiz0             = _mm256_add_ps(fiz0,tz);
847
848             fjx1             = _mm256_add_ps(fjx1,tx);
849             fjy1             = _mm256_add_ps(fjy1,ty);
850             fjz1             = _mm256_add_ps(fjz1,tz);
851
852             /**************************
853              * CALCULATE INTERACTIONS *
854              **************************/
855
856             /* COULOMB ELECTROSTATICS */
857             velec            = _mm256_mul_ps(qq02,rinv02);
858             felec            = _mm256_mul_ps(velec,rinvsq02);
859
860             /* Update potential sum for this i atom from the interaction with this j atom. */
861             velec            = _mm256_andnot_ps(dummy_mask,velec);
862             velecsum         = _mm256_add_ps(velecsum,velec);
863
864             fscal            = felec;
865
866             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
867
868             /* Calculate temporary vectorial force */
869             tx               = _mm256_mul_ps(fscal,dx02);
870             ty               = _mm256_mul_ps(fscal,dy02);
871             tz               = _mm256_mul_ps(fscal,dz02);
872
873             /* Update vectorial force */
874             fix0             = _mm256_add_ps(fix0,tx);
875             fiy0             = _mm256_add_ps(fiy0,ty);
876             fiz0             = _mm256_add_ps(fiz0,tz);
877
878             fjx2             = _mm256_add_ps(fjx2,tx);
879             fjy2             = _mm256_add_ps(fjy2,ty);
880             fjz2             = _mm256_add_ps(fjz2,tz);
881
882             /**************************
883              * CALCULATE INTERACTIONS *
884              **************************/
885
886             /* COULOMB ELECTROSTATICS */
887             velec            = _mm256_mul_ps(qq10,rinv10);
888             felec            = _mm256_mul_ps(velec,rinvsq10);
889
890             /* Update potential sum for this i atom from the interaction with this j atom. */
891             velec            = _mm256_andnot_ps(dummy_mask,velec);
892             velecsum         = _mm256_add_ps(velecsum,velec);
893
894             fscal            = felec;
895
896             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
897
898             /* Calculate temporary vectorial force */
899             tx               = _mm256_mul_ps(fscal,dx10);
900             ty               = _mm256_mul_ps(fscal,dy10);
901             tz               = _mm256_mul_ps(fscal,dz10);
902
903             /* Update vectorial force */
904             fix1             = _mm256_add_ps(fix1,tx);
905             fiy1             = _mm256_add_ps(fiy1,ty);
906             fiz1             = _mm256_add_ps(fiz1,tz);
907
908             fjx0             = _mm256_add_ps(fjx0,tx);
909             fjy0             = _mm256_add_ps(fjy0,ty);
910             fjz0             = _mm256_add_ps(fjz0,tz);
911
912             /**************************
913              * CALCULATE INTERACTIONS *
914              **************************/
915
916             /* COULOMB ELECTROSTATICS */
917             velec            = _mm256_mul_ps(qq11,rinv11);
918             felec            = _mm256_mul_ps(velec,rinvsq11);
919
920             /* Update potential sum for this i atom from the interaction with this j atom. */
921             velec            = _mm256_andnot_ps(dummy_mask,velec);
922             velecsum         = _mm256_add_ps(velecsum,velec);
923
924             fscal            = felec;
925
926             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
927
928             /* Calculate temporary vectorial force */
929             tx               = _mm256_mul_ps(fscal,dx11);
930             ty               = _mm256_mul_ps(fscal,dy11);
931             tz               = _mm256_mul_ps(fscal,dz11);
932
933             /* Update vectorial force */
934             fix1             = _mm256_add_ps(fix1,tx);
935             fiy1             = _mm256_add_ps(fiy1,ty);
936             fiz1             = _mm256_add_ps(fiz1,tz);
937
938             fjx1             = _mm256_add_ps(fjx1,tx);
939             fjy1             = _mm256_add_ps(fjy1,ty);
940             fjz1             = _mm256_add_ps(fjz1,tz);
941
942             /**************************
943              * CALCULATE INTERACTIONS *
944              **************************/
945
946             /* COULOMB ELECTROSTATICS */
947             velec            = _mm256_mul_ps(qq12,rinv12);
948             felec            = _mm256_mul_ps(velec,rinvsq12);
949
950             /* Update potential sum for this i atom from the interaction with this j atom. */
951             velec            = _mm256_andnot_ps(dummy_mask,velec);
952             velecsum         = _mm256_add_ps(velecsum,velec);
953
954             fscal            = felec;
955
956             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
957
958             /* Calculate temporary vectorial force */
959             tx               = _mm256_mul_ps(fscal,dx12);
960             ty               = _mm256_mul_ps(fscal,dy12);
961             tz               = _mm256_mul_ps(fscal,dz12);
962
963             /* Update vectorial force */
964             fix1             = _mm256_add_ps(fix1,tx);
965             fiy1             = _mm256_add_ps(fiy1,ty);
966             fiz1             = _mm256_add_ps(fiz1,tz);
967
968             fjx2             = _mm256_add_ps(fjx2,tx);
969             fjy2             = _mm256_add_ps(fjy2,ty);
970             fjz2             = _mm256_add_ps(fjz2,tz);
971
972             /**************************
973              * CALCULATE INTERACTIONS *
974              **************************/
975
976             /* COULOMB ELECTROSTATICS */
977             velec            = _mm256_mul_ps(qq20,rinv20);
978             felec            = _mm256_mul_ps(velec,rinvsq20);
979
980             /* Update potential sum for this i atom from the interaction with this j atom. */
981             velec            = _mm256_andnot_ps(dummy_mask,velec);
982             velecsum         = _mm256_add_ps(velecsum,velec);
983
984             fscal            = felec;
985
986             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
987
988             /* Calculate temporary vectorial force */
989             tx               = _mm256_mul_ps(fscal,dx20);
990             ty               = _mm256_mul_ps(fscal,dy20);
991             tz               = _mm256_mul_ps(fscal,dz20);
992
993             /* Update vectorial force */
994             fix2             = _mm256_add_ps(fix2,tx);
995             fiy2             = _mm256_add_ps(fiy2,ty);
996             fiz2             = _mm256_add_ps(fiz2,tz);
997
998             fjx0             = _mm256_add_ps(fjx0,tx);
999             fjy0             = _mm256_add_ps(fjy0,ty);
1000             fjz0             = _mm256_add_ps(fjz0,tz);
1001
1002             /**************************
1003              * CALCULATE INTERACTIONS *
1004              **************************/
1005
1006             /* COULOMB ELECTROSTATICS */
1007             velec            = _mm256_mul_ps(qq21,rinv21);
1008             felec            = _mm256_mul_ps(velec,rinvsq21);
1009
1010             /* Update potential sum for this i atom from the interaction with this j atom. */
1011             velec            = _mm256_andnot_ps(dummy_mask,velec);
1012             velecsum         = _mm256_add_ps(velecsum,velec);
1013
1014             fscal            = felec;
1015
1016             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1017
1018             /* Calculate temporary vectorial force */
1019             tx               = _mm256_mul_ps(fscal,dx21);
1020             ty               = _mm256_mul_ps(fscal,dy21);
1021             tz               = _mm256_mul_ps(fscal,dz21);
1022
1023             /* Update vectorial force */
1024             fix2             = _mm256_add_ps(fix2,tx);
1025             fiy2             = _mm256_add_ps(fiy2,ty);
1026             fiz2             = _mm256_add_ps(fiz2,tz);
1027
1028             fjx1             = _mm256_add_ps(fjx1,tx);
1029             fjy1             = _mm256_add_ps(fjy1,ty);
1030             fjz1             = _mm256_add_ps(fjz1,tz);
1031
1032             /**************************
1033              * CALCULATE INTERACTIONS *
1034              **************************/
1035
1036             /* COULOMB ELECTROSTATICS */
1037             velec            = _mm256_mul_ps(qq22,rinv22);
1038             felec            = _mm256_mul_ps(velec,rinvsq22);
1039
1040             /* Update potential sum for this i atom from the interaction with this j atom. */
1041             velec            = _mm256_andnot_ps(dummy_mask,velec);
1042             velecsum         = _mm256_add_ps(velecsum,velec);
1043
1044             fscal            = felec;
1045
1046             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1047
1048             /* Calculate temporary vectorial force */
1049             tx               = _mm256_mul_ps(fscal,dx22);
1050             ty               = _mm256_mul_ps(fscal,dy22);
1051             tz               = _mm256_mul_ps(fscal,dz22);
1052
1053             /* Update vectorial force */
1054             fix2             = _mm256_add_ps(fix2,tx);
1055             fiy2             = _mm256_add_ps(fiy2,ty);
1056             fiz2             = _mm256_add_ps(fiz2,tz);
1057
1058             fjx2             = _mm256_add_ps(fjx2,tx);
1059             fjy2             = _mm256_add_ps(fjy2,ty);
1060             fjz2             = _mm256_add_ps(fjz2,tz);
1061
1062             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1063             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1064             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1065             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1066             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1067             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1068             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1069             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1070
1071             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1072                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1073
1074             /* Inner loop uses 279 flops */
1075         }
1076
1077         /* End of innermost loop */
1078
1079         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1080                                                  f+i_coord_offset,fshift+i_shift_offset);
1081
1082         ggid                        = gid[iidx];
1083         /* Update potential energies */
1084         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1085         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
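        /* gmx_mm256_update_1pot_ps reduces the eight SIMD lanes of the accumulator
         * and adds the result to the scalar energy-group entry.
         */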
1086
1087         /* Increment number of inner iterations */
1088         inneriter                  += j_index_end - j_index_start;
1089
1090         /* Outer loop uses 20 flops */
1091     }
1092
1093     /* Increment number of outer iterations */
1094     outeriter        += nri;
1095
1096     /* Update outer/inner flops */
1097
1098     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*279);
1099 }
1100 /*
1101  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_256_single
1102  * Electrostatics interaction: Coulomb
1103  * VdW interaction:            CubicSplineTable
1104  * Geometry:                   Water3-Water3
1105  * Calculate force/pot:        Force
1106  */
1107 void
1108 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_256_single
1109                     (t_nblist                    * gmx_restrict       nlist,
1110                      rvec                        * gmx_restrict          xx,
1111                      rvec                        * gmx_restrict          ff,
1112                      t_forcerec                  * gmx_restrict          fr,
1113                      t_mdatoms                   * gmx_restrict     mdatoms,
1114                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1115                      t_nrnb                      * gmx_restrict        nrnb)
1116 {
1117     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1118      * just 0 for non-waters.
1119      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1120      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1121      */
1122     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1123     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1124     int              jnrA,jnrB,jnrC,jnrD;
1125     int              jnrE,jnrF,jnrG,jnrH;
1126     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1127     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1128     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1129     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1130     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1131     real             rcutoff_scalar;
1132     real             *shiftvec,*fshift,*x,*f;
1133     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1134     real             scratch[4*DIM];
1135     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1136     real *           vdwioffsetptr0;
1137     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1138     real *           vdwioffsetptr1;
1139     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1140     real *           vdwioffsetptr2;
1141     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1142     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1143     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1144     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1145     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1146     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1147     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1148     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1149     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1150     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1151     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1152     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1153     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1154     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1155     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1156     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1157     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1158     real             *charge;
1159     int              nvdwtype;
1160     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1161     int              *vdwtype;
1162     real             *vdwparam;
1163     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1164     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1165     __m256i          vfitab;
1166     __m128i          vfitab_lo,vfitab_hi;
1167     __m128i          ifour       = _mm_set1_epi32(4);
1168     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1169     real             *vftab;
1170     __m256           dummy_mask,cutoff_mask;
1171     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1172     __m256           one     = _mm256_set1_ps(1.0);
1173     __m256           two     = _mm256_set1_ps(2.0);
1174     x                = xx[0];
1175     f                = ff[0];
1176
1177     nri              = nlist->nri;
1178     iinr             = nlist->iinr;
1179     jindex           = nlist->jindex;
1180     jjnr             = nlist->jjnr;
1181     shiftidx         = nlist->shift;
1182     gid              = nlist->gid;
1183     shiftvec         = fr->shift_vec[0];
1184     fshift           = fr->fshift[0];
1185     facel            = _mm256_set1_ps(fr->epsfac);
1186     charge           = mdatoms->chargeA;
1187     nvdwtype         = fr->ntype;
1188     vdwparam         = fr->nbfp;
1189     vdwtype          = mdatoms->typeA;
1190
1191     vftab            = kernel_data->table_vdw->data;
1192     vftabscale       = _mm256_set1_ps(kernel_data->table_vdw->scale);
1193
1194     /* Setup water-specific parameters */
1195     inr              = nlist->iinr[0];
1196     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1197     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1198     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1199     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1200
1201     jq0              = _mm256_set1_ps(charge[inr+0]);
1202     jq1              = _mm256_set1_ps(charge[inr+1]);
1203     jq2              = _mm256_set1_ps(charge[inr+2]);
1204     vdwjidx0A        = 2*vdwtype[inr+0];
1205     qq00             = _mm256_mul_ps(iq0,jq0);
1206     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1207     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1208     qq01             = _mm256_mul_ps(iq0,jq1);
1209     qq02             = _mm256_mul_ps(iq0,jq2);
1210     qq10             = _mm256_mul_ps(iq1,jq0);
1211     qq11             = _mm256_mul_ps(iq1,jq1);
1212     qq12             = _mm256_mul_ps(iq1,jq2);
1213     qq20             = _mm256_mul_ps(iq2,jq0);
1214     qq21             = _mm256_mul_ps(iq2,jq1);
1215     qq22             = _mm256_mul_ps(iq2,jq2);
1216
1217     /* Avoid spurious compiler warnings */
1218     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1219     j_coord_offsetA = 0;
1220     j_coord_offsetB = 0;
1221     j_coord_offsetC = 0;
1222     j_coord_offsetD = 0;
1223     j_coord_offsetE = 0;
1224     j_coord_offsetF = 0;
1225     j_coord_offsetG = 0;
1226     j_coord_offsetH = 0;
1227
1228     outeriter        = 0;
1229     inneriter        = 0;
1230
1231     for(iidx=0;iidx<4*DIM;iidx++)
1232     {
1233         scratch[iidx] = 0.0;
1234     }
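    /* scratch is a small dump buffer: in the masked tail loop below, force updates
     * for padding lanes are written here instead of into the real force array f.
     */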
1235
1236     /* Start outer loop over neighborlists */
1237     for(iidx=0; iidx<nri; iidx++)
1238     {
1239         /* Load shift vector for this list */
1240         i_shift_offset   = DIM*shiftidx[iidx];
1241
1242         /* Load limits for loop over neighbors */
1243         j_index_start    = jindex[iidx];
1244         j_index_end      = jindex[iidx+1];
1245
1246         /* Get outer coordinate index */
1247         inr              = iinr[iidx];
1248         i_coord_offset   = DIM*inr;
1249
1250         /* Load i particle coords and add shift vector */
1251         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1252                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1253
1254         fix0             = _mm256_setzero_ps();
1255         fiy0             = _mm256_setzero_ps();
1256         fiz0             = _mm256_setzero_ps();
1257         fix1             = _mm256_setzero_ps();
1258         fiy1             = _mm256_setzero_ps();
1259         fiz1             = _mm256_setzero_ps();
1260         fix2             = _mm256_setzero_ps();
1261         fiy2             = _mm256_setzero_ps();
1262         fiz2             = _mm256_setzero_ps();
1263
1264         /* Start inner kernel loop */
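        /* Full eight-wide iterations are only taken while all eight jjnr entries are
         * valid (jjnr[jidx+7]>=0); any remaining j atoms are handled by the masked
         * block after this loop.
         */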
1265         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1266         {
1267
1268             /* Get j neighbor index, and coordinate index */
1269             jnrA             = jjnr[jidx];
1270             jnrB             = jjnr[jidx+1];
1271             jnrC             = jjnr[jidx+2];
1272             jnrD             = jjnr[jidx+3];
1273             jnrE             = jjnr[jidx+4];
1274             jnrF             = jjnr[jidx+5];
1275             jnrG             = jjnr[jidx+6];
1276             jnrH             = jjnr[jidx+7];
1277             j_coord_offsetA  = DIM*jnrA;
1278             j_coord_offsetB  = DIM*jnrB;
1279             j_coord_offsetC  = DIM*jnrC;
1280             j_coord_offsetD  = DIM*jnrD;
1281             j_coord_offsetE  = DIM*jnrE;
1282             j_coord_offsetF  = DIM*jnrF;
1283             j_coord_offsetG  = DIM*jnrG;
1284             j_coord_offsetH  = DIM*jnrH;
1285
1286             /* load j atom coordinates */
1287             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1288                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1289                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1290                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1291                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1292
1293             /* Calculate displacement vector */
1294             dx00             = _mm256_sub_ps(ix0,jx0);
1295             dy00             = _mm256_sub_ps(iy0,jy0);
1296             dz00             = _mm256_sub_ps(iz0,jz0);
1297             dx01             = _mm256_sub_ps(ix0,jx1);
1298             dy01             = _mm256_sub_ps(iy0,jy1);
1299             dz01             = _mm256_sub_ps(iz0,jz1);
1300             dx02             = _mm256_sub_ps(ix0,jx2);
1301             dy02             = _mm256_sub_ps(iy0,jy2);
1302             dz02             = _mm256_sub_ps(iz0,jz2);
1303             dx10             = _mm256_sub_ps(ix1,jx0);
1304             dy10             = _mm256_sub_ps(iy1,jy0);
1305             dz10             = _mm256_sub_ps(iz1,jz0);
1306             dx11             = _mm256_sub_ps(ix1,jx1);
1307             dy11             = _mm256_sub_ps(iy1,jy1);
1308             dz11             = _mm256_sub_ps(iz1,jz1);
1309             dx12             = _mm256_sub_ps(ix1,jx2);
1310             dy12             = _mm256_sub_ps(iy1,jy2);
1311             dz12             = _mm256_sub_ps(iz1,jz2);
1312             dx20             = _mm256_sub_ps(ix2,jx0);
1313             dy20             = _mm256_sub_ps(iy2,jy0);
1314             dz20             = _mm256_sub_ps(iz2,jz0);
1315             dx21             = _mm256_sub_ps(ix2,jx1);
1316             dy21             = _mm256_sub_ps(iy2,jy1);
1317             dz21             = _mm256_sub_ps(iz2,jz1);
1318             dx22             = _mm256_sub_ps(ix2,jx2);
1319             dy22             = _mm256_sub_ps(iy2,jy2);
1320             dz22             = _mm256_sub_ps(iz2,jz2);
1321
1322             /* Calculate squared distance and things based on it */
1323             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1324             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1325             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1326             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1327             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1328             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1329             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1330             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1331             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1332
1333             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1334             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1335             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1336             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1337             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1338             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1339             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1340             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1341             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1342
1343             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1344             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1345             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1346             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1347             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1348             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1349             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1350             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1351             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1352
1353             fjx0             = _mm256_setzero_ps();
1354             fjy0             = _mm256_setzero_ps();
1355             fjz0             = _mm256_setzero_ps();
1356             fjx1             = _mm256_setzero_ps();
1357             fjy1             = _mm256_setzero_ps();
1358             fjz1             = _mm256_setzero_ps();
1359             fjx2             = _mm256_setzero_ps();
1360             fjy2             = _mm256_setzero_ps();
1361             fjz2             = _mm256_setzero_ps();
1362
1363             /**************************
1364              * CALCULATE INTERACTIONS *
1365              **************************/
1366
1367             r00              = _mm256_mul_ps(rsq00,rinv00);
1368
1369             /* Calculate table index by multiplying r with table scale and truncate to integer */
1370             rt               = _mm256_mul_ps(r00,vftabscale);
1371             vfitab           = _mm256_cvttps_epi32(rt);
1372             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1373             /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode */
1374             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1375             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1376             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
1377             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
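            /* Each table point stores eight floats: the cubic spline coefficients
             * Y,F,G,H for dispersion followed by Y,F,G,H for repulsion, so the index
             * is scaled by 8 (shift left by 3) here and offset by 4 (ifour) below for
             * the repulsion part.
             */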
1378
1379             /* COULOMB ELECTROSTATICS */
1380             velec            = _mm256_mul_ps(qq00,rinv00);
1381             felec            = _mm256_mul_ps(velec,rinvsq00);
1382
1383             /* CUBIC SPLINE TABLE DISPERSION */
1384             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1385                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1386             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1387                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1388             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1389                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1390             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1391                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1392             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1393             Heps             = _mm256_mul_ps(vfeps,H);
1394             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1395             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1396             fvdw6            = _mm256_mul_ps(c6_00,FF);
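            /* Spline evaluation above, written out for one lane as a scalar sketch
             * (illustrative only, with eps the fractional offset within the table bin):
             *   eps   = r*tabscale - floor(r*tabscale);
             *   Fp    = F + eps*(G + eps*H);
             *   FF    = Fp + eps*(G + 2.0f*eps*H);   // d/d(eps) of Y + eps*Fp
             *   fvdw6 = c6*FF;
             * Only the derivative FF is needed in this force-only kernel.
             */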
1397
1398             /* CUBIC SPLINE TABLE REPULSION */
1399             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1400             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1401             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1402                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1403             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1404                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1405             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1406                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1407             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1408                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1409             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1410             Heps             = _mm256_mul_ps(vfeps,H);
1411             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1412             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1413             fvdw12           = _mm256_mul_ps(c12_00,FF);
1414             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
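            /* The table derivatives are converted to a force scalar here:
             * the factor tabscale*rinv turns the table derivative dV/d(eps) into
             * dV/dr divided by r, and the xor with signbit supplies the minus sign,
             * giving fvdw = -(c6*FF_disp + c12*FF_rep)*tabscale/r.
             */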
1415
1416             fscal            = _mm256_add_ps(felec,fvdw);
1417
1418             /* Calculate temporary vectorial force */
1419             tx               = _mm256_mul_ps(fscal,dx00);
1420             ty               = _mm256_mul_ps(fscal,dy00);
1421             tz               = _mm256_mul_ps(fscal,dz00);
1422
1423             /* Update vectorial force */
1424             fix0             = _mm256_add_ps(fix0,tx);
1425             fiy0             = _mm256_add_ps(fiy0,ty);
1426             fiz0             = _mm256_add_ps(fiz0,tz);
1427
1428             fjx0             = _mm256_add_ps(fjx0,tx);
1429             fjy0             = _mm256_add_ps(fjy0,ty);
1430             fjz0             = _mm256_add_ps(fjz0,tz);
1431
1432             /**************************
1433              * CALCULATE INTERACTIONS *
1434              **************************/
1435
1436             /* COULOMB ELECTROSTATICS */
1437             velec            = _mm256_mul_ps(qq01,rinv01);
1438             felec            = _mm256_mul_ps(velec,rinvsq01);
1439
1440             fscal            = felec;
1441
1442             /* Calculate temporary vectorial force */
1443             tx               = _mm256_mul_ps(fscal,dx01);
1444             ty               = _mm256_mul_ps(fscal,dy01);
1445             tz               = _mm256_mul_ps(fscal,dz01);
1446
1447             /* Update vectorial force */
1448             fix0             = _mm256_add_ps(fix0,tx);
1449             fiy0             = _mm256_add_ps(fiy0,ty);
1450             fiz0             = _mm256_add_ps(fiz0,tz);
1451
1452             fjx1             = _mm256_add_ps(fjx1,tx);
1453             fjy1             = _mm256_add_ps(fjy1,ty);
1454             fjz1             = _mm256_add_ps(fjz1,tz);
1455
1456             /**************************
1457              * CALCULATE INTERACTIONS *
1458              **************************/
1459
1460             /* COULOMB ELECTROSTATICS */
1461             velec            = _mm256_mul_ps(qq02,rinv02);
1462             felec            = _mm256_mul_ps(velec,rinvsq02);
1463
1464             fscal            = felec;
1465
1466             /* Calculate temporary vectorial force */
1467             tx               = _mm256_mul_ps(fscal,dx02);
1468             ty               = _mm256_mul_ps(fscal,dy02);
1469             tz               = _mm256_mul_ps(fscal,dz02);
1470
1471             /* Update vectorial force */
1472             fix0             = _mm256_add_ps(fix0,tx);
1473             fiy0             = _mm256_add_ps(fiy0,ty);
1474             fiz0             = _mm256_add_ps(fiz0,tz);
1475
1476             fjx2             = _mm256_add_ps(fjx2,tx);
1477             fjy2             = _mm256_add_ps(fjy2,ty);
1478             fjz2             = _mm256_add_ps(fjz2,tz);
1479
1480             /**************************
1481              * CALCULATE INTERACTIONS *
1482              **************************/
1483
1484             /* COULOMB ELECTROSTATICS */
1485             velec            = _mm256_mul_ps(qq10,rinv10);
1486             felec            = _mm256_mul_ps(velec,rinvsq10);
1487
1488             fscal            = felec;
1489
1490             /* Calculate temporary vectorial force */
1491             tx               = _mm256_mul_ps(fscal,dx10);
1492             ty               = _mm256_mul_ps(fscal,dy10);
1493             tz               = _mm256_mul_ps(fscal,dz10);
1494
1495             /* Update vectorial force */
1496             fix1             = _mm256_add_ps(fix1,tx);
1497             fiy1             = _mm256_add_ps(fiy1,ty);
1498             fiz1             = _mm256_add_ps(fiz1,tz);
1499
1500             fjx0             = _mm256_add_ps(fjx0,tx);
1501             fjy0             = _mm256_add_ps(fjy0,ty);
1502             fjz0             = _mm256_add_ps(fjz0,tz);
1503
1504             /**************************
1505              * CALCULATE INTERACTIONS *
1506              **************************/
1507
1508             /* COULOMB ELECTROSTATICS */
1509             velec            = _mm256_mul_ps(qq11,rinv11);
1510             felec            = _mm256_mul_ps(velec,rinvsq11);
1511
1512             fscal            = felec;
1513
1514             /* Calculate temporary vectorial force */
1515             tx               = _mm256_mul_ps(fscal,dx11);
1516             ty               = _mm256_mul_ps(fscal,dy11);
1517             tz               = _mm256_mul_ps(fscal,dz11);
1518
1519             /* Update vectorial force */
1520             fix1             = _mm256_add_ps(fix1,tx);
1521             fiy1             = _mm256_add_ps(fiy1,ty);
1522             fiz1             = _mm256_add_ps(fiz1,tz);
1523
1524             fjx1             = _mm256_add_ps(fjx1,tx);
1525             fjy1             = _mm256_add_ps(fjy1,ty);
1526             fjz1             = _mm256_add_ps(fjz1,tz);
1527
1528             /**************************
1529              * CALCULATE INTERACTIONS *
1530              **************************/
1531
1532             /* COULOMB ELECTROSTATICS */
1533             velec            = _mm256_mul_ps(qq12,rinv12);
1534             felec            = _mm256_mul_ps(velec,rinvsq12);
1535
1536             fscal            = felec;
1537
1538             /* Calculate temporary vectorial force */
1539             tx               = _mm256_mul_ps(fscal,dx12);
1540             ty               = _mm256_mul_ps(fscal,dy12);
1541             tz               = _mm256_mul_ps(fscal,dz12);
1542
1543             /* Update vectorial force */
1544             fix1             = _mm256_add_ps(fix1,tx);
1545             fiy1             = _mm256_add_ps(fiy1,ty);
1546             fiz1             = _mm256_add_ps(fiz1,tz);
1547
1548             fjx2             = _mm256_add_ps(fjx2,tx);
1549             fjy2             = _mm256_add_ps(fjy2,ty);
1550             fjz2             = _mm256_add_ps(fjz2,tz);
1551
1552             /**************************
1553              * CALCULATE INTERACTIONS *
1554              **************************/
1555
1556             /* COULOMB ELECTROSTATICS */
1557             velec            = _mm256_mul_ps(qq20,rinv20);
1558             felec            = _mm256_mul_ps(velec,rinvsq20);
1559
1560             fscal            = felec;
1561
1562             /* Calculate temporary vectorial force */
1563             tx               = _mm256_mul_ps(fscal,dx20);
1564             ty               = _mm256_mul_ps(fscal,dy20);
1565             tz               = _mm256_mul_ps(fscal,dz20);
1566
1567             /* Update vectorial force */
1568             fix2             = _mm256_add_ps(fix2,tx);
1569             fiy2             = _mm256_add_ps(fiy2,ty);
1570             fiz2             = _mm256_add_ps(fiz2,tz);
1571
1572             fjx0             = _mm256_add_ps(fjx0,tx);
1573             fjy0             = _mm256_add_ps(fjy0,ty);
1574             fjz0             = _mm256_add_ps(fjz0,tz);
1575
1576             /**************************
1577              * CALCULATE INTERACTIONS *
1578              **************************/
1579
1580             /* COULOMB ELECTROSTATICS */
1581             velec            = _mm256_mul_ps(qq21,rinv21);
1582             felec            = _mm256_mul_ps(velec,rinvsq21);
1583
1584             fscal            = felec;
1585
1586             /* Calculate temporary vectorial force */
1587             tx               = _mm256_mul_ps(fscal,dx21);
1588             ty               = _mm256_mul_ps(fscal,dy21);
1589             tz               = _mm256_mul_ps(fscal,dz21);
1590
1591             /* Update vectorial force */
1592             fix2             = _mm256_add_ps(fix2,tx);
1593             fiy2             = _mm256_add_ps(fiy2,ty);
1594             fiz2             = _mm256_add_ps(fiz2,tz);
1595
1596             fjx1             = _mm256_add_ps(fjx1,tx);
1597             fjy1             = _mm256_add_ps(fjy1,ty);
1598             fjz1             = _mm256_add_ps(fjz1,tz);
1599
1600             /**************************
1601              * CALCULATE INTERACTIONS *
1602              **************************/
1603
1604             /* COULOMB ELECTROSTATICS */
1605             velec            = _mm256_mul_ps(qq22,rinv22);
1606             felec            = _mm256_mul_ps(velec,rinvsq22);
1607
1608             fscal            = felec;
1609
1610             /* Calculate temporary vectorial force */
1611             tx               = _mm256_mul_ps(fscal,dx22);
1612             ty               = _mm256_mul_ps(fscal,dy22);
1613             tz               = _mm256_mul_ps(fscal,dz22);
1614
1615             /* Update vectorial force */
1616             fix2             = _mm256_add_ps(fix2,tx);
1617             fiy2             = _mm256_add_ps(fiy2,ty);
1618             fiz2             = _mm256_add_ps(fiz2,tz);
1619
1620             fjx2             = _mm256_add_ps(fjx2,tx);
1621             fjy2             = _mm256_add_ps(fjy2,ty);
1622             fjz2             = _mm256_add_ps(fjz2,tz);
1623
1624             fjptrA             = f+j_coord_offsetA;
1625             fjptrB             = f+j_coord_offsetB;
1626             fjptrC             = f+j_coord_offsetC;
1627             fjptrD             = f+j_coord_offsetD;
1628             fjptrE             = f+j_coord_offsetE;
1629             fjptrF             = f+j_coord_offsetF;
1630             fjptrG             = f+j_coord_offsetG;
1631             fjptrH             = f+j_coord_offsetH;
1632
1633             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1634                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1635
1636             /* Inner loop uses 261 flops */
1637         }
1638
1639         if(jidx<j_index_end)
1640         {
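            /* Masked epilogue: fewer than eight j atoms remain for this i entry.
             * Negative jjnr values mark padding; they are detected below and their
             * contributions are removed with dummy_mask and the scratch buffer.
             */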
1641
1642             /* Get j neighbor index, and coordinate index */
1643             jnrlistA         = jjnr[jidx];
1644             jnrlistB         = jjnr[jidx+1];
1645             jnrlistC         = jjnr[jidx+2];
1646             jnrlistD         = jjnr[jidx+3];
1647             jnrlistE         = jjnr[jidx+4];
1648             jnrlistF         = jjnr[jidx+5];
1649             jnrlistG         = jjnr[jidx+6];
1650             jnrlistH         = jjnr[jidx+7];
1651             /* Sign of each element will be negative for non-real atoms.
1652              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1653              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1654              */
1655             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1656                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1657                                             
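            /* Clamp padding indices to 0 so the coordinate loads below stay in
             * bounds; the results for those lanes are discarded via dummy_mask
             * and the scratch force pointers.
             */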
1658             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1659             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1660             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1661             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1662             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1663             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1664             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1665             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1666             j_coord_offsetA  = DIM*jnrA;
1667             j_coord_offsetB  = DIM*jnrB;
1668             j_coord_offsetC  = DIM*jnrC;
1669             j_coord_offsetD  = DIM*jnrD;
1670             j_coord_offsetE  = DIM*jnrE;
1671             j_coord_offsetF  = DIM*jnrF;
1672             j_coord_offsetG  = DIM*jnrG;
1673             j_coord_offsetH  = DIM*jnrH;
1674
1675             /* load j atom coordinates */
1676             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1677                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1678                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1679                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1680                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1681
1682             /* Calculate displacement vector */
1683             dx00             = _mm256_sub_ps(ix0,jx0);
1684             dy00             = _mm256_sub_ps(iy0,jy0);
1685             dz00             = _mm256_sub_ps(iz0,jz0);
1686             dx01             = _mm256_sub_ps(ix0,jx1);
1687             dy01             = _mm256_sub_ps(iy0,jy1);
1688             dz01             = _mm256_sub_ps(iz0,jz1);
1689             dx02             = _mm256_sub_ps(ix0,jx2);
1690             dy02             = _mm256_sub_ps(iy0,jy2);
1691             dz02             = _mm256_sub_ps(iz0,jz2);
1692             dx10             = _mm256_sub_ps(ix1,jx0);
1693             dy10             = _mm256_sub_ps(iy1,jy0);
1694             dz10             = _mm256_sub_ps(iz1,jz0);
1695             dx11             = _mm256_sub_ps(ix1,jx1);
1696             dy11             = _mm256_sub_ps(iy1,jy1);
1697             dz11             = _mm256_sub_ps(iz1,jz1);
1698             dx12             = _mm256_sub_ps(ix1,jx2);
1699             dy12             = _mm256_sub_ps(iy1,jy2);
1700             dz12             = _mm256_sub_ps(iz1,jz2);
1701             dx20             = _mm256_sub_ps(ix2,jx0);
1702             dy20             = _mm256_sub_ps(iy2,jy0);
1703             dz20             = _mm256_sub_ps(iz2,jz0);
1704             dx21             = _mm256_sub_ps(ix2,jx1);
1705             dy21             = _mm256_sub_ps(iy2,jy1);
1706             dz21             = _mm256_sub_ps(iz2,jz1);
1707             dx22             = _mm256_sub_ps(ix2,jx2);
1708             dy22             = _mm256_sub_ps(iy2,jy2);
1709             dz22             = _mm256_sub_ps(iz2,jz2);
1710
1711             /* Calculate squared distance and things based on it */
1712             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1713             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1714             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1715             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1716             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1717             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1718             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1719             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1720             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1721
1722             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1723             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1724             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1725             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1726             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1727             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1728             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1729             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1730             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1731
1732             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1733             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1734             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1735             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1736             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1737             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1738             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1739             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1740             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1741
1742             fjx0             = _mm256_setzero_ps();
1743             fjy0             = _mm256_setzero_ps();
1744             fjz0             = _mm256_setzero_ps();
1745             fjx1             = _mm256_setzero_ps();
1746             fjy1             = _mm256_setzero_ps();
1747             fjz1             = _mm256_setzero_ps();
1748             fjx2             = _mm256_setzero_ps();
1749             fjy2             = _mm256_setzero_ps();
1750             fjz2             = _mm256_setzero_ps();
1751
1752             /**************************
1753              * CALCULATE INTERACTIONS *
1754              **************************/
1755
1756             r00              = _mm256_mul_ps(rsq00,rinv00);
1757             r00              = _mm256_andnot_ps(dummy_mask,r00);
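            /* r is zeroed for padding lanes, presumably to keep the derived table
             * index small and in bounds; the forces for those lanes are cleared
             * again via dummy_mask below.
             */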
1758
1759             /* Calculate table index by multiplying r with table scale and truncate to integer */
1760             rt               = _mm256_mul_ps(r00,vftabscale);
1761             vfitab           = _mm256_cvttps_epi32(rt);
1762             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1763             /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode */
1764             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1765             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1766             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
1767             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
1768
1769             /* COULOMB ELECTROSTATICS */
1770             velec            = _mm256_mul_ps(qq00,rinv00);
1771             felec            = _mm256_mul_ps(velec,rinvsq00);
1772
1773             /* CUBIC SPLINE TABLE DISPERSION */
1774             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1775                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1776             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1777                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1778             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1779                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1780             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1781                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1782             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1783             Heps             = _mm256_mul_ps(vfeps,H);
1784             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1785             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1786             fvdw6            = _mm256_mul_ps(c6_00,FF);
1787
1788             /* CUBIC SPLINE TABLE REPULSION */
1789             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1790             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1791             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1792                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1793             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1794                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1795             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1796                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1797             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1798                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1799             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1800             Heps             = _mm256_mul_ps(vfeps,H);
1801             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1802             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1803             fvdw12           = _mm256_mul_ps(c12_00,FF);
1804             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1805
1806             fscal            = _mm256_add_ps(felec,fvdw);
1807
1808             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1809
1810             /* Calculate temporary vectorial force */
1811             tx               = _mm256_mul_ps(fscal,dx00);
1812             ty               = _mm256_mul_ps(fscal,dy00);
1813             tz               = _mm256_mul_ps(fscal,dz00);
1814
1815             /* Update vectorial force */
1816             fix0             = _mm256_add_ps(fix0,tx);
1817             fiy0             = _mm256_add_ps(fiy0,ty);
1818             fiz0             = _mm256_add_ps(fiz0,tz);
1819
1820             fjx0             = _mm256_add_ps(fjx0,tx);
1821             fjy0             = _mm256_add_ps(fjy0,ty);
1822             fjz0             = _mm256_add_ps(fjz0,tz);
1823
1824             /**************************
1825              * CALCULATE INTERACTIONS *
1826              **************************/
1827
1828             /* COULOMB ELECTROSTATICS */
1829             velec            = _mm256_mul_ps(qq01,rinv01);
1830             felec            = _mm256_mul_ps(velec,rinvsq01);
1831
1832             fscal            = felec;
1833
1834             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1835
1836             /* Calculate temporary vectorial force */
1837             tx               = _mm256_mul_ps(fscal,dx01);
1838             ty               = _mm256_mul_ps(fscal,dy01);
1839             tz               = _mm256_mul_ps(fscal,dz01);
1840
1841             /* Update vectorial force */
1842             fix0             = _mm256_add_ps(fix0,tx);
1843             fiy0             = _mm256_add_ps(fiy0,ty);
1844             fiz0             = _mm256_add_ps(fiz0,tz);
1845
1846             fjx1             = _mm256_add_ps(fjx1,tx);
1847             fjy1             = _mm256_add_ps(fjy1,ty);
1848             fjz1             = _mm256_add_ps(fjz1,tz);
1849
1850             /**************************
1851              * CALCULATE INTERACTIONS *
1852              **************************/
1853
1854             /* COULOMB ELECTROSTATICS */
1855             velec            = _mm256_mul_ps(qq02,rinv02);
1856             felec            = _mm256_mul_ps(velec,rinvsq02);
1857
1858             fscal            = felec;
1859
1860             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1861
1862             /* Calculate temporary vectorial force */
1863             tx               = _mm256_mul_ps(fscal,dx02);
1864             ty               = _mm256_mul_ps(fscal,dy02);
1865             tz               = _mm256_mul_ps(fscal,dz02);
1866
1867             /* Update vectorial force */
1868             fix0             = _mm256_add_ps(fix0,tx);
1869             fiy0             = _mm256_add_ps(fiy0,ty);
1870             fiz0             = _mm256_add_ps(fiz0,tz);
1871
1872             fjx2             = _mm256_add_ps(fjx2,tx);
1873             fjy2             = _mm256_add_ps(fjy2,ty);
1874             fjz2             = _mm256_add_ps(fjz2,tz);
1875
1876             /**************************
1877              * CALCULATE INTERACTIONS *
1878              **************************/
1879
1880             /* COULOMB ELECTROSTATICS */
1881             velec            = _mm256_mul_ps(qq10,rinv10);
1882             felec            = _mm256_mul_ps(velec,rinvsq10);
1883
1884             fscal            = felec;
1885
1886             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1887
1888             /* Calculate temporary vectorial force */
1889             tx               = _mm256_mul_ps(fscal,dx10);
1890             ty               = _mm256_mul_ps(fscal,dy10);
1891             tz               = _mm256_mul_ps(fscal,dz10);
1892
1893             /* Update vectorial force */
1894             fix1             = _mm256_add_ps(fix1,tx);
1895             fiy1             = _mm256_add_ps(fiy1,ty);
1896             fiz1             = _mm256_add_ps(fiz1,tz);
1897
1898             fjx0             = _mm256_add_ps(fjx0,tx);
1899             fjy0             = _mm256_add_ps(fjy0,ty);
1900             fjz0             = _mm256_add_ps(fjz0,tz);
1901
1902             /**************************
1903              * CALCULATE INTERACTIONS *
1904              **************************/
1905
1906             /* COULOMB ELECTROSTATICS */
1907             velec            = _mm256_mul_ps(qq11,rinv11);
1908             felec            = _mm256_mul_ps(velec,rinvsq11);
1909
1910             fscal            = felec;
1911
1912             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1913
1914             /* Calculate temporary vectorial force */
1915             tx               = _mm256_mul_ps(fscal,dx11);
1916             ty               = _mm256_mul_ps(fscal,dy11);
1917             tz               = _mm256_mul_ps(fscal,dz11);
1918
1919             /* Update vectorial force */
1920             fix1             = _mm256_add_ps(fix1,tx);
1921             fiy1             = _mm256_add_ps(fiy1,ty);
1922             fiz1             = _mm256_add_ps(fiz1,tz);
1923
1924             fjx1             = _mm256_add_ps(fjx1,tx);
1925             fjy1             = _mm256_add_ps(fjy1,ty);
1926             fjz1             = _mm256_add_ps(fjz1,tz);
1927
1928             /**************************
1929              * CALCULATE INTERACTIONS *
1930              **************************/
1931
1932             /* COULOMB ELECTROSTATICS */
1933             velec            = _mm256_mul_ps(qq12,rinv12);
1934             felec            = _mm256_mul_ps(velec,rinvsq12);
1935
1936             fscal            = felec;
1937
1938             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1939
1940             /* Calculate temporary vectorial force */
1941             tx               = _mm256_mul_ps(fscal,dx12);
1942             ty               = _mm256_mul_ps(fscal,dy12);
1943             tz               = _mm256_mul_ps(fscal,dz12);
1944
1945             /* Update vectorial force */
1946             fix1             = _mm256_add_ps(fix1,tx);
1947             fiy1             = _mm256_add_ps(fiy1,ty);
1948             fiz1             = _mm256_add_ps(fiz1,tz);
1949
1950             fjx2             = _mm256_add_ps(fjx2,tx);
1951             fjy2             = _mm256_add_ps(fjy2,ty);
1952             fjz2             = _mm256_add_ps(fjz2,tz);
1953
1954             /**************************
1955              * CALCULATE INTERACTIONS *
1956              **************************/
1957
1958             /* COULOMB ELECTROSTATICS */
1959             velec            = _mm256_mul_ps(qq20,rinv20);
1960             felec            = _mm256_mul_ps(velec,rinvsq20);
1961
1962             fscal            = felec;
1963
1964             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1965
1966             /* Calculate temporary vectorial force */
1967             tx               = _mm256_mul_ps(fscal,dx20);
1968             ty               = _mm256_mul_ps(fscal,dy20);
1969             tz               = _mm256_mul_ps(fscal,dz20);
1970
1971             /* Update vectorial force */
1972             fix2             = _mm256_add_ps(fix2,tx);
1973             fiy2             = _mm256_add_ps(fiy2,ty);
1974             fiz2             = _mm256_add_ps(fiz2,tz);
1975
1976             fjx0             = _mm256_add_ps(fjx0,tx);
1977             fjy0             = _mm256_add_ps(fjy0,ty);
1978             fjz0             = _mm256_add_ps(fjz0,tz);
1979
1980             /**************************
1981              * CALCULATE INTERACTIONS *
1982              **************************/
1983
1984             /* COULOMB ELECTROSTATICS */
1985             velec            = _mm256_mul_ps(qq21,rinv21);
1986             felec            = _mm256_mul_ps(velec,rinvsq21);
1987
1988             fscal            = felec;
1989
1990             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1991
1992             /* Calculate temporary vectorial force */
1993             tx               = _mm256_mul_ps(fscal,dx21);
1994             ty               = _mm256_mul_ps(fscal,dy21);
1995             tz               = _mm256_mul_ps(fscal,dz21);
1996
1997             /* Update vectorial force */
1998             fix2             = _mm256_add_ps(fix2,tx);
1999             fiy2             = _mm256_add_ps(fiy2,ty);
2000             fiz2             = _mm256_add_ps(fiz2,tz);
2001
2002             fjx1             = _mm256_add_ps(fjx1,tx);
2003             fjy1             = _mm256_add_ps(fjy1,ty);
2004             fjz1             = _mm256_add_ps(fjz1,tz);
2005
2006             /**************************
2007              * CALCULATE INTERACTIONS *
2008              **************************/
2009
2010             /* COULOMB ELECTROSTATICS */
2011             velec            = _mm256_mul_ps(qq22,rinv22);
2012             felec            = _mm256_mul_ps(velec,rinvsq22);
2013
2014             fscal            = felec;
2015
2016             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2017
2018             /* Calculate temporary vectorial force */
2019             tx               = _mm256_mul_ps(fscal,dx22);
2020             ty               = _mm256_mul_ps(fscal,dy22);
2021             tz               = _mm256_mul_ps(fscal,dz22);
2022
2023             /* Update vectorial force */
2024             fix2             = _mm256_add_ps(fix2,tx);
2025             fiy2             = _mm256_add_ps(fiy2,ty);
2026             fiz2             = _mm256_add_ps(fiz2,tz);
2027
2028             fjx2             = _mm256_add_ps(fjx2,tx);
2029             fjy2             = _mm256_add_ps(fjy2,ty);
2030             fjz2             = _mm256_add_ps(fjz2,tz);
2031
2032             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2033             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2034             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2035             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2036             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2037             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2038             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2039             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2040
2041             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2042                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2043
2044             /* Inner loop uses 262 flops */
2045         }
2046
2047         /* End of innermost loop */
2048
2049         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2050                                                  f+i_coord_offset,fshift+i_shift_offset);
2051
2052         /* Increment number of inner iterations */
2053         inneriter                  += j_index_end - j_index_start;
2054
2055         /* Outer loop uses 18 flops */
2056     }
2057
2058     /* Increment number of outer iterations */
2059     outeriter        += nri;
2060
2061     /* Update outer/inner flops */
2062
2063     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*262);
2064 }