made errors during GPU detection non-fatal
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_avx_256_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_256_single
38  * Electrostatics interaction: Coulomb
39  * VdW interaction:            CubicSplineTable
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_256_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56      * jnr indices corresponding to data put in the eight positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrE,jnrF,jnrG,jnrH;
62     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
67     real             rcutoff_scalar;
68     real             *shiftvec,*fshift,*x,*f;
69     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
70     real             scratch[4*DIM];
71     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72     real *           vdwioffsetptr0;
73     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74     real *           vdwioffsetptr1;
75     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76     real *           vdwioffsetptr2;
77     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
81     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
83     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
85     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
86     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
87     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
88     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
89     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
90     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
92     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
93     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
94     real             *charge;
95     int              nvdwtype;
96     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
97     int              *vdwtype;
98     real             *vdwparam;
99     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
100     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
101     __m256i          vfitab;
102     __m128i          vfitab_lo,vfitab_hi;
103     __m128i          ifour       = _mm_set1_epi32(4);
104     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
105     real             *vftab;
106     __m256           dummy_mask,cutoff_mask;
107     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
108     __m256           one     = _mm256_set1_ps(1.0);
109     __m256           two     = _mm256_set1_ps(2.0);
110     x                = xx[0];
111     f                = ff[0];
112
113     nri              = nlist->nri;
114     iinr             = nlist->iinr;
115     jindex           = nlist->jindex;
116     jjnr             = nlist->jjnr;
117     shiftidx         = nlist->shift;
118     gid              = nlist->gid;
119     shiftvec         = fr->shift_vec[0];
120     fshift           = fr->fshift[0];
121     facel            = _mm256_set1_ps(fr->epsfac);
122     charge           = mdatoms->chargeA;
123     nvdwtype         = fr->ntype;
124     vdwparam         = fr->nbfp;
125     vdwtype          = mdatoms->typeA;
126
127     vftab            = kernel_data->table_vdw->data;
128     vftabscale       = _mm256_set1_ps(kernel_data->table_vdw->scale);
129
130     /* Setup water-specific parameters */
131     inr              = nlist->iinr[0];
132     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
133     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
134     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
135     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
136
137     jq0              = _mm256_set1_ps(charge[inr+0]);
138     jq1              = _mm256_set1_ps(charge[inr+1]);
139     jq2              = _mm256_set1_ps(charge[inr+2]);
140     vdwjidx0A        = 2*vdwtype[inr+0];
141     qq00             = _mm256_mul_ps(iq0,jq0);
142     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
143     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
144     qq01             = _mm256_mul_ps(iq0,jq1);
145     qq02             = _mm256_mul_ps(iq0,jq2);
146     qq10             = _mm256_mul_ps(iq1,jq0);
147     qq11             = _mm256_mul_ps(iq1,jq1);
148     qq12             = _mm256_mul_ps(iq1,jq2);
149     qq20             = _mm256_mul_ps(iq2,jq0);
150     qq21             = _mm256_mul_ps(iq2,jq1);
151     qq22             = _mm256_mul_ps(iq2,jq2);
152
153     /* Avoid stupid compiler warnings */
154     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
155     j_coord_offsetA = 0;
156     j_coord_offsetB = 0;
157     j_coord_offsetC = 0;
158     j_coord_offsetD = 0;
159     j_coord_offsetE = 0;
160     j_coord_offsetF = 0;
161     j_coord_offsetG = 0;
162     j_coord_offsetH = 0;
163
164     outeriter        = 0;
165     inneriter        = 0;
166
167     for(iidx=0;iidx<4*DIM;iidx++)
168     {
169         scratch[iidx] = 0.0;
170     }
171
172     /* Start outer loop over neighborlists */
173     for(iidx=0; iidx<nri; iidx++)
174     {
175         /* Load shift vector for this list */
176         i_shift_offset   = DIM*shiftidx[iidx];
177
178         /* Load limits for loop over neighbors */
179         j_index_start    = jindex[iidx];
180         j_index_end      = jindex[iidx+1];
181
182         /* Get outer coordinate index */
183         inr              = iinr[iidx];
184         i_coord_offset   = DIM*inr;
185
186         /* Load i particle coords and add shift vector */
187         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
188                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
189
190         fix0             = _mm256_setzero_ps();
191         fiy0             = _mm256_setzero_ps();
192         fiz0             = _mm256_setzero_ps();
193         fix1             = _mm256_setzero_ps();
194         fiy1             = _mm256_setzero_ps();
195         fiz1             = _mm256_setzero_ps();
196         fix2             = _mm256_setzero_ps();
197         fiy2             = _mm256_setzero_ps();
198         fiz2             = _mm256_setzero_ps();
199
200         /* Reset potential sums */
201         velecsum         = _mm256_setzero_ps();
202         vvdwsum          = _mm256_setzero_ps();
203
204         /* Start inner kernel loop */
205         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
206         {
207
208             /* Get j neighbor index, and coordinate index */
209             jnrA             = jjnr[jidx];
210             jnrB             = jjnr[jidx+1];
211             jnrC             = jjnr[jidx+2];
212             jnrD             = jjnr[jidx+3];
213             jnrE             = jjnr[jidx+4];
214             jnrF             = jjnr[jidx+5];
215             jnrG             = jjnr[jidx+6];
216             jnrH             = jjnr[jidx+7];
217             j_coord_offsetA  = DIM*jnrA;
218             j_coord_offsetB  = DIM*jnrB;
219             j_coord_offsetC  = DIM*jnrC;
220             j_coord_offsetD  = DIM*jnrD;
221             j_coord_offsetE  = DIM*jnrE;
222             j_coord_offsetF  = DIM*jnrF;
223             j_coord_offsetG  = DIM*jnrG;
224             j_coord_offsetH  = DIM*jnrH;
225
226             /* load j atom coordinates */
227             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
228                                                  x+j_coord_offsetC,x+j_coord_offsetD,
229                                                  x+j_coord_offsetE,x+j_coord_offsetF,
230                                                  x+j_coord_offsetG,x+j_coord_offsetH,
231                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
232
233             /* Calculate displacement vector */
234             dx00             = _mm256_sub_ps(ix0,jx0);
235             dy00             = _mm256_sub_ps(iy0,jy0);
236             dz00             = _mm256_sub_ps(iz0,jz0);
237             dx01             = _mm256_sub_ps(ix0,jx1);
238             dy01             = _mm256_sub_ps(iy0,jy1);
239             dz01             = _mm256_sub_ps(iz0,jz1);
240             dx02             = _mm256_sub_ps(ix0,jx2);
241             dy02             = _mm256_sub_ps(iy0,jy2);
242             dz02             = _mm256_sub_ps(iz0,jz2);
243             dx10             = _mm256_sub_ps(ix1,jx0);
244             dy10             = _mm256_sub_ps(iy1,jy0);
245             dz10             = _mm256_sub_ps(iz1,jz0);
246             dx11             = _mm256_sub_ps(ix1,jx1);
247             dy11             = _mm256_sub_ps(iy1,jy1);
248             dz11             = _mm256_sub_ps(iz1,jz1);
249             dx12             = _mm256_sub_ps(ix1,jx2);
250             dy12             = _mm256_sub_ps(iy1,jy2);
251             dz12             = _mm256_sub_ps(iz1,jz2);
252             dx20             = _mm256_sub_ps(ix2,jx0);
253             dy20             = _mm256_sub_ps(iy2,jy0);
254             dz20             = _mm256_sub_ps(iz2,jz0);
255             dx21             = _mm256_sub_ps(ix2,jx1);
256             dy21             = _mm256_sub_ps(iy2,jy1);
257             dz21             = _mm256_sub_ps(iz2,jz1);
258             dx22             = _mm256_sub_ps(ix2,jx2);
259             dy22             = _mm256_sub_ps(iy2,jy2);
260             dz22             = _mm256_sub_ps(iz2,jz2);
261
262             /* Calculate squared distance and things based on it */
263             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
264             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
265             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
266             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
267             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
268             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
269             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
270             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
271             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
272
273             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
274             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
275             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
276             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
277             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
278             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
279             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
280             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
281             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
282
283             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
284             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
285             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
286             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
287             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
288             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
289             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
290             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
291             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
292
293             fjx0             = _mm256_setzero_ps();
294             fjy0             = _mm256_setzero_ps();
295             fjz0             = _mm256_setzero_ps();
296             fjx1             = _mm256_setzero_ps();
297             fjy1             = _mm256_setzero_ps();
298             fjz1             = _mm256_setzero_ps();
299             fjx2             = _mm256_setzero_ps();
300             fjy2             = _mm256_setzero_ps();
301             fjz2             = _mm256_setzero_ps();
302
303             /**************************
304              * CALCULATE INTERACTIONS *
305              **************************/
306
307             r00              = _mm256_mul_ps(rsq00,rinv00);
308
309             /* Calculate table index by multiplying r with table scale and truncate to integer */
310             rt               = _mm256_mul_ps(r00,vftabscale);
311             vfitab           = _mm256_cvttps_epi32(rt);
312             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
313             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
314             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
315             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
316             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
317             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
318
319             /* COULOMB ELECTROSTATICS */
320             velec            = _mm256_mul_ps(qq00,rinv00);
321             felec            = _mm256_mul_ps(velec,rinvsq00);
322
323             /* CUBIC SPLINE TABLE DISPERSION */
324             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
325                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
326             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
327                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
328             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
329                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
330             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
331                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
332             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
333             Heps             = _mm256_mul_ps(vfeps,H);
334             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
335             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
336             vvdw6            = _mm256_mul_ps(c6_00,VV);
337             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
338             fvdw6            = _mm256_mul_ps(c6_00,FF);
339
340             /* CUBIC SPLINE TABLE REPULSION */
341             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
342             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
343             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
344                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
345             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
346                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
347             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
348                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
349             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
350                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
351             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
352             Heps             = _mm256_mul_ps(vfeps,H);
353             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
354             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
355             vvdw12           = _mm256_mul_ps(c12_00,VV);
356             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
357             fvdw12           = _mm256_mul_ps(c12_00,FF);
358             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
359             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
360
361             /* Update potential sum for this i atom from the interaction with this j atom. */
362             velecsum         = _mm256_add_ps(velecsum,velec);
363             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
364
365             fscal            = _mm256_add_ps(felec,fvdw);
366
367             /* Calculate temporary vectorial force */
368             tx               = _mm256_mul_ps(fscal,dx00);
369             ty               = _mm256_mul_ps(fscal,dy00);
370             tz               = _mm256_mul_ps(fscal,dz00);
371
372             /* Update vectorial force */
373             fix0             = _mm256_add_ps(fix0,tx);
374             fiy0             = _mm256_add_ps(fiy0,ty);
375             fiz0             = _mm256_add_ps(fiz0,tz);
376
377             fjx0             = _mm256_add_ps(fjx0,tx);
378             fjy0             = _mm256_add_ps(fjy0,ty);
379             fjz0             = _mm256_add_ps(fjz0,tz);
380
381             /**************************
382              * CALCULATE INTERACTIONS *
383              **************************/
384
385             /* COULOMB ELECTROSTATICS */
386             velec            = _mm256_mul_ps(qq01,rinv01);
387             felec            = _mm256_mul_ps(velec,rinvsq01);
388
389             /* Update potential sum for this i atom from the interaction with this j atom. */
390             velecsum         = _mm256_add_ps(velecsum,velec);
391
392             fscal            = felec;
393
394             /* Calculate temporary vectorial force */
395             tx               = _mm256_mul_ps(fscal,dx01);
396             ty               = _mm256_mul_ps(fscal,dy01);
397             tz               = _mm256_mul_ps(fscal,dz01);
398
399             /* Update vectorial force */
400             fix0             = _mm256_add_ps(fix0,tx);
401             fiy0             = _mm256_add_ps(fiy0,ty);
402             fiz0             = _mm256_add_ps(fiz0,tz);
403
404             fjx1             = _mm256_add_ps(fjx1,tx);
405             fjy1             = _mm256_add_ps(fjy1,ty);
406             fjz1             = _mm256_add_ps(fjz1,tz);
407
408             /**************************
409              * CALCULATE INTERACTIONS *
410              **************************/
411
412             /* COULOMB ELECTROSTATICS */
413             velec            = _mm256_mul_ps(qq02,rinv02);
414             felec            = _mm256_mul_ps(velec,rinvsq02);
415
416             /* Update potential sum for this i atom from the interaction with this j atom. */
417             velecsum         = _mm256_add_ps(velecsum,velec);
418
419             fscal            = felec;
420
421             /* Calculate temporary vectorial force */
422             tx               = _mm256_mul_ps(fscal,dx02);
423             ty               = _mm256_mul_ps(fscal,dy02);
424             tz               = _mm256_mul_ps(fscal,dz02);
425
426             /* Update vectorial force */
427             fix0             = _mm256_add_ps(fix0,tx);
428             fiy0             = _mm256_add_ps(fiy0,ty);
429             fiz0             = _mm256_add_ps(fiz0,tz);
430
431             fjx2             = _mm256_add_ps(fjx2,tx);
432             fjy2             = _mm256_add_ps(fjy2,ty);
433             fjz2             = _mm256_add_ps(fjz2,tz);
434
435             /**************************
436              * CALCULATE INTERACTIONS *
437              **************************/
438
439             /* COULOMB ELECTROSTATICS */
440             velec            = _mm256_mul_ps(qq10,rinv10);
441             felec            = _mm256_mul_ps(velec,rinvsq10);
442
443             /* Update potential sum for this i atom from the interaction with this j atom. */
444             velecsum         = _mm256_add_ps(velecsum,velec);
445
446             fscal            = felec;
447
448             /* Calculate temporary vectorial force */
449             tx               = _mm256_mul_ps(fscal,dx10);
450             ty               = _mm256_mul_ps(fscal,dy10);
451             tz               = _mm256_mul_ps(fscal,dz10);
452
453             /* Update vectorial force */
454             fix1             = _mm256_add_ps(fix1,tx);
455             fiy1             = _mm256_add_ps(fiy1,ty);
456             fiz1             = _mm256_add_ps(fiz1,tz);
457
458             fjx0             = _mm256_add_ps(fjx0,tx);
459             fjy0             = _mm256_add_ps(fjy0,ty);
460             fjz0             = _mm256_add_ps(fjz0,tz);
461
462             /**************************
463              * CALCULATE INTERACTIONS *
464              **************************/
465
466             /* COULOMB ELECTROSTATICS */
467             velec            = _mm256_mul_ps(qq11,rinv11);
468             felec            = _mm256_mul_ps(velec,rinvsq11);
469
470             /* Update potential sum for this i atom from the interaction with this j atom. */
471             velecsum         = _mm256_add_ps(velecsum,velec);
472
473             fscal            = felec;
474
475             /* Calculate temporary vectorial force */
476             tx               = _mm256_mul_ps(fscal,dx11);
477             ty               = _mm256_mul_ps(fscal,dy11);
478             tz               = _mm256_mul_ps(fscal,dz11);
479
480             /* Update vectorial force */
481             fix1             = _mm256_add_ps(fix1,tx);
482             fiy1             = _mm256_add_ps(fiy1,ty);
483             fiz1             = _mm256_add_ps(fiz1,tz);
484
485             fjx1             = _mm256_add_ps(fjx1,tx);
486             fjy1             = _mm256_add_ps(fjy1,ty);
487             fjz1             = _mm256_add_ps(fjz1,tz);
488
489             /**************************
490              * CALCULATE INTERACTIONS *
491              **************************/
492
493             /* COULOMB ELECTROSTATICS */
494             velec            = _mm256_mul_ps(qq12,rinv12);
495             felec            = _mm256_mul_ps(velec,rinvsq12);
496
497             /* Update potential sum for this i atom from the interaction with this j atom. */
498             velecsum         = _mm256_add_ps(velecsum,velec);
499
500             fscal            = felec;
501
502             /* Calculate temporary vectorial force */
503             tx               = _mm256_mul_ps(fscal,dx12);
504             ty               = _mm256_mul_ps(fscal,dy12);
505             tz               = _mm256_mul_ps(fscal,dz12);
506
507             /* Update vectorial force */
508             fix1             = _mm256_add_ps(fix1,tx);
509             fiy1             = _mm256_add_ps(fiy1,ty);
510             fiz1             = _mm256_add_ps(fiz1,tz);
511
512             fjx2             = _mm256_add_ps(fjx2,tx);
513             fjy2             = _mm256_add_ps(fjy2,ty);
514             fjz2             = _mm256_add_ps(fjz2,tz);
515
516             /**************************
517              * CALCULATE INTERACTIONS *
518              **************************/
519
520             /* COULOMB ELECTROSTATICS */
521             velec            = _mm256_mul_ps(qq20,rinv20);
522             felec            = _mm256_mul_ps(velec,rinvsq20);
523
524             /* Update potential sum for this i atom from the interaction with this j atom. */
525             velecsum         = _mm256_add_ps(velecsum,velec);
526
527             fscal            = felec;
528
529             /* Calculate temporary vectorial force */
530             tx               = _mm256_mul_ps(fscal,dx20);
531             ty               = _mm256_mul_ps(fscal,dy20);
532             tz               = _mm256_mul_ps(fscal,dz20);
533
534             /* Update vectorial force */
535             fix2             = _mm256_add_ps(fix2,tx);
536             fiy2             = _mm256_add_ps(fiy2,ty);
537             fiz2             = _mm256_add_ps(fiz2,tz);
538
539             fjx0             = _mm256_add_ps(fjx0,tx);
540             fjy0             = _mm256_add_ps(fjy0,ty);
541             fjz0             = _mm256_add_ps(fjz0,tz);
542
543             /**************************
544              * CALCULATE INTERACTIONS *
545              **************************/
546
547             /* COULOMB ELECTROSTATICS */
548             velec            = _mm256_mul_ps(qq21,rinv21);
549             felec            = _mm256_mul_ps(velec,rinvsq21);
550
551             /* Update potential sum for this i atom from the interaction with this j atom. */
552             velecsum         = _mm256_add_ps(velecsum,velec);
553
554             fscal            = felec;
555
556             /* Calculate temporary vectorial force */
557             tx               = _mm256_mul_ps(fscal,dx21);
558             ty               = _mm256_mul_ps(fscal,dy21);
559             tz               = _mm256_mul_ps(fscal,dz21);
560
561             /* Update vectorial force */
562             fix2             = _mm256_add_ps(fix2,tx);
563             fiy2             = _mm256_add_ps(fiy2,ty);
564             fiz2             = _mm256_add_ps(fiz2,tz);
565
566             fjx1             = _mm256_add_ps(fjx1,tx);
567             fjy1             = _mm256_add_ps(fjy1,ty);
568             fjz1             = _mm256_add_ps(fjz1,tz);
569
570             /**************************
571              * CALCULATE INTERACTIONS *
572              **************************/
573
574             /* COULOMB ELECTROSTATICS */
575             velec            = _mm256_mul_ps(qq22,rinv22);
576             felec            = _mm256_mul_ps(velec,rinvsq22);
577
578             /* Update potential sum for this i atom from the interaction with this j atom. */
579             velecsum         = _mm256_add_ps(velecsum,velec);
580
581             fscal            = felec;
582
583             /* Calculate temporary vectorial force */
584             tx               = _mm256_mul_ps(fscal,dx22);
585             ty               = _mm256_mul_ps(fscal,dy22);
586             tz               = _mm256_mul_ps(fscal,dz22);
587
588             /* Update vectorial force */
589             fix2             = _mm256_add_ps(fix2,tx);
590             fiy2             = _mm256_add_ps(fiy2,ty);
591             fiz2             = _mm256_add_ps(fiz2,tz);
592
593             fjx2             = _mm256_add_ps(fjx2,tx);
594             fjy2             = _mm256_add_ps(fjy2,ty);
595             fjz2             = _mm256_add_ps(fjz2,tz);
596
597             fjptrA             = f+j_coord_offsetA;
598             fjptrB             = f+j_coord_offsetB;
599             fjptrC             = f+j_coord_offsetC;
600             fjptrD             = f+j_coord_offsetD;
601             fjptrE             = f+j_coord_offsetE;
602             fjptrF             = f+j_coord_offsetF;
603             fjptrG             = f+j_coord_offsetG;
604             fjptrH             = f+j_coord_offsetH;
605
606             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
607                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
608
609             /* Inner loop uses 278 flops */
610         }
611
612         if(jidx<j_index_end)
613         {
614
615             /* Get j neighbor index, and coordinate index */
616             jnrlistA         = jjnr[jidx];
617             jnrlistB         = jjnr[jidx+1];
618             jnrlistC         = jjnr[jidx+2];
619             jnrlistD         = jjnr[jidx+3];
620             jnrlistE         = jjnr[jidx+4];
621             jnrlistF         = jjnr[jidx+5];
622             jnrlistG         = jjnr[jidx+6];
623             jnrlistH         = jjnr[jidx+7];
624             /* Sign of each element will be negative for non-real atoms.
625              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
626              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
627              */
628             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
629                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
630                                             
631             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
632             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
633             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
634             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
635             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
636             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
637             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
638             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
639             j_coord_offsetA  = DIM*jnrA;
640             j_coord_offsetB  = DIM*jnrB;
641             j_coord_offsetC  = DIM*jnrC;
642             j_coord_offsetD  = DIM*jnrD;
643             j_coord_offsetE  = DIM*jnrE;
644             j_coord_offsetF  = DIM*jnrF;
645             j_coord_offsetG  = DIM*jnrG;
646             j_coord_offsetH  = DIM*jnrH;
647
648             /* load j atom coordinates */
649             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
650                                                  x+j_coord_offsetC,x+j_coord_offsetD,
651                                                  x+j_coord_offsetE,x+j_coord_offsetF,
652                                                  x+j_coord_offsetG,x+j_coord_offsetH,
653                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
654
655             /* Calculate displacement vector */
656             dx00             = _mm256_sub_ps(ix0,jx0);
657             dy00             = _mm256_sub_ps(iy0,jy0);
658             dz00             = _mm256_sub_ps(iz0,jz0);
659             dx01             = _mm256_sub_ps(ix0,jx1);
660             dy01             = _mm256_sub_ps(iy0,jy1);
661             dz01             = _mm256_sub_ps(iz0,jz1);
662             dx02             = _mm256_sub_ps(ix0,jx2);
663             dy02             = _mm256_sub_ps(iy0,jy2);
664             dz02             = _mm256_sub_ps(iz0,jz2);
665             dx10             = _mm256_sub_ps(ix1,jx0);
666             dy10             = _mm256_sub_ps(iy1,jy0);
667             dz10             = _mm256_sub_ps(iz1,jz0);
668             dx11             = _mm256_sub_ps(ix1,jx1);
669             dy11             = _mm256_sub_ps(iy1,jy1);
670             dz11             = _mm256_sub_ps(iz1,jz1);
671             dx12             = _mm256_sub_ps(ix1,jx2);
672             dy12             = _mm256_sub_ps(iy1,jy2);
673             dz12             = _mm256_sub_ps(iz1,jz2);
674             dx20             = _mm256_sub_ps(ix2,jx0);
675             dy20             = _mm256_sub_ps(iy2,jy0);
676             dz20             = _mm256_sub_ps(iz2,jz0);
677             dx21             = _mm256_sub_ps(ix2,jx1);
678             dy21             = _mm256_sub_ps(iy2,jy1);
679             dz21             = _mm256_sub_ps(iz2,jz1);
680             dx22             = _mm256_sub_ps(ix2,jx2);
681             dy22             = _mm256_sub_ps(iy2,jy2);
682             dz22             = _mm256_sub_ps(iz2,jz2);
683
684             /* Calculate squared distance and things based on it */
685             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
686             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
687             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
688             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
689             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
690             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
691             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
692             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
693             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
694
695             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
696             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
697             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
698             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
699             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
700             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
701             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
702             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
703             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
704
705             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
706             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
707             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
708             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
709             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
710             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
711             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
712             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
713             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
714
715             fjx0             = _mm256_setzero_ps();
716             fjy0             = _mm256_setzero_ps();
717             fjz0             = _mm256_setzero_ps();
718             fjx1             = _mm256_setzero_ps();
719             fjy1             = _mm256_setzero_ps();
720             fjz1             = _mm256_setzero_ps();
721             fjx2             = _mm256_setzero_ps();
722             fjy2             = _mm256_setzero_ps();
723             fjz2             = _mm256_setzero_ps();
724
725             /**************************
726              * CALCULATE INTERACTIONS *
727              **************************/
728
729             r00              = _mm256_mul_ps(rsq00,rinv00);
730             r00              = _mm256_andnot_ps(dummy_mask,r00);
731
732             /* Calculate table index by multiplying r with table scale and truncate to integer */
733             rt               = _mm256_mul_ps(r00,vftabscale);
734             vfitab           = _mm256_cvttps_epi32(rt);
735             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
736             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
737             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
738             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
739             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
740             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
741
742             /* COULOMB ELECTROSTATICS */
743             velec            = _mm256_mul_ps(qq00,rinv00);
744             felec            = _mm256_mul_ps(velec,rinvsq00);
745
746             /* CUBIC SPLINE TABLE DISPERSION */
747             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
748                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
749             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
750                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
751             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
752                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
753             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
754                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
755             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
756             Heps             = _mm256_mul_ps(vfeps,H);
757             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
758             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
759             vvdw6            = _mm256_mul_ps(c6_00,VV);
760             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
761             fvdw6            = _mm256_mul_ps(c6_00,FF);
762
763             /* CUBIC SPLINE TABLE REPULSION */
764             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
765             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
766             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
767                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
768             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
769                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
770             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
771                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
772             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
773                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
774             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
775             Heps             = _mm256_mul_ps(vfeps,H);
776             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
777             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
778             vvdw12           = _mm256_mul_ps(c12_00,VV);
779             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
780             fvdw12           = _mm256_mul_ps(c12_00,FF);
781             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
782             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
783
784             /* Update potential sum for this i atom from the interaction with this j atom. */
785             velec            = _mm256_andnot_ps(dummy_mask,velec);
786             velecsum         = _mm256_add_ps(velecsum,velec);
787             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
788             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
789
790             fscal            = _mm256_add_ps(felec,fvdw);
791
792             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
793
794             /* Calculate temporary vectorial force */
795             tx               = _mm256_mul_ps(fscal,dx00);
796             ty               = _mm256_mul_ps(fscal,dy00);
797             tz               = _mm256_mul_ps(fscal,dz00);
798
799             /* Update vectorial force */
800             fix0             = _mm256_add_ps(fix0,tx);
801             fiy0             = _mm256_add_ps(fiy0,ty);
802             fiz0             = _mm256_add_ps(fiz0,tz);
803
804             fjx0             = _mm256_add_ps(fjx0,tx);
805             fjy0             = _mm256_add_ps(fjy0,ty);
806             fjz0             = _mm256_add_ps(fjz0,tz);
807
808             /**************************
809              * CALCULATE INTERACTIONS *
810              **************************/
811
812             /* COULOMB ELECTROSTATICS */
813             velec            = _mm256_mul_ps(qq01,rinv01);
814             felec            = _mm256_mul_ps(velec,rinvsq01);
815
816             /* Update potential sum for this i atom from the interaction with this j atom. */
817             velec            = _mm256_andnot_ps(dummy_mask,velec);
818             velecsum         = _mm256_add_ps(velecsum,velec);
819
820             fscal            = felec;
821
822             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
823
824             /* Calculate temporary vectorial force */
825             tx               = _mm256_mul_ps(fscal,dx01);
826             ty               = _mm256_mul_ps(fscal,dy01);
827             tz               = _mm256_mul_ps(fscal,dz01);
828
829             /* Update vectorial force */
830             fix0             = _mm256_add_ps(fix0,tx);
831             fiy0             = _mm256_add_ps(fiy0,ty);
832             fiz0             = _mm256_add_ps(fiz0,tz);
833
834             fjx1             = _mm256_add_ps(fjx1,tx);
835             fjy1             = _mm256_add_ps(fjy1,ty);
836             fjz1             = _mm256_add_ps(fjz1,tz);
837
838             /**************************
839              * CALCULATE INTERACTIONS *
840              **************************/
841
842             /* COULOMB ELECTROSTATICS */
843             velec            = _mm256_mul_ps(qq02,rinv02);
844             felec            = _mm256_mul_ps(velec,rinvsq02);
845
846             /* Update potential sum for this i atom from the interaction with this j atom. */
847             velec            = _mm256_andnot_ps(dummy_mask,velec);
848             velecsum         = _mm256_add_ps(velecsum,velec);
849
850             fscal            = felec;
851
852             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
853
854             /* Calculate temporary vectorial force */
855             tx               = _mm256_mul_ps(fscal,dx02);
856             ty               = _mm256_mul_ps(fscal,dy02);
857             tz               = _mm256_mul_ps(fscal,dz02);
858
859             /* Update vectorial force */
860             fix0             = _mm256_add_ps(fix0,tx);
861             fiy0             = _mm256_add_ps(fiy0,ty);
862             fiz0             = _mm256_add_ps(fiz0,tz);
863
864             fjx2             = _mm256_add_ps(fjx2,tx);
865             fjy2             = _mm256_add_ps(fjy2,ty);
866             fjz2             = _mm256_add_ps(fjz2,tz);
867
868             /**************************
869              * CALCULATE INTERACTIONS *
870              **************************/
871
872             /* COULOMB ELECTROSTATICS */
873             velec            = _mm256_mul_ps(qq10,rinv10);
874             felec            = _mm256_mul_ps(velec,rinvsq10);
875
876             /* Update potential sum for this i atom from the interaction with this j atom. */
877             velec            = _mm256_andnot_ps(dummy_mask,velec);
878             velecsum         = _mm256_add_ps(velecsum,velec);
879
880             fscal            = felec;
881
882             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
883
884             /* Calculate temporary vectorial force */
885             tx               = _mm256_mul_ps(fscal,dx10);
886             ty               = _mm256_mul_ps(fscal,dy10);
887             tz               = _mm256_mul_ps(fscal,dz10);
888
889             /* Update vectorial force */
890             fix1             = _mm256_add_ps(fix1,tx);
891             fiy1             = _mm256_add_ps(fiy1,ty);
892             fiz1             = _mm256_add_ps(fiz1,tz);
893
894             fjx0             = _mm256_add_ps(fjx0,tx);
895             fjy0             = _mm256_add_ps(fjy0,ty);
896             fjz0             = _mm256_add_ps(fjz0,tz);
897
898             /**************************
899              * CALCULATE INTERACTIONS *
900              **************************/
901
902             /* COULOMB ELECTROSTATICS */
903             velec            = _mm256_mul_ps(qq11,rinv11);
904             felec            = _mm256_mul_ps(velec,rinvsq11);
905
906             /* Update potential sum for this i atom from the interaction with this j atom. */
907             velec            = _mm256_andnot_ps(dummy_mask,velec);
908             velecsum         = _mm256_add_ps(velecsum,velec);
909
910             fscal            = felec;
911
912             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
913
914             /* Calculate temporary vectorial force */
915             tx               = _mm256_mul_ps(fscal,dx11);
916             ty               = _mm256_mul_ps(fscal,dy11);
917             tz               = _mm256_mul_ps(fscal,dz11);
918
919             /* Update vectorial force */
920             fix1             = _mm256_add_ps(fix1,tx);
921             fiy1             = _mm256_add_ps(fiy1,ty);
922             fiz1             = _mm256_add_ps(fiz1,tz);
923
924             fjx1             = _mm256_add_ps(fjx1,tx);
925             fjy1             = _mm256_add_ps(fjy1,ty);
926             fjz1             = _mm256_add_ps(fjz1,tz);
927
928             /**************************
929              * CALCULATE INTERACTIONS *
930              **************************/
931
932             /* COULOMB ELECTROSTATICS */
933             velec            = _mm256_mul_ps(qq12,rinv12);
934             felec            = _mm256_mul_ps(velec,rinvsq12);
935
936             /* Update potential sum for this i atom from the interaction with this j atom. */
937             velec            = _mm256_andnot_ps(dummy_mask,velec);
938             velecsum         = _mm256_add_ps(velecsum,velec);
939
940             fscal            = felec;
941
942             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
943
944             /* Calculate temporary vectorial force */
945             tx               = _mm256_mul_ps(fscal,dx12);
946             ty               = _mm256_mul_ps(fscal,dy12);
947             tz               = _mm256_mul_ps(fscal,dz12);
948
949             /* Update vectorial force */
950             fix1             = _mm256_add_ps(fix1,tx);
951             fiy1             = _mm256_add_ps(fiy1,ty);
952             fiz1             = _mm256_add_ps(fiz1,tz);
953
954             fjx2             = _mm256_add_ps(fjx2,tx);
955             fjy2             = _mm256_add_ps(fjy2,ty);
956             fjz2             = _mm256_add_ps(fjz2,tz);
957
958             /**************************
959              * CALCULATE INTERACTIONS *
960              **************************/
961
962             /* COULOMB ELECTROSTATICS */
963             velec            = _mm256_mul_ps(qq20,rinv20);
964             felec            = _mm256_mul_ps(velec,rinvsq20);
965
966             /* Update potential sum for this i atom from the interaction with this j atom. */
967             velec            = _mm256_andnot_ps(dummy_mask,velec);
968             velecsum         = _mm256_add_ps(velecsum,velec);
969
970             fscal            = felec;
971
972             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
973
974             /* Calculate temporary vectorial force */
975             tx               = _mm256_mul_ps(fscal,dx20);
976             ty               = _mm256_mul_ps(fscal,dy20);
977             tz               = _mm256_mul_ps(fscal,dz20);
978
979             /* Update vectorial force */
980             fix2             = _mm256_add_ps(fix2,tx);
981             fiy2             = _mm256_add_ps(fiy2,ty);
982             fiz2             = _mm256_add_ps(fiz2,tz);
983
984             fjx0             = _mm256_add_ps(fjx0,tx);
985             fjy0             = _mm256_add_ps(fjy0,ty);
986             fjz0             = _mm256_add_ps(fjz0,tz);
987
988             /**************************
989              * CALCULATE INTERACTIONS *
990              **************************/
991
992             /* COULOMB ELECTROSTATICS */
993             velec            = _mm256_mul_ps(qq21,rinv21);
994             felec            = _mm256_mul_ps(velec,rinvsq21);
995
996             /* Update potential sum for this i atom from the interaction with this j atom. */
997             velec            = _mm256_andnot_ps(dummy_mask,velec);
998             velecsum         = _mm256_add_ps(velecsum,velec);
999
1000             fscal            = felec;
1001
1002             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1003
1004             /* Calculate temporary vectorial force */
1005             tx               = _mm256_mul_ps(fscal,dx21);
1006             ty               = _mm256_mul_ps(fscal,dy21);
1007             tz               = _mm256_mul_ps(fscal,dz21);
1008
1009             /* Update vectorial force */
1010             fix2             = _mm256_add_ps(fix2,tx);
1011             fiy2             = _mm256_add_ps(fiy2,ty);
1012             fiz2             = _mm256_add_ps(fiz2,tz);
1013
1014             fjx1             = _mm256_add_ps(fjx1,tx);
1015             fjy1             = _mm256_add_ps(fjy1,ty);
1016             fjz1             = _mm256_add_ps(fjz1,tz);
1017
1018             /**************************
1019              * CALCULATE INTERACTIONS *
1020              **************************/
1021
1022             /* COULOMB ELECTROSTATICS */
1023             velec            = _mm256_mul_ps(qq22,rinv22);
1024             felec            = _mm256_mul_ps(velec,rinvsq22);
1025
1026             /* Update potential sum for this i atom from the interaction with this j atom. */
1027             velec            = _mm256_andnot_ps(dummy_mask,velec);
1028             velecsum         = _mm256_add_ps(velecsum,velec);
1029
1030             fscal            = felec;
1031
1032             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1033
1034             /* Calculate temporary vectorial force */
1035             tx               = _mm256_mul_ps(fscal,dx22);
1036             ty               = _mm256_mul_ps(fscal,dy22);
1037             tz               = _mm256_mul_ps(fscal,dz22);
1038
1039             /* Update vectorial force */
1040             fix2             = _mm256_add_ps(fix2,tx);
1041             fiy2             = _mm256_add_ps(fiy2,ty);
1042             fiz2             = _mm256_add_ps(fiz2,tz);
1043
1044             fjx2             = _mm256_add_ps(fjx2,tx);
1045             fjy2             = _mm256_add_ps(fjy2,ty);
1046             fjz2             = _mm256_add_ps(fjz2,tz);
1047
1048             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1049             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1050             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1051             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1052             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1053             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1054             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1055             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1056
1057             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1058                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1059
1060             /* Inner loop uses 279 flops */
1061         }
1062
1063         /* End of innermost loop */
1064
1065         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1066                                                  f+i_coord_offset,fshift+i_shift_offset);
1067
1068         ggid                        = gid[iidx];
1069         /* Update potential energies */
1070         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1071         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1072
1073         /* Increment number of inner iterations */
1074         inneriter                  += j_index_end - j_index_start;
1075
1076         /* Outer loop uses 20 flops */
1077     }
1078
1079     /* Increment number of outer iterations */
1080     outeriter        += nri;
1081
1082     /* Update outer/inner flops */
1083
1084     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*279);
1085 }
1086 /*
1087  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_256_single
1088  * Electrostatics interaction: Coulomb
1089  * VdW interaction:            CubicSplineTable
1090  * Geometry:                   Water3-Water3
1091  * Calculate force/pot:        Force
1092  */
1093 void
1094 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_256_single
1095                     (t_nblist * gmx_restrict                nlist,
1096                      rvec * gmx_restrict                    xx,
1097                      rvec * gmx_restrict                    ff,
1098                      t_forcerec * gmx_restrict              fr,
1099                      t_mdatoms * gmx_restrict               mdatoms,
1100                      nb_kernel_data_t * gmx_restrict        kernel_data,
1101                      t_nrnb * gmx_restrict                  nrnb)
1102 {
1103     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1104      * just 0 for non-waters.
1105      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1106      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1107      */
1108     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1109     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1110     int              jnrA,jnrB,jnrC,jnrD;
1111     int              jnrE,jnrF,jnrG,jnrH;
1112     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1113     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1114     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1115     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1116     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1117     real             rcutoff_scalar;
1118     real             *shiftvec,*fshift,*x,*f;
1119     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1120     real             scratch[4*DIM];
1121     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1122     real *           vdwioffsetptr0;
1123     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1124     real *           vdwioffsetptr1;
1125     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1126     real *           vdwioffsetptr2;
1127     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1128     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1129     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1130     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1131     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1132     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1133     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1134     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1135     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1136     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1137     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1138     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1139     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1140     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1141     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1142     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1143     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1144     real             *charge;
1145     int              nvdwtype;
1146     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1147     int              *vdwtype;
1148     real             *vdwparam;
1149     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1150     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1151     __m256i          vfitab;
1152     __m128i          vfitab_lo,vfitab_hi;
1153     __m128i          ifour       = _mm_set1_epi32(4);
1154     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1155     real             *vftab;
1156     __m256           dummy_mask,cutoff_mask;
1157     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1158     __m256           one     = _mm256_set1_ps(1.0);
1159     __m256           two     = _mm256_set1_ps(2.0);
1160     x                = xx[0];
1161     f                = ff[0];
1162
1163     nri              = nlist->nri;
1164     iinr             = nlist->iinr;
1165     jindex           = nlist->jindex;
1166     jjnr             = nlist->jjnr;
1167     shiftidx         = nlist->shift;
1168     gid              = nlist->gid;
1169     shiftvec         = fr->shift_vec[0];
1170     fshift           = fr->fshift[0];
1171     facel            = _mm256_set1_ps(fr->epsfac);
1172     charge           = mdatoms->chargeA;
1173     nvdwtype         = fr->ntype;
1174     vdwparam         = fr->nbfp;
1175     vdwtype          = mdatoms->typeA;
1176
1177     vftab            = kernel_data->table_vdw->data;
1178     vftabscale       = _mm256_set1_ps(kernel_data->table_vdw->scale);
1179
1180     /* Setup water-specific parameters */
1181     inr              = nlist->iinr[0];
1182     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1183     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1184     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1185     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1186
1187     jq0              = _mm256_set1_ps(charge[inr+0]);
1188     jq1              = _mm256_set1_ps(charge[inr+1]);
1189     jq2              = _mm256_set1_ps(charge[inr+2]);
1190     vdwjidx0A        = 2*vdwtype[inr+0];
1191     qq00             = _mm256_mul_ps(iq0,jq0);
1192     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1193     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1194     qq01             = _mm256_mul_ps(iq0,jq1);
1195     qq02             = _mm256_mul_ps(iq0,jq2);
1196     qq10             = _mm256_mul_ps(iq1,jq0);
1197     qq11             = _mm256_mul_ps(iq1,jq1);
1198     qq12             = _mm256_mul_ps(iq1,jq2);
1199     qq20             = _mm256_mul_ps(iq2,jq0);
1200     qq21             = _mm256_mul_ps(iq2,jq1);
1201     qq22             = _mm256_mul_ps(iq2,jq2);
1202
1203     /* Avoid stupid compiler warnings */
1204     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1205     j_coord_offsetA = 0;
1206     j_coord_offsetB = 0;
1207     j_coord_offsetC = 0;
1208     j_coord_offsetD = 0;
1209     j_coord_offsetE = 0;
1210     j_coord_offsetF = 0;
1211     j_coord_offsetG = 0;
1212     j_coord_offsetH = 0;
1213
1214     outeriter        = 0;
1215     inneriter        = 0;
1216
1217     for(iidx=0;iidx<4*DIM;iidx++)
1218     {
1219         scratch[iidx] = 0.0;
1220     }
1221
1222     /* Start outer loop over neighborlists */
1223     for(iidx=0; iidx<nri; iidx++)
1224     {
1225         /* Load shift vector for this list */
1226         i_shift_offset   = DIM*shiftidx[iidx];
1227
1228         /* Load limits for loop over neighbors */
1229         j_index_start    = jindex[iidx];
1230         j_index_end      = jindex[iidx+1];
1231
1232         /* Get outer coordinate index */
1233         inr              = iinr[iidx];
1234         i_coord_offset   = DIM*inr;
1235
1236         /* Load i particle coords and add shift vector */
1237         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1238                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1239
1240         fix0             = _mm256_setzero_ps();
1241         fiy0             = _mm256_setzero_ps();
1242         fiz0             = _mm256_setzero_ps();
1243         fix1             = _mm256_setzero_ps();
1244         fiy1             = _mm256_setzero_ps();
1245         fiz1             = _mm256_setzero_ps();
1246         fix2             = _mm256_setzero_ps();
1247         fiy2             = _mm256_setzero_ps();
1248         fiz2             = _mm256_setzero_ps();
1249
1250         /* Start inner kernel loop */
1251         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1252         {
1253
1254             /* Get j neighbor index, and coordinate index */
1255             jnrA             = jjnr[jidx];
1256             jnrB             = jjnr[jidx+1];
1257             jnrC             = jjnr[jidx+2];
1258             jnrD             = jjnr[jidx+3];
1259             jnrE             = jjnr[jidx+4];
1260             jnrF             = jjnr[jidx+5];
1261             jnrG             = jjnr[jidx+6];
1262             jnrH             = jjnr[jidx+7];
1263             j_coord_offsetA  = DIM*jnrA;
1264             j_coord_offsetB  = DIM*jnrB;
1265             j_coord_offsetC  = DIM*jnrC;
1266             j_coord_offsetD  = DIM*jnrD;
1267             j_coord_offsetE  = DIM*jnrE;
1268             j_coord_offsetF  = DIM*jnrF;
1269             j_coord_offsetG  = DIM*jnrG;
1270             j_coord_offsetH  = DIM*jnrH;
1271
1272             /* load j atom coordinates */
1273             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1274                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1275                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1276                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1277                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1278
1279             /* Calculate displacement vector */
1280             dx00             = _mm256_sub_ps(ix0,jx0);
1281             dy00             = _mm256_sub_ps(iy0,jy0);
1282             dz00             = _mm256_sub_ps(iz0,jz0);
1283             dx01             = _mm256_sub_ps(ix0,jx1);
1284             dy01             = _mm256_sub_ps(iy0,jy1);
1285             dz01             = _mm256_sub_ps(iz0,jz1);
1286             dx02             = _mm256_sub_ps(ix0,jx2);
1287             dy02             = _mm256_sub_ps(iy0,jy2);
1288             dz02             = _mm256_sub_ps(iz0,jz2);
1289             dx10             = _mm256_sub_ps(ix1,jx0);
1290             dy10             = _mm256_sub_ps(iy1,jy0);
1291             dz10             = _mm256_sub_ps(iz1,jz0);
1292             dx11             = _mm256_sub_ps(ix1,jx1);
1293             dy11             = _mm256_sub_ps(iy1,jy1);
1294             dz11             = _mm256_sub_ps(iz1,jz1);
1295             dx12             = _mm256_sub_ps(ix1,jx2);
1296             dy12             = _mm256_sub_ps(iy1,jy2);
1297             dz12             = _mm256_sub_ps(iz1,jz2);
1298             dx20             = _mm256_sub_ps(ix2,jx0);
1299             dy20             = _mm256_sub_ps(iy2,jy0);
1300             dz20             = _mm256_sub_ps(iz2,jz0);
1301             dx21             = _mm256_sub_ps(ix2,jx1);
1302             dy21             = _mm256_sub_ps(iy2,jy1);
1303             dz21             = _mm256_sub_ps(iz2,jz1);
1304             dx22             = _mm256_sub_ps(ix2,jx2);
1305             dy22             = _mm256_sub_ps(iy2,jy2);
1306             dz22             = _mm256_sub_ps(iz2,jz2);
1307
1308             /* Calculate squared distance and things based on it */
1309             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1310             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1311             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1312             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1313             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1314             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1315             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1316             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1317             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1318
1319             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1320             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1321             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1322             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1323             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1324             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1325             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1326             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1327             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1328
1329             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1330             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1331             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1332             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1333             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1334             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1335             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1336             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1337             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1338
1339             fjx0             = _mm256_setzero_ps();
1340             fjy0             = _mm256_setzero_ps();
1341             fjz0             = _mm256_setzero_ps();
1342             fjx1             = _mm256_setzero_ps();
1343             fjy1             = _mm256_setzero_ps();
1344             fjz1             = _mm256_setzero_ps();
1345             fjx2             = _mm256_setzero_ps();
1346             fjy2             = _mm256_setzero_ps();
1347             fjz2             = _mm256_setzero_ps();
1348
1349             /**************************
1350              * CALCULATE INTERACTIONS *
1351              **************************/
1352
1353             r00              = _mm256_mul_ps(rsq00,rinv00);
1354
1355             /* Calculate table index by multiplying r with table scale and truncate to integer */
1356             rt               = _mm256_mul_ps(r00,vftabscale);
1357             vfitab           = _mm256_cvttps_epi32(rt);
1358             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1359             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1360             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1361             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1362             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
1363             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
1364
1365             /* COULOMB ELECTROSTATICS */
1366             velec            = _mm256_mul_ps(qq00,rinv00);
1367             felec            = _mm256_mul_ps(velec,rinvsq00);
1368
1369             /* CUBIC SPLINE TABLE DISPERSION */
1370             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1371                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1372             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1373                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1374             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1375                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1376             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1377                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1378             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1379             Heps             = _mm256_mul_ps(vfeps,H);
1380             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1381             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1382             fvdw6            = _mm256_mul_ps(c6_00,FF);
1383
1384             /* CUBIC SPLINE TABLE REPULSION */
1385             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1386             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1387             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1388                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1389             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1390                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1391             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1392                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1393             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1394                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1395             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1396             Heps             = _mm256_mul_ps(vfeps,H);
1397             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1398             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1399             fvdw12           = _mm256_mul_ps(c12_00,FF);
1400             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1401
1402             fscal            = _mm256_add_ps(felec,fvdw);
1403
1404             /* Calculate temporary vectorial force */
1405             tx               = _mm256_mul_ps(fscal,dx00);
1406             ty               = _mm256_mul_ps(fscal,dy00);
1407             tz               = _mm256_mul_ps(fscal,dz00);
1408
1409             /* Update vectorial force */
1410             fix0             = _mm256_add_ps(fix0,tx);
1411             fiy0             = _mm256_add_ps(fiy0,ty);
1412             fiz0             = _mm256_add_ps(fiz0,tz);
1413
1414             fjx0             = _mm256_add_ps(fjx0,tx);
1415             fjy0             = _mm256_add_ps(fjy0,ty);
1416             fjz0             = _mm256_add_ps(fjz0,tz);
1417
1418             /**************************
1419              * CALCULATE INTERACTIONS *
1420              **************************/
1421
1422             /* COULOMB ELECTROSTATICS */
1423             velec            = _mm256_mul_ps(qq01,rinv01);
1424             felec            = _mm256_mul_ps(velec,rinvsq01);
1425
1426             fscal            = felec;
1427
1428             /* Calculate temporary vectorial force */
1429             tx               = _mm256_mul_ps(fscal,dx01);
1430             ty               = _mm256_mul_ps(fscal,dy01);
1431             tz               = _mm256_mul_ps(fscal,dz01);
1432
1433             /* Update vectorial force */
1434             fix0             = _mm256_add_ps(fix0,tx);
1435             fiy0             = _mm256_add_ps(fiy0,ty);
1436             fiz0             = _mm256_add_ps(fiz0,tz);
1437
1438             fjx1             = _mm256_add_ps(fjx1,tx);
1439             fjy1             = _mm256_add_ps(fjy1,ty);
1440             fjz1             = _mm256_add_ps(fjz1,tz);
1441
1442             /**************************
1443              * CALCULATE INTERACTIONS *
1444              **************************/
1445
1446             /* COULOMB ELECTROSTATICS */
1447             velec            = _mm256_mul_ps(qq02,rinv02);
1448             felec            = _mm256_mul_ps(velec,rinvsq02);
1449
1450             fscal            = felec;
1451
1452             /* Calculate temporary vectorial force */
1453             tx               = _mm256_mul_ps(fscal,dx02);
1454             ty               = _mm256_mul_ps(fscal,dy02);
1455             tz               = _mm256_mul_ps(fscal,dz02);
1456
1457             /* Update vectorial force */
1458             fix0             = _mm256_add_ps(fix0,tx);
1459             fiy0             = _mm256_add_ps(fiy0,ty);
1460             fiz0             = _mm256_add_ps(fiz0,tz);
1461
1462             fjx2             = _mm256_add_ps(fjx2,tx);
1463             fjy2             = _mm256_add_ps(fjy2,ty);
1464             fjz2             = _mm256_add_ps(fjz2,tz);
1465
1466             /**************************
1467              * CALCULATE INTERACTIONS *
1468              **************************/
1469
1470             /* COULOMB ELECTROSTATICS */
1471             velec            = _mm256_mul_ps(qq10,rinv10);
1472             felec            = _mm256_mul_ps(velec,rinvsq10);
1473
1474             fscal            = felec;
1475
1476             /* Calculate temporary vectorial force */
1477             tx               = _mm256_mul_ps(fscal,dx10);
1478             ty               = _mm256_mul_ps(fscal,dy10);
1479             tz               = _mm256_mul_ps(fscal,dz10);
1480
1481             /* Update vectorial force */
1482             fix1             = _mm256_add_ps(fix1,tx);
1483             fiy1             = _mm256_add_ps(fiy1,ty);
1484             fiz1             = _mm256_add_ps(fiz1,tz);
1485
1486             fjx0             = _mm256_add_ps(fjx0,tx);
1487             fjy0             = _mm256_add_ps(fjy0,ty);
1488             fjz0             = _mm256_add_ps(fjz0,tz);
1489
1490             /**************************
1491              * CALCULATE INTERACTIONS *
1492              **************************/
1493
1494             /* COULOMB ELECTROSTATICS */
1495             velec            = _mm256_mul_ps(qq11,rinv11);
1496             felec            = _mm256_mul_ps(velec,rinvsq11);
1497
1498             fscal            = felec;
1499
1500             /* Calculate temporary vectorial force */
1501             tx               = _mm256_mul_ps(fscal,dx11);
1502             ty               = _mm256_mul_ps(fscal,dy11);
1503             tz               = _mm256_mul_ps(fscal,dz11);
1504
1505             /* Update vectorial force */
1506             fix1             = _mm256_add_ps(fix1,tx);
1507             fiy1             = _mm256_add_ps(fiy1,ty);
1508             fiz1             = _mm256_add_ps(fiz1,tz);
1509
1510             fjx1             = _mm256_add_ps(fjx1,tx);
1511             fjy1             = _mm256_add_ps(fjy1,ty);
1512             fjz1             = _mm256_add_ps(fjz1,tz);
1513
1514             /**************************
1515              * CALCULATE INTERACTIONS *
1516              **************************/
1517
1518             /* COULOMB ELECTROSTATICS */
1519             velec            = _mm256_mul_ps(qq12,rinv12);
1520             felec            = _mm256_mul_ps(velec,rinvsq12);
1521
1522             fscal            = felec;
1523
1524             /* Calculate temporary vectorial force */
1525             tx               = _mm256_mul_ps(fscal,dx12);
1526             ty               = _mm256_mul_ps(fscal,dy12);
1527             tz               = _mm256_mul_ps(fscal,dz12);
1528
1529             /* Update vectorial force */
1530             fix1             = _mm256_add_ps(fix1,tx);
1531             fiy1             = _mm256_add_ps(fiy1,ty);
1532             fiz1             = _mm256_add_ps(fiz1,tz);
1533
1534             fjx2             = _mm256_add_ps(fjx2,tx);
1535             fjy2             = _mm256_add_ps(fjy2,ty);
1536             fjz2             = _mm256_add_ps(fjz2,tz);
1537
1538             /**************************
1539              * CALCULATE INTERACTIONS *
1540              **************************/
1541
1542             /* COULOMB ELECTROSTATICS */
1543             velec            = _mm256_mul_ps(qq20,rinv20);
1544             felec            = _mm256_mul_ps(velec,rinvsq20);
1545
1546             fscal            = felec;
1547
1548             /* Calculate temporary vectorial force */
1549             tx               = _mm256_mul_ps(fscal,dx20);
1550             ty               = _mm256_mul_ps(fscal,dy20);
1551             tz               = _mm256_mul_ps(fscal,dz20);
1552
1553             /* Update vectorial force */
1554             fix2             = _mm256_add_ps(fix2,tx);
1555             fiy2             = _mm256_add_ps(fiy2,ty);
1556             fiz2             = _mm256_add_ps(fiz2,tz);
1557
1558             fjx0             = _mm256_add_ps(fjx0,tx);
1559             fjy0             = _mm256_add_ps(fjy0,ty);
1560             fjz0             = _mm256_add_ps(fjz0,tz);
1561
1562             /**************************
1563              * CALCULATE INTERACTIONS *
1564              **************************/
1565
1566             /* COULOMB ELECTROSTATICS */
1567             velec            = _mm256_mul_ps(qq21,rinv21);
1568             felec            = _mm256_mul_ps(velec,rinvsq21);
1569
1570             fscal            = felec;
1571
1572             /* Calculate temporary vectorial force */
1573             tx               = _mm256_mul_ps(fscal,dx21);
1574             ty               = _mm256_mul_ps(fscal,dy21);
1575             tz               = _mm256_mul_ps(fscal,dz21);
1576
1577             /* Update vectorial force */
1578             fix2             = _mm256_add_ps(fix2,tx);
1579             fiy2             = _mm256_add_ps(fiy2,ty);
1580             fiz2             = _mm256_add_ps(fiz2,tz);
1581
1582             fjx1             = _mm256_add_ps(fjx1,tx);
1583             fjy1             = _mm256_add_ps(fjy1,ty);
1584             fjz1             = _mm256_add_ps(fjz1,tz);
1585
1586             /**************************
1587              * CALCULATE INTERACTIONS *
1588              **************************/
1589
1590             /* COULOMB ELECTROSTATICS */
1591             velec            = _mm256_mul_ps(qq22,rinv22);
1592             felec            = _mm256_mul_ps(velec,rinvsq22);
1593
1594             fscal            = felec;
1595
1596             /* Calculate temporary vectorial force */
1597             tx               = _mm256_mul_ps(fscal,dx22);
1598             ty               = _mm256_mul_ps(fscal,dy22);
1599             tz               = _mm256_mul_ps(fscal,dz22);
1600
1601             /* Update vectorial force */
1602             fix2             = _mm256_add_ps(fix2,tx);
1603             fiy2             = _mm256_add_ps(fiy2,ty);
1604             fiz2             = _mm256_add_ps(fiz2,tz);
1605
1606             fjx2             = _mm256_add_ps(fjx2,tx);
1607             fjy2             = _mm256_add_ps(fjy2,ty);
1608             fjz2             = _mm256_add_ps(fjz2,tz);
1609
1610             fjptrA             = f+j_coord_offsetA;
1611             fjptrB             = f+j_coord_offsetB;
1612             fjptrC             = f+j_coord_offsetC;
1613             fjptrD             = f+j_coord_offsetD;
1614             fjptrE             = f+j_coord_offsetE;
1615             fjptrF             = f+j_coord_offsetF;
1616             fjptrG             = f+j_coord_offsetG;
1617             fjptrH             = f+j_coord_offsetH;
1618
1619             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1620                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1621
1622             /* Inner loop uses 261 flops */
1623         }
1624
1625         if(jidx<j_index_end)
1626         {
1627
1628             /* Get j neighbor index, and coordinate index */
1629             jnrlistA         = jjnr[jidx];
1630             jnrlistB         = jjnr[jidx+1];
1631             jnrlistC         = jjnr[jidx+2];
1632             jnrlistD         = jjnr[jidx+3];
1633             jnrlistE         = jjnr[jidx+4];
1634             jnrlistF         = jjnr[jidx+5];
1635             jnrlistG         = jjnr[jidx+6];
1636             jnrlistH         = jjnr[jidx+7];
1637             /* The sign of each element is negative for non-real (padding) atoms.
1638              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1639              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1640              */
1641             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1642                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1643                                             
1644             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1645             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1646             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1647             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1648             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1649             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1650             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1651             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1652             j_coord_offsetA  = DIM*jnrA;
1653             j_coord_offsetB  = DIM*jnrB;
1654             j_coord_offsetC  = DIM*jnrC;
1655             j_coord_offsetD  = DIM*jnrD;
1656             j_coord_offsetE  = DIM*jnrE;
1657             j_coord_offsetF  = DIM*jnrF;
1658             j_coord_offsetG  = DIM*jnrG;
1659             j_coord_offsetH  = DIM*jnrH;
1660
1661             /* load j atom coordinates */
1662             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1663                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1664                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1665                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1666                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1667
1668             /* Calculate displacement vector */
1669             dx00             = _mm256_sub_ps(ix0,jx0);
1670             dy00             = _mm256_sub_ps(iy0,jy0);
1671             dz00             = _mm256_sub_ps(iz0,jz0);
1672             dx01             = _mm256_sub_ps(ix0,jx1);
1673             dy01             = _mm256_sub_ps(iy0,jy1);
1674             dz01             = _mm256_sub_ps(iz0,jz1);
1675             dx02             = _mm256_sub_ps(ix0,jx2);
1676             dy02             = _mm256_sub_ps(iy0,jy2);
1677             dz02             = _mm256_sub_ps(iz0,jz2);
1678             dx10             = _mm256_sub_ps(ix1,jx0);
1679             dy10             = _mm256_sub_ps(iy1,jy0);
1680             dz10             = _mm256_sub_ps(iz1,jz0);
1681             dx11             = _mm256_sub_ps(ix1,jx1);
1682             dy11             = _mm256_sub_ps(iy1,jy1);
1683             dz11             = _mm256_sub_ps(iz1,jz1);
1684             dx12             = _mm256_sub_ps(ix1,jx2);
1685             dy12             = _mm256_sub_ps(iy1,jy2);
1686             dz12             = _mm256_sub_ps(iz1,jz2);
1687             dx20             = _mm256_sub_ps(ix2,jx0);
1688             dy20             = _mm256_sub_ps(iy2,jy0);
1689             dz20             = _mm256_sub_ps(iz2,jz0);
1690             dx21             = _mm256_sub_ps(ix2,jx1);
1691             dy21             = _mm256_sub_ps(iy2,jy1);
1692             dz21             = _mm256_sub_ps(iz2,jz1);
1693             dx22             = _mm256_sub_ps(ix2,jx2);
1694             dy22             = _mm256_sub_ps(iy2,jy2);
1695             dz22             = _mm256_sub_ps(iz2,jz2);
1696
1697             /* Calculate squared distance and things based on it */
1698             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1699             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1700             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1701             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1702             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1703             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1704             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1705             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1706             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1707
1708             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1709             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1710             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1711             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1712             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1713             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1714             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1715             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1716             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1717
1718             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1719             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1720             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1721             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1722             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1723             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1724             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1725             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1726             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1727
1728             fjx0             = _mm256_setzero_ps();
1729             fjy0             = _mm256_setzero_ps();
1730             fjz0             = _mm256_setzero_ps();
1731             fjx1             = _mm256_setzero_ps();
1732             fjy1             = _mm256_setzero_ps();
1733             fjz1             = _mm256_setzero_ps();
1734             fjx2             = _mm256_setzero_ps();
1735             fjy2             = _mm256_setzero_ps();
1736             fjz2             = _mm256_setzero_ps();
1737
1738             /**************************
1739              * CALCULATE INTERACTIONS *
1740              **************************/
1741
1742             r00              = _mm256_mul_ps(rsq00,rinv00);
1743             r00              = _mm256_andnot_ps(dummy_mask,r00);
1744
1745             /* Calculate table index by multiplying r with table scale and truncate to integer */
1746             rt               = _mm256_mul_ps(r00,vftabscale);
1747             vfitab           = _mm256_cvttps_epi32(rt);
1748             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1749             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1750             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1751             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1752             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
1753             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
1754
1755             /* COULOMB ELECTROSTATICS */
1756             velec            = _mm256_mul_ps(qq00,rinv00);
1757             felec            = _mm256_mul_ps(velec,rinvsq00);
1758
1759             /* CUBIC SPLINE TABLE DISPERSION */
1760             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1761                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1762             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1763                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1764             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1765                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1766             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1767                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1768             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1769             Heps             = _mm256_mul_ps(vfeps,H);
1770             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1771             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1772             fvdw6            = _mm256_mul_ps(c6_00,FF);
1773
1774             /* CUBIC SPLINE TABLE REPULSION */
1775             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1776             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1777             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1778                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1779             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1780                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1781             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1782                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1783             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1784                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1785             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1786             Heps             = _mm256_mul_ps(vfeps,H);
1787             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1788             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1789             fvdw12           = _mm256_mul_ps(c12_00,FF);
1790             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1791
1792             fscal            = _mm256_add_ps(felec,fvdw);
1793
1794             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1795
1796             /* Calculate temporary vectorial force */
1797             tx               = _mm256_mul_ps(fscal,dx00);
1798             ty               = _mm256_mul_ps(fscal,dy00);
1799             tz               = _mm256_mul_ps(fscal,dz00);
1800
1801             /* Update vectorial force */
1802             fix0             = _mm256_add_ps(fix0,tx);
1803             fiy0             = _mm256_add_ps(fiy0,ty);
1804             fiz0             = _mm256_add_ps(fiz0,tz);
1805
1806             fjx0             = _mm256_add_ps(fjx0,tx);
1807             fjy0             = _mm256_add_ps(fjy0,ty);
1808             fjz0             = _mm256_add_ps(fjz0,tz);
1809
1810             /**************************
1811              * CALCULATE INTERACTIONS *
1812              **************************/
1813
1814             /* COULOMB ELECTROSTATICS */
1815             velec            = _mm256_mul_ps(qq01,rinv01);
1816             felec            = _mm256_mul_ps(velec,rinvsq01);
1817
1818             fscal            = felec;
1819
1820             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1821
1822             /* Calculate temporary vectorial force */
1823             tx               = _mm256_mul_ps(fscal,dx01);
1824             ty               = _mm256_mul_ps(fscal,dy01);
1825             tz               = _mm256_mul_ps(fscal,dz01);
1826
1827             /* Update vectorial force */
1828             fix0             = _mm256_add_ps(fix0,tx);
1829             fiy0             = _mm256_add_ps(fiy0,ty);
1830             fiz0             = _mm256_add_ps(fiz0,tz);
1831
1832             fjx1             = _mm256_add_ps(fjx1,tx);
1833             fjy1             = _mm256_add_ps(fjy1,ty);
1834             fjz1             = _mm256_add_ps(fjz1,tz);
1835
1836             /**************************
1837              * CALCULATE INTERACTIONS *
1838              **************************/
1839
1840             /* COULOMB ELECTROSTATICS */
1841             velec            = _mm256_mul_ps(qq02,rinv02);
1842             felec            = _mm256_mul_ps(velec,rinvsq02);
1843
1844             fscal            = felec;
1845
1846             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1847
1848             /* Calculate temporary vectorial force */
1849             tx               = _mm256_mul_ps(fscal,dx02);
1850             ty               = _mm256_mul_ps(fscal,dy02);
1851             tz               = _mm256_mul_ps(fscal,dz02);
1852
1853             /* Update vectorial force */
1854             fix0             = _mm256_add_ps(fix0,tx);
1855             fiy0             = _mm256_add_ps(fiy0,ty);
1856             fiz0             = _mm256_add_ps(fiz0,tz);
1857
1858             fjx2             = _mm256_add_ps(fjx2,tx);
1859             fjy2             = _mm256_add_ps(fjy2,ty);
1860             fjz2             = _mm256_add_ps(fjz2,tz);
1861
1862             /**************************
1863              * CALCULATE INTERACTIONS *
1864              **************************/
1865
1866             /* COULOMB ELECTROSTATICS */
1867             velec            = _mm256_mul_ps(qq10,rinv10);
1868             felec            = _mm256_mul_ps(velec,rinvsq10);
1869
1870             fscal            = felec;
1871
1872             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1873
1874             /* Calculate temporary vectorial force */
1875             tx               = _mm256_mul_ps(fscal,dx10);
1876             ty               = _mm256_mul_ps(fscal,dy10);
1877             tz               = _mm256_mul_ps(fscal,dz10);
1878
1879             /* Update vectorial force */
1880             fix1             = _mm256_add_ps(fix1,tx);
1881             fiy1             = _mm256_add_ps(fiy1,ty);
1882             fiz1             = _mm256_add_ps(fiz1,tz);
1883
1884             fjx0             = _mm256_add_ps(fjx0,tx);
1885             fjy0             = _mm256_add_ps(fjy0,ty);
1886             fjz0             = _mm256_add_ps(fjz0,tz);
1887
1888             /**************************
1889              * CALCULATE INTERACTIONS *
1890              **************************/
1891
1892             /* COULOMB ELECTROSTATICS */
1893             velec            = _mm256_mul_ps(qq11,rinv11);
1894             felec            = _mm256_mul_ps(velec,rinvsq11);
1895
1896             fscal            = felec;
1897
1898             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1899
1900             /* Calculate temporary vectorial force */
1901             tx               = _mm256_mul_ps(fscal,dx11);
1902             ty               = _mm256_mul_ps(fscal,dy11);
1903             tz               = _mm256_mul_ps(fscal,dz11);
1904
1905             /* Update vectorial force */
1906             fix1             = _mm256_add_ps(fix1,tx);
1907             fiy1             = _mm256_add_ps(fiy1,ty);
1908             fiz1             = _mm256_add_ps(fiz1,tz);
1909
1910             fjx1             = _mm256_add_ps(fjx1,tx);
1911             fjy1             = _mm256_add_ps(fjy1,ty);
1912             fjz1             = _mm256_add_ps(fjz1,tz);
1913
1914             /**************************
1915              * CALCULATE INTERACTIONS *
1916              **************************/
1917
1918             /* COULOMB ELECTROSTATICS */
1919             velec            = _mm256_mul_ps(qq12,rinv12);
1920             felec            = _mm256_mul_ps(velec,rinvsq12);
1921
1922             fscal            = felec;
1923
1924             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1925
1926             /* Calculate temporary vectorial force */
1927             tx               = _mm256_mul_ps(fscal,dx12);
1928             ty               = _mm256_mul_ps(fscal,dy12);
1929             tz               = _mm256_mul_ps(fscal,dz12);
1930
1931             /* Update vectorial force */
1932             fix1             = _mm256_add_ps(fix1,tx);
1933             fiy1             = _mm256_add_ps(fiy1,ty);
1934             fiz1             = _mm256_add_ps(fiz1,tz);
1935
1936             fjx2             = _mm256_add_ps(fjx2,tx);
1937             fjy2             = _mm256_add_ps(fjy2,ty);
1938             fjz2             = _mm256_add_ps(fjz2,tz);
1939
1940             /**************************
1941              * CALCULATE INTERACTIONS *
1942              **************************/
1943
1944             /* COULOMB ELECTROSTATICS */
1945             velec            = _mm256_mul_ps(qq20,rinv20);
1946             felec            = _mm256_mul_ps(velec,rinvsq20);
1947
1948             fscal            = felec;
1949
1950             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1951
1952             /* Calculate temporary vectorial force */
1953             tx               = _mm256_mul_ps(fscal,dx20);
1954             ty               = _mm256_mul_ps(fscal,dy20);
1955             tz               = _mm256_mul_ps(fscal,dz20);
1956
1957             /* Update vectorial force */
1958             fix2             = _mm256_add_ps(fix2,tx);
1959             fiy2             = _mm256_add_ps(fiy2,ty);
1960             fiz2             = _mm256_add_ps(fiz2,tz);
1961
1962             fjx0             = _mm256_add_ps(fjx0,tx);
1963             fjy0             = _mm256_add_ps(fjy0,ty);
1964             fjz0             = _mm256_add_ps(fjz0,tz);
1965
1966             /**************************
1967              * CALCULATE INTERACTIONS *
1968              **************************/
1969
1970             /* COULOMB ELECTROSTATICS */
1971             velec            = _mm256_mul_ps(qq21,rinv21);
1972             felec            = _mm256_mul_ps(velec,rinvsq21);
1973
1974             fscal            = felec;
1975
1976             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1977
1978             /* Calculate temporary vectorial force */
1979             tx               = _mm256_mul_ps(fscal,dx21);
1980             ty               = _mm256_mul_ps(fscal,dy21);
1981             tz               = _mm256_mul_ps(fscal,dz21);
1982
1983             /* Update vectorial force */
1984             fix2             = _mm256_add_ps(fix2,tx);
1985             fiy2             = _mm256_add_ps(fiy2,ty);
1986             fiz2             = _mm256_add_ps(fiz2,tz);
1987
1988             fjx1             = _mm256_add_ps(fjx1,tx);
1989             fjy1             = _mm256_add_ps(fjy1,ty);
1990             fjz1             = _mm256_add_ps(fjz1,tz);
1991
1992             /**************************
1993              * CALCULATE INTERACTIONS *
1994              **************************/
1995
1996             /* COULOMB ELECTROSTATICS */
1997             velec            = _mm256_mul_ps(qq22,rinv22);
1998             felec            = _mm256_mul_ps(velec,rinvsq22);
1999
2000             fscal            = felec;
2001
2002             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2003
2004             /* Calculate temporary vectorial force */
2005             tx               = _mm256_mul_ps(fscal,dx22);
2006             ty               = _mm256_mul_ps(fscal,dy22);
2007             tz               = _mm256_mul_ps(fscal,dz22);
2008
2009             /* Update vectorial force */
2010             fix2             = _mm256_add_ps(fix2,tx);
2011             fiy2             = _mm256_add_ps(fiy2,ty);
2012             fiz2             = _mm256_add_ps(fiz2,tz);
2013
2014             fjx2             = _mm256_add_ps(fjx2,tx);
2015             fjy2             = _mm256_add_ps(fjy2,ty);
2016             fjz2             = _mm256_add_ps(fjz2,tz);
2017
2018             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2019             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2020             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2021             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2022             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2023             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2024             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2025             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2026
2027             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2028                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2029
2030             /* Inner loop uses 262 flops */
2031         }
2032
2033         /* End of innermost loop */
2034
2035         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2036                                                  f+i_coord_offset,fshift+i_shift_offset);
2037
2038         /* Increment number of inner iterations */
2039         inneriter                  += j_index_end - j_index_start;
2040
2041         /* Outer loop uses 18 flops */
2042     }
2043
2044     /* Increment number of outer iterations */
2045     outeriter        += nri;
2046
2047     /* Update outer/inner flops */
2048
2049     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*262);
2050 }