551b95d5fd7f91a3140512b7e77c172296f6e780
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecEw_VdwCSTab_GeomW3W3_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gmx_math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_single
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            CubicSplineTable
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. to the eight different
72      * jnr indices corresponding to data put in the eight positions in the 256-bit SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrE,jnrF,jnrG,jnrH;
78     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
83     real             rcutoff_scalar;
84     real             *shiftvec,*fshift,*x,*f;
85     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
86     real             scratch[4*DIM];
87     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88     real *           vdwioffsetptr0;
89     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90     real *           vdwioffsetptr1;
91     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92     real *           vdwioffsetptr2;
93     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
95     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
97     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
99     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
101     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
102     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
103     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
104     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
105     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
106     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
107     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
108     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
109     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
110     real             *charge;
111     int              nvdwtype;
112     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113     int              *vdwtype;
114     real             *vdwparam;
115     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
116     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
117     __m256i          vfitab;
118     __m128i          vfitab_lo,vfitab_hi;
119     __m128i          ifour       = _mm_set1_epi32(4);
120     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
121     real             *vftab;
122     __m256i          ewitab;
123     __m128i          ewitab_lo,ewitab_hi;
124     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
125     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
126     real             *ewtab;
127     __m256           dummy_mask,cutoff_mask;
128     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
129     __m256           one     = _mm256_set1_ps(1.0);
130     __m256           two     = _mm256_set1_ps(2.0);
131     x                = xx[0];
132     f                = ff[0];
133
134     nri              = nlist->nri;
135     iinr             = nlist->iinr;
136     jindex           = nlist->jindex;
137     jjnr             = nlist->jjnr;
138     shiftidx         = nlist->shift;
139     gid              = nlist->gid;
140     shiftvec         = fr->shift_vec[0];
141     fshift           = fr->fshift[0];
142     facel            = _mm256_set1_ps(fr->epsfac);
143     charge           = mdatoms->chargeA;
144     nvdwtype         = fr->ntype;
145     vdwparam         = fr->nbfp;
146     vdwtype          = mdatoms->typeA;
147
148     vftab            = kernel_data->table_vdw->data;
149     vftabscale       = _mm256_set1_ps(kernel_data->table_vdw->scale);
150
151     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
152     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff);
153     beta2            = _mm256_mul_ps(beta,beta);
154     beta3            = _mm256_mul_ps(beta,beta2);
155
156     ewtab            = fr->ic->tabq_coul_FDV0;
157     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
158     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
159
160     /* Setup water-specific parameters */
161     inr              = nlist->iinr[0];
162     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
163     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
164     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
165     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
166
167     jq0              = _mm256_set1_ps(charge[inr+0]);
168     jq1              = _mm256_set1_ps(charge[inr+1]);
169     jq2              = _mm256_set1_ps(charge[inr+2]);
170     vdwjidx0A        = 2*vdwtype[inr+0];
171     qq00             = _mm256_mul_ps(iq0,jq0);
172     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
173     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
174     qq01             = _mm256_mul_ps(iq0,jq1);
175     qq02             = _mm256_mul_ps(iq0,jq2);
176     qq10             = _mm256_mul_ps(iq1,jq0);
177     qq11             = _mm256_mul_ps(iq1,jq1);
178     qq12             = _mm256_mul_ps(iq1,jq2);
179     qq20             = _mm256_mul_ps(iq2,jq0);
180     qq21             = _mm256_mul_ps(iq2,jq1);
181     qq22             = _mm256_mul_ps(iq2,jq2);
182
183     /* Avoid stupid compiler warnings */
184     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
185     j_coord_offsetA = 0;
186     j_coord_offsetB = 0;
187     j_coord_offsetC = 0;
188     j_coord_offsetD = 0;
189     j_coord_offsetE = 0;
190     j_coord_offsetF = 0;
191     j_coord_offsetG = 0;
192     j_coord_offsetH = 0;
193
194     outeriter        = 0;
195     inneriter        = 0;
196
197     for(iidx=0;iidx<4*DIM;iidx++)
198     {
199         scratch[iidx] = 0.0;
200     }
201
202     /* Start outer loop over neighborlists */
203     for(iidx=0; iidx<nri; iidx++)
204     {
205         /* Load shift vector for this list */
206         i_shift_offset   = DIM*shiftidx[iidx];
207
208         /* Load limits for loop over neighbors */
209         j_index_start    = jindex[iidx];
210         j_index_end      = jindex[iidx+1];
211
212         /* Get outer coordinate index */
213         inr              = iinr[iidx];
214         i_coord_offset   = DIM*inr;
215
216         /* Load i particle coords and add shift vector */
217         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
218                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
219
220         fix0             = _mm256_setzero_ps();
221         fiy0             = _mm256_setzero_ps();
222         fiz0             = _mm256_setzero_ps();
223         fix1             = _mm256_setzero_ps();
224         fiy1             = _mm256_setzero_ps();
225         fiz1             = _mm256_setzero_ps();
226         fix2             = _mm256_setzero_ps();
227         fiy2             = _mm256_setzero_ps();
228         fiz2             = _mm256_setzero_ps();
229
230         /* Reset potential sums */
231         velecsum         = _mm256_setzero_ps();
232         vvdwsum          = _mm256_setzero_ps();
233
234         /* Start inner kernel loop */
235         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
236         {
237
238             /* Get j neighbor index, and coordinate index */
239             jnrA             = jjnr[jidx];
240             jnrB             = jjnr[jidx+1];
241             jnrC             = jjnr[jidx+2];
242             jnrD             = jjnr[jidx+3];
243             jnrE             = jjnr[jidx+4];
244             jnrF             = jjnr[jidx+5];
245             jnrG             = jjnr[jidx+6];
246             jnrH             = jjnr[jidx+7];
247             j_coord_offsetA  = DIM*jnrA;
248             j_coord_offsetB  = DIM*jnrB;
249             j_coord_offsetC  = DIM*jnrC;
250             j_coord_offsetD  = DIM*jnrD;
251             j_coord_offsetE  = DIM*jnrE;
252             j_coord_offsetF  = DIM*jnrF;
253             j_coord_offsetG  = DIM*jnrG;
254             j_coord_offsetH  = DIM*jnrH;
255
256             /* load j atom coordinates */
257             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
258                                                  x+j_coord_offsetC,x+j_coord_offsetD,
259                                                  x+j_coord_offsetE,x+j_coord_offsetF,
260                                                  x+j_coord_offsetG,x+j_coord_offsetH,
261                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
262
263             /* Calculate displacement vector */
264             dx00             = _mm256_sub_ps(ix0,jx0);
265             dy00             = _mm256_sub_ps(iy0,jy0);
266             dz00             = _mm256_sub_ps(iz0,jz0);
267             dx01             = _mm256_sub_ps(ix0,jx1);
268             dy01             = _mm256_sub_ps(iy0,jy1);
269             dz01             = _mm256_sub_ps(iz0,jz1);
270             dx02             = _mm256_sub_ps(ix0,jx2);
271             dy02             = _mm256_sub_ps(iy0,jy2);
272             dz02             = _mm256_sub_ps(iz0,jz2);
273             dx10             = _mm256_sub_ps(ix1,jx0);
274             dy10             = _mm256_sub_ps(iy1,jy0);
275             dz10             = _mm256_sub_ps(iz1,jz0);
276             dx11             = _mm256_sub_ps(ix1,jx1);
277             dy11             = _mm256_sub_ps(iy1,jy1);
278             dz11             = _mm256_sub_ps(iz1,jz1);
279             dx12             = _mm256_sub_ps(ix1,jx2);
280             dy12             = _mm256_sub_ps(iy1,jy2);
281             dz12             = _mm256_sub_ps(iz1,jz2);
282             dx20             = _mm256_sub_ps(ix2,jx0);
283             dy20             = _mm256_sub_ps(iy2,jy0);
284             dz20             = _mm256_sub_ps(iz2,jz0);
285             dx21             = _mm256_sub_ps(ix2,jx1);
286             dy21             = _mm256_sub_ps(iy2,jy1);
287             dz21             = _mm256_sub_ps(iz2,jz1);
288             dx22             = _mm256_sub_ps(ix2,jx2);
289             dy22             = _mm256_sub_ps(iy2,jy2);
290             dz22             = _mm256_sub_ps(iz2,jz2);
291
292             /* Calculate squared distance and things based on it */
293             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
294             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
295             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
296             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
297             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
298             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
299             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
300             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
301             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
302
303             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
304             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
305             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
306             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
307             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
308             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
309             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
310             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
311             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
312
313             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
314             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
315             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
316             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
317             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
318             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
319             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
320             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
321             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
322
323             fjx0             = _mm256_setzero_ps();
324             fjy0             = _mm256_setzero_ps();
325             fjz0             = _mm256_setzero_ps();
326             fjx1             = _mm256_setzero_ps();
327             fjy1             = _mm256_setzero_ps();
328             fjz1             = _mm256_setzero_ps();
329             fjx2             = _mm256_setzero_ps();
330             fjy2             = _mm256_setzero_ps();
331             fjz2             = _mm256_setzero_ps();
332
333             /**************************
334              * CALCULATE INTERACTIONS *
335              **************************/
336
337             r00              = _mm256_mul_ps(rsq00,rinv00);
338
339             /* Calculate table index by multiplying r with table scale and truncate to integer */
340             rt               = _mm256_mul_ps(r00,vftabscale);
341             vfitab           = _mm256_cvttps_epi32(rt);
342             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
343             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
344             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
345             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
346             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
347             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
348
349             /* EWALD ELECTROSTATICS */
350             
351             /* Analytical PME correction */
352             zeta2            = _mm256_mul_ps(beta2,rsq00);
353             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
354             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
355             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
356             felec            = _mm256_mul_ps(qq00,felec);
357             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
358             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
359             velec            = _mm256_sub_ps(rinv00,pmecorrV);
360             velec            = _mm256_mul_ps(qq00,velec);
361             
362             /* CUBIC SPLINE TABLE DISPERSION */
363             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
364                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
365             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
366                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
367             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
368                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
369             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
370                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
371             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
372             Heps             = _mm256_mul_ps(vfeps,H);
373             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
374             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
375             vvdw6            = _mm256_mul_ps(c6_00,VV);
376             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
377             fvdw6            = _mm256_mul_ps(c6_00,FF);
378
379             /* CUBIC SPLINE TABLE REPULSION */
380             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
381             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
382             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
383                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
384             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
385                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
386             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
387                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
388             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
389                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
390             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
391             Heps             = _mm256_mul_ps(vfeps,H);
392             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
393             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
394             vvdw12           = _mm256_mul_ps(c12_00,VV);
395             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
396             fvdw12           = _mm256_mul_ps(c12_00,FF);
397             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
398             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
399
400             /* Update potential sum for this i atom from the interaction with this j atom. */
401             velecsum         = _mm256_add_ps(velecsum,velec);
402             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
403
404             fscal            = _mm256_add_ps(felec,fvdw);
405
406             /* Calculate temporary vectorial force */
407             tx               = _mm256_mul_ps(fscal,dx00);
408             ty               = _mm256_mul_ps(fscal,dy00);
409             tz               = _mm256_mul_ps(fscal,dz00);
410
411             /* Update vectorial force */
412             fix0             = _mm256_add_ps(fix0,tx);
413             fiy0             = _mm256_add_ps(fiy0,ty);
414             fiz0             = _mm256_add_ps(fiz0,tz);
415
416             fjx0             = _mm256_add_ps(fjx0,tx);
417             fjy0             = _mm256_add_ps(fjy0,ty);
418             fjz0             = _mm256_add_ps(fjz0,tz);
419
420             /**************************
421              * CALCULATE INTERACTIONS *
422              **************************/
423
424             r01              = _mm256_mul_ps(rsq01,rinv01);
425
426             /* EWALD ELECTROSTATICS */
427             
428             /* Analytical PME correction */
429             zeta2            = _mm256_mul_ps(beta2,rsq01);
430             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
431             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
432             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
433             felec            = _mm256_mul_ps(qq01,felec);
434             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
435             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
436             velec            = _mm256_sub_ps(rinv01,pmecorrV);
437             velec            = _mm256_mul_ps(qq01,velec);
438             
439             /* Update potential sum for this i atom from the interaction with this j atom. */
440             velecsum         = _mm256_add_ps(velecsum,velec);
441
442             fscal            = felec;
443
444             /* Calculate temporary vectorial force */
445             tx               = _mm256_mul_ps(fscal,dx01);
446             ty               = _mm256_mul_ps(fscal,dy01);
447             tz               = _mm256_mul_ps(fscal,dz01);
448
449             /* Update vectorial force */
450             fix0             = _mm256_add_ps(fix0,tx);
451             fiy0             = _mm256_add_ps(fiy0,ty);
452             fiz0             = _mm256_add_ps(fiz0,tz);
453
454             fjx1             = _mm256_add_ps(fjx1,tx);
455             fjy1             = _mm256_add_ps(fjy1,ty);
456             fjz1             = _mm256_add_ps(fjz1,tz);
457
458             /**************************
459              * CALCULATE INTERACTIONS *
460              **************************/
461
462             r02              = _mm256_mul_ps(rsq02,rinv02);
463
464             /* EWALD ELECTROSTATICS */
465             
466             /* Analytical PME correction */
467             zeta2            = _mm256_mul_ps(beta2,rsq02);
468             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
469             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
470             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
471             felec            = _mm256_mul_ps(qq02,felec);
472             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
473             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
474             velec            = _mm256_sub_ps(rinv02,pmecorrV);
475             velec            = _mm256_mul_ps(qq02,velec);
476             
477             /* Update potential sum for this i atom from the interaction with this j atom. */
478             velecsum         = _mm256_add_ps(velecsum,velec);
479
480             fscal            = felec;
481
482             /* Calculate temporary vectorial force */
483             tx               = _mm256_mul_ps(fscal,dx02);
484             ty               = _mm256_mul_ps(fscal,dy02);
485             tz               = _mm256_mul_ps(fscal,dz02);
486
487             /* Update vectorial force */
488             fix0             = _mm256_add_ps(fix0,tx);
489             fiy0             = _mm256_add_ps(fiy0,ty);
490             fiz0             = _mm256_add_ps(fiz0,tz);
491
492             fjx2             = _mm256_add_ps(fjx2,tx);
493             fjy2             = _mm256_add_ps(fjy2,ty);
494             fjz2             = _mm256_add_ps(fjz2,tz);
495
496             /**************************
497              * CALCULATE INTERACTIONS *
498              **************************/
499
500             r10              = _mm256_mul_ps(rsq10,rinv10);
501
502             /* EWALD ELECTROSTATICS */
503             
504             /* Analytical PME correction */
505             zeta2            = _mm256_mul_ps(beta2,rsq10);
506             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
507             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
508             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
509             felec            = _mm256_mul_ps(qq10,felec);
510             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
511             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
512             velec            = _mm256_sub_ps(rinv10,pmecorrV);
513             velec            = _mm256_mul_ps(qq10,velec);
514             
515             /* Update potential sum for this i atom from the interaction with this j atom. */
516             velecsum         = _mm256_add_ps(velecsum,velec);
517
518             fscal            = felec;
519
520             /* Calculate temporary vectorial force */
521             tx               = _mm256_mul_ps(fscal,dx10);
522             ty               = _mm256_mul_ps(fscal,dy10);
523             tz               = _mm256_mul_ps(fscal,dz10);
524
525             /* Update vectorial force */
526             fix1             = _mm256_add_ps(fix1,tx);
527             fiy1             = _mm256_add_ps(fiy1,ty);
528             fiz1             = _mm256_add_ps(fiz1,tz);
529
530             fjx0             = _mm256_add_ps(fjx0,tx);
531             fjy0             = _mm256_add_ps(fjy0,ty);
532             fjz0             = _mm256_add_ps(fjz0,tz);
533
534             /**************************
535              * CALCULATE INTERACTIONS *
536              **************************/
537
538             r11              = _mm256_mul_ps(rsq11,rinv11);
539
540             /* EWALD ELECTROSTATICS */
541             
542             /* Analytical PME correction */
543             zeta2            = _mm256_mul_ps(beta2,rsq11);
544             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
545             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
546             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
547             felec            = _mm256_mul_ps(qq11,felec);
548             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
549             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
550             velec            = _mm256_sub_ps(rinv11,pmecorrV);
551             velec            = _mm256_mul_ps(qq11,velec);
552             
553             /* Update potential sum for this i atom from the interaction with this j atom. */
554             velecsum         = _mm256_add_ps(velecsum,velec);
555
556             fscal            = felec;
557
558             /* Calculate temporary vectorial force */
559             tx               = _mm256_mul_ps(fscal,dx11);
560             ty               = _mm256_mul_ps(fscal,dy11);
561             tz               = _mm256_mul_ps(fscal,dz11);
562
563             /* Update vectorial force */
564             fix1             = _mm256_add_ps(fix1,tx);
565             fiy1             = _mm256_add_ps(fiy1,ty);
566             fiz1             = _mm256_add_ps(fiz1,tz);
567
568             fjx1             = _mm256_add_ps(fjx1,tx);
569             fjy1             = _mm256_add_ps(fjy1,ty);
570             fjz1             = _mm256_add_ps(fjz1,tz);
571
572             /**************************
573              * CALCULATE INTERACTIONS *
574              **************************/
575
576             r12              = _mm256_mul_ps(rsq12,rinv12);
577
578             /* EWALD ELECTROSTATICS */
579             
580             /* Analytical PME correction */
581             zeta2            = _mm256_mul_ps(beta2,rsq12);
582             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
583             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
584             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
585             felec            = _mm256_mul_ps(qq12,felec);
586             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
587             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
588             velec            = _mm256_sub_ps(rinv12,pmecorrV);
589             velec            = _mm256_mul_ps(qq12,velec);
590             
591             /* Update potential sum for this i atom from the interaction with this j atom. */
592             velecsum         = _mm256_add_ps(velecsum,velec);
593
594             fscal            = felec;
595
596             /* Calculate temporary vectorial force */
597             tx               = _mm256_mul_ps(fscal,dx12);
598             ty               = _mm256_mul_ps(fscal,dy12);
599             tz               = _mm256_mul_ps(fscal,dz12);
600
601             /* Update vectorial force */
602             fix1             = _mm256_add_ps(fix1,tx);
603             fiy1             = _mm256_add_ps(fiy1,ty);
604             fiz1             = _mm256_add_ps(fiz1,tz);
605
606             fjx2             = _mm256_add_ps(fjx2,tx);
607             fjy2             = _mm256_add_ps(fjy2,ty);
608             fjz2             = _mm256_add_ps(fjz2,tz);
609
610             /**************************
611              * CALCULATE INTERACTIONS *
612              **************************/
613
614             r20              = _mm256_mul_ps(rsq20,rinv20);
615
616             /* EWALD ELECTROSTATICS */
617             
618             /* Analytical PME correction */
619             zeta2            = _mm256_mul_ps(beta2,rsq20);
620             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
621             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
622             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
623             felec            = _mm256_mul_ps(qq20,felec);
624             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
625             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
626             velec            = _mm256_sub_ps(rinv20,pmecorrV);
627             velec            = _mm256_mul_ps(qq20,velec);
628             
629             /* Update potential sum for this i atom from the interaction with this j atom. */
630             velecsum         = _mm256_add_ps(velecsum,velec);
631
632             fscal            = felec;
633
634             /* Calculate temporary vectorial force */
635             tx               = _mm256_mul_ps(fscal,dx20);
636             ty               = _mm256_mul_ps(fscal,dy20);
637             tz               = _mm256_mul_ps(fscal,dz20);
638
639             /* Update vectorial force */
640             fix2             = _mm256_add_ps(fix2,tx);
641             fiy2             = _mm256_add_ps(fiy2,ty);
642             fiz2             = _mm256_add_ps(fiz2,tz);
643
644             fjx0             = _mm256_add_ps(fjx0,tx);
645             fjy0             = _mm256_add_ps(fjy0,ty);
646             fjz0             = _mm256_add_ps(fjz0,tz);
647
648             /**************************
649              * CALCULATE INTERACTIONS *
650              **************************/
651
652             r21              = _mm256_mul_ps(rsq21,rinv21);
653
654             /* EWALD ELECTROSTATICS */
655             
656             /* Analytical PME correction */
657             zeta2            = _mm256_mul_ps(beta2,rsq21);
658             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
659             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
660             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
661             felec            = _mm256_mul_ps(qq21,felec);
662             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
663             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
664             velec            = _mm256_sub_ps(rinv21,pmecorrV);
665             velec            = _mm256_mul_ps(qq21,velec);
666             
667             /* Update potential sum for this i atom from the interaction with this j atom. */
668             velecsum         = _mm256_add_ps(velecsum,velec);
669
670             fscal            = felec;
671
672             /* Calculate temporary vectorial force */
673             tx               = _mm256_mul_ps(fscal,dx21);
674             ty               = _mm256_mul_ps(fscal,dy21);
675             tz               = _mm256_mul_ps(fscal,dz21);
676
677             /* Update vectorial force */
678             fix2             = _mm256_add_ps(fix2,tx);
679             fiy2             = _mm256_add_ps(fiy2,ty);
680             fiz2             = _mm256_add_ps(fiz2,tz);
681
682             fjx1             = _mm256_add_ps(fjx1,tx);
683             fjy1             = _mm256_add_ps(fjy1,ty);
684             fjz1             = _mm256_add_ps(fjz1,tz);
685
686             /**************************
687              * CALCULATE INTERACTIONS *
688              **************************/
689
690             r22              = _mm256_mul_ps(rsq22,rinv22);
691
692             /* EWALD ELECTROSTATICS */
693             
694             /* Analytical PME correction */
695             zeta2            = _mm256_mul_ps(beta2,rsq22);
696             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
697             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
698             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
699             felec            = _mm256_mul_ps(qq22,felec);
700             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
701             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
702             velec            = _mm256_sub_ps(rinv22,pmecorrV);
703             velec            = _mm256_mul_ps(qq22,velec);
704             
705             /* Update potential sum for this i atom from the interaction with this j atom. */
706             velecsum         = _mm256_add_ps(velecsum,velec);
707
708             fscal            = felec;
709
710             /* Calculate temporary vectorial force */
711             tx               = _mm256_mul_ps(fscal,dx22);
712             ty               = _mm256_mul_ps(fscal,dy22);
713             tz               = _mm256_mul_ps(fscal,dz22);
714
715             /* Update vectorial force */
716             fix2             = _mm256_add_ps(fix2,tx);
717             fiy2             = _mm256_add_ps(fiy2,ty);
718             fiz2             = _mm256_add_ps(fiz2,tz);
719
720             fjx2             = _mm256_add_ps(fjx2,tx);
721             fjy2             = _mm256_add_ps(fjy2,ty);
722             fjz2             = _mm256_add_ps(fjz2,tz);
723
724             fjptrA             = f+j_coord_offsetA;
725             fjptrB             = f+j_coord_offsetB;
726             fjptrC             = f+j_coord_offsetC;
727             fjptrD             = f+j_coord_offsetD;
728             fjptrE             = f+j_coord_offsetE;
729             fjptrF             = f+j_coord_offsetF;
730             fjptrG             = f+j_coord_offsetG;
731             fjptrH             = f+j_coord_offsetH;
732
733             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
734                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
735
736             /* Inner loop uses 790 flops */
737         }
738
739         if(jidx<j_index_end)
740         {
741
742             /* Get j neighbor index, and coordinate index */
743             jnrlistA         = jjnr[jidx];
744             jnrlistB         = jjnr[jidx+1];
745             jnrlistC         = jjnr[jidx+2];
746             jnrlistD         = jjnr[jidx+3];
747             jnrlistE         = jjnr[jidx+4];
748             jnrlistF         = jjnr[jidx+5];
749             jnrlistG         = jjnr[jidx+6];
750             jnrlistH         = jjnr[jidx+7];
751             /* Sign of each element will be negative for non-real atoms.
752              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
753              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
754              */
755             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
756                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
757                                             
758             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
759             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
760             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
761             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
762             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
763             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
764             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
765             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
766             j_coord_offsetA  = DIM*jnrA;
767             j_coord_offsetB  = DIM*jnrB;
768             j_coord_offsetC  = DIM*jnrC;
769             j_coord_offsetD  = DIM*jnrD;
770             j_coord_offsetE  = DIM*jnrE;
771             j_coord_offsetF  = DIM*jnrF;
772             j_coord_offsetG  = DIM*jnrG;
773             j_coord_offsetH  = DIM*jnrH;
774
775             /* load j atom coordinates */
776             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
777                                                  x+j_coord_offsetC,x+j_coord_offsetD,
778                                                  x+j_coord_offsetE,x+j_coord_offsetF,
779                                                  x+j_coord_offsetG,x+j_coord_offsetH,
780                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
781
782             /* Calculate displacement vector */
783             dx00             = _mm256_sub_ps(ix0,jx0);
784             dy00             = _mm256_sub_ps(iy0,jy0);
785             dz00             = _mm256_sub_ps(iz0,jz0);
786             dx01             = _mm256_sub_ps(ix0,jx1);
787             dy01             = _mm256_sub_ps(iy0,jy1);
788             dz01             = _mm256_sub_ps(iz0,jz1);
789             dx02             = _mm256_sub_ps(ix0,jx2);
790             dy02             = _mm256_sub_ps(iy0,jy2);
791             dz02             = _mm256_sub_ps(iz0,jz2);
792             dx10             = _mm256_sub_ps(ix1,jx0);
793             dy10             = _mm256_sub_ps(iy1,jy0);
794             dz10             = _mm256_sub_ps(iz1,jz0);
795             dx11             = _mm256_sub_ps(ix1,jx1);
796             dy11             = _mm256_sub_ps(iy1,jy1);
797             dz11             = _mm256_sub_ps(iz1,jz1);
798             dx12             = _mm256_sub_ps(ix1,jx2);
799             dy12             = _mm256_sub_ps(iy1,jy2);
800             dz12             = _mm256_sub_ps(iz1,jz2);
801             dx20             = _mm256_sub_ps(ix2,jx0);
802             dy20             = _mm256_sub_ps(iy2,jy0);
803             dz20             = _mm256_sub_ps(iz2,jz0);
804             dx21             = _mm256_sub_ps(ix2,jx1);
805             dy21             = _mm256_sub_ps(iy2,jy1);
806             dz21             = _mm256_sub_ps(iz2,jz1);
807             dx22             = _mm256_sub_ps(ix2,jx2);
808             dy22             = _mm256_sub_ps(iy2,jy2);
809             dz22             = _mm256_sub_ps(iz2,jz2);
810
811             /* Calculate squared distance and things based on it */
812             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
813             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
814             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
815             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
816             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
817             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
818             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
819             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
820             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
821
822             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
823             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
824             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
825             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
826             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
827             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
828             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
829             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
830             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
831
832             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
833             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
834             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
835             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
836             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
837             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
838             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
839             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
840             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
841
842             fjx0             = _mm256_setzero_ps();
843             fjy0             = _mm256_setzero_ps();
844             fjz0             = _mm256_setzero_ps();
845             fjx1             = _mm256_setzero_ps();
846             fjy1             = _mm256_setzero_ps();
847             fjz1             = _mm256_setzero_ps();
848             fjx2             = _mm256_setzero_ps();
849             fjy2             = _mm256_setzero_ps();
850             fjz2             = _mm256_setzero_ps();
851
852             /**************************
853              * CALCULATE INTERACTIONS *
854              **************************/
855
856             r00              = _mm256_mul_ps(rsq00,rinv00);
857             r00              = _mm256_andnot_ps(dummy_mask,r00);
858
859             /* Calculate table index by multiplying r with table scale and truncate to integer */
860             rt               = _mm256_mul_ps(r00,vftabscale);
861             vfitab           = _mm256_cvttps_epi32(rt);
862             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
863             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
864             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
865             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
866             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
867             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
868
869             /* EWALD ELECTROSTATICS */
870             
871             /* Analytical PME correction */
872             zeta2            = _mm256_mul_ps(beta2,rsq00);
873             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
874             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
875             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
876             felec            = _mm256_mul_ps(qq00,felec);
877             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
878             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
879             velec            = _mm256_sub_ps(rinv00,pmecorrV);
880             velec            = _mm256_mul_ps(qq00,velec);
881             
882             /* CUBIC SPLINE TABLE DISPERSION */
883             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
884                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
885             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
886                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
887             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
888                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
889             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
890                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
891             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
892             Heps             = _mm256_mul_ps(vfeps,H);
893             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
894             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
895             vvdw6            = _mm256_mul_ps(c6_00,VV);
896             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
897             fvdw6            = _mm256_mul_ps(c6_00,FF);
898
899             /* CUBIC SPLINE TABLE REPULSION */
900             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
901             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
902             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
903                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
904             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
905                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
906             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
907                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
908             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
909                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
910             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
911             Heps             = _mm256_mul_ps(vfeps,H);
912             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
913             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
914             vvdw12           = _mm256_mul_ps(c12_00,VV);
915             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
916             fvdw12           = _mm256_mul_ps(c12_00,FF);
917             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
918             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
919
920             /* Update potential sum for this i atom from the interaction with this j atom. */
921             velec            = _mm256_andnot_ps(dummy_mask,velec);
922             velecsum         = _mm256_add_ps(velecsum,velec);
923             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
924             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
925
926             fscal            = _mm256_add_ps(felec,fvdw);
927
928             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
929
930             /* Calculate temporary vectorial force */
931             tx               = _mm256_mul_ps(fscal,dx00);
932             ty               = _mm256_mul_ps(fscal,dy00);
933             tz               = _mm256_mul_ps(fscal,dz00);
934
935             /* Update vectorial force */
936             fix0             = _mm256_add_ps(fix0,tx);
937             fiy0             = _mm256_add_ps(fiy0,ty);
938             fiz0             = _mm256_add_ps(fiz0,tz);
939
940             fjx0             = _mm256_add_ps(fjx0,tx);
941             fjy0             = _mm256_add_ps(fjy0,ty);
942             fjz0             = _mm256_add_ps(fjz0,tz);
943
944             /**************************
945              * CALCULATE INTERACTIONS *
946              **************************/
947
948             r01              = _mm256_mul_ps(rsq01,rinv01);
949             r01              = _mm256_andnot_ps(dummy_mask,r01);
950
951             /* EWALD ELECTROSTATICS */
952             
953             /* Analytical PME correction */
954             zeta2            = _mm256_mul_ps(beta2,rsq01);
955             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
956             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
957             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
958             felec            = _mm256_mul_ps(qq01,felec);
959             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
960             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
961             velec            = _mm256_sub_ps(rinv01,pmecorrV);
962             velec            = _mm256_mul_ps(qq01,velec);
963             
964             /* Update potential sum for this i atom from the interaction with this j atom. */
965             velec            = _mm256_andnot_ps(dummy_mask,velec);
966             velecsum         = _mm256_add_ps(velecsum,velec);
967
968             fscal            = felec;
969
970             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
971
972             /* Calculate temporary vectorial force */
973             tx               = _mm256_mul_ps(fscal,dx01);
974             ty               = _mm256_mul_ps(fscal,dy01);
975             tz               = _mm256_mul_ps(fscal,dz01);
976
977             /* Update vectorial force */
978             fix0             = _mm256_add_ps(fix0,tx);
979             fiy0             = _mm256_add_ps(fiy0,ty);
980             fiz0             = _mm256_add_ps(fiz0,tz);
981
982             fjx1             = _mm256_add_ps(fjx1,tx);
983             fjy1             = _mm256_add_ps(fjy1,ty);
984             fjz1             = _mm256_add_ps(fjz1,tz);
985
986             /**************************
987              * CALCULATE INTERACTIONS *
988              **************************/
989
990             r02              = _mm256_mul_ps(rsq02,rinv02);
991             r02              = _mm256_andnot_ps(dummy_mask,r02);
992
993             /* EWALD ELECTROSTATICS */
994             
995             /* Analytical PME correction */
996             zeta2            = _mm256_mul_ps(beta2,rsq02);
997             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
998             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
999             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1000             felec            = _mm256_mul_ps(qq02,felec);
1001             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1002             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1003             velec            = _mm256_sub_ps(rinv02,pmecorrV);
1004             velec            = _mm256_mul_ps(qq02,velec);
1005             
1006             /* Update potential sum for this i atom from the interaction with this j atom. */
1007             velec            = _mm256_andnot_ps(dummy_mask,velec);
1008             velecsum         = _mm256_add_ps(velecsum,velec);
1009
1010             fscal            = felec;
1011
1012             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1013
1014             /* Calculate temporary vectorial force */
1015             tx               = _mm256_mul_ps(fscal,dx02);
1016             ty               = _mm256_mul_ps(fscal,dy02);
1017             tz               = _mm256_mul_ps(fscal,dz02);
1018
1019             /* Update vectorial force */
1020             fix0             = _mm256_add_ps(fix0,tx);
1021             fiy0             = _mm256_add_ps(fiy0,ty);
1022             fiz0             = _mm256_add_ps(fiz0,tz);
1023
1024             fjx2             = _mm256_add_ps(fjx2,tx);
1025             fjy2             = _mm256_add_ps(fjy2,ty);
1026             fjz2             = _mm256_add_ps(fjz2,tz);
1027
1028             /**************************
1029              * CALCULATE INTERACTIONS *
1030              **************************/
1031
1032             r10              = _mm256_mul_ps(rsq10,rinv10);
1033             r10              = _mm256_andnot_ps(dummy_mask,r10);
1034
1035             /* EWALD ELECTROSTATICS */
1036             
1037             /* Analytical PME correction */
1038             zeta2            = _mm256_mul_ps(beta2,rsq10);
1039             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
1040             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1041             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1042             felec            = _mm256_mul_ps(qq10,felec);
1043             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1044             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1045             velec            = _mm256_sub_ps(rinv10,pmecorrV);
1046             velec            = _mm256_mul_ps(qq10,velec);
1047             
1048             /* Update potential sum for this i atom from the interaction with this j atom. */
1049             velec            = _mm256_andnot_ps(dummy_mask,velec);
1050             velecsum         = _mm256_add_ps(velecsum,velec);
1051
1052             fscal            = felec;
1053
1054             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1055
1056             /* Calculate temporary vectorial force */
1057             tx               = _mm256_mul_ps(fscal,dx10);
1058             ty               = _mm256_mul_ps(fscal,dy10);
1059             tz               = _mm256_mul_ps(fscal,dz10);
1060
1061             /* Update vectorial force */
1062             fix1             = _mm256_add_ps(fix1,tx);
1063             fiy1             = _mm256_add_ps(fiy1,ty);
1064             fiz1             = _mm256_add_ps(fiz1,tz);
1065
1066             fjx0             = _mm256_add_ps(fjx0,tx);
1067             fjy0             = _mm256_add_ps(fjy0,ty);
1068             fjz0             = _mm256_add_ps(fjz0,tz);
1069
1070             /**************************
1071              * CALCULATE INTERACTIONS *
1072              **************************/
1073
1074             r11              = _mm256_mul_ps(rsq11,rinv11);
1075             r11              = _mm256_andnot_ps(dummy_mask,r11);
1076
1077             /* EWALD ELECTROSTATICS */
1078             
1079             /* Analytical PME correction */
1080             zeta2            = _mm256_mul_ps(beta2,rsq11);
1081             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1082             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1083             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1084             felec            = _mm256_mul_ps(qq11,felec);
1085             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1086             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1087             velec            = _mm256_sub_ps(rinv11,pmecorrV);
1088             velec            = _mm256_mul_ps(qq11,velec);
1089             
1090             /* Update potential sum for this i atom from the interaction with this j atom. */
1091             velec            = _mm256_andnot_ps(dummy_mask,velec);
1092             velecsum         = _mm256_add_ps(velecsum,velec);
1093
1094             fscal            = felec;
1095
1096             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1097
1098             /* Calculate temporary vectorial force */
1099             tx               = _mm256_mul_ps(fscal,dx11);
1100             ty               = _mm256_mul_ps(fscal,dy11);
1101             tz               = _mm256_mul_ps(fscal,dz11);
1102
1103             /* Update vectorial force */
1104             fix1             = _mm256_add_ps(fix1,tx);
1105             fiy1             = _mm256_add_ps(fiy1,ty);
1106             fiz1             = _mm256_add_ps(fiz1,tz);
1107
1108             fjx1             = _mm256_add_ps(fjx1,tx);
1109             fjy1             = _mm256_add_ps(fjy1,ty);
1110             fjz1             = _mm256_add_ps(fjz1,tz);
1111
1112             /**************************
1113              * CALCULATE INTERACTIONS *
1114              **************************/
1115
1116             r12              = _mm256_mul_ps(rsq12,rinv12);
1117             r12              = _mm256_andnot_ps(dummy_mask,r12);
1118
1119             /* EWALD ELECTROSTATICS */
1120             
1121             /* Analytical PME correction */
1122             zeta2            = _mm256_mul_ps(beta2,rsq12);
1123             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1124             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1125             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1126             felec            = _mm256_mul_ps(qq12,felec);
1127             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1128             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1129             velec            = _mm256_sub_ps(rinv12,pmecorrV);
1130             velec            = _mm256_mul_ps(qq12,velec);
1131             
1132             /* Update potential sum for this i atom from the interaction with this j atom. */
1133             velec            = _mm256_andnot_ps(dummy_mask,velec);
1134             velecsum         = _mm256_add_ps(velecsum,velec);
1135
1136             fscal            = felec;
1137
1138             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1139
1140             /* Calculate temporary vectorial force */
1141             tx               = _mm256_mul_ps(fscal,dx12);
1142             ty               = _mm256_mul_ps(fscal,dy12);
1143             tz               = _mm256_mul_ps(fscal,dz12);
1144
1145             /* Update vectorial force */
1146             fix1             = _mm256_add_ps(fix1,tx);
1147             fiy1             = _mm256_add_ps(fiy1,ty);
1148             fiz1             = _mm256_add_ps(fiz1,tz);
1149
1150             fjx2             = _mm256_add_ps(fjx2,tx);
1151             fjy2             = _mm256_add_ps(fjy2,ty);
1152             fjz2             = _mm256_add_ps(fjz2,tz);
1153
1154             /**************************
1155              * CALCULATE INTERACTIONS *
1156              **************************/
1157
1158             r20              = _mm256_mul_ps(rsq20,rinv20);
1159             r20              = _mm256_andnot_ps(dummy_mask,r20);
1160
1161             /* EWALD ELECTROSTATICS */
1162             
1163             /* Analytical PME correction */
1164             zeta2            = _mm256_mul_ps(beta2,rsq20);
1165             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1166             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1167             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1168             felec            = _mm256_mul_ps(qq20,felec);
1169             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1170             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1171             velec            = _mm256_sub_ps(rinv20,pmecorrV);
1172             velec            = _mm256_mul_ps(qq20,velec);
1173             
1174             /* Update potential sum for this i atom from the interaction with this j atom. */
1175             velec            = _mm256_andnot_ps(dummy_mask,velec);
1176             velecsum         = _mm256_add_ps(velecsum,velec);
1177
1178             fscal            = felec;
1179
1180             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1181
1182             /* Calculate temporary vectorial force */
1183             tx               = _mm256_mul_ps(fscal,dx20);
1184             ty               = _mm256_mul_ps(fscal,dy20);
1185             tz               = _mm256_mul_ps(fscal,dz20);
1186
1187             /* Update vectorial force */
1188             fix2             = _mm256_add_ps(fix2,tx);
1189             fiy2             = _mm256_add_ps(fiy2,ty);
1190             fiz2             = _mm256_add_ps(fiz2,tz);
1191
1192             fjx0             = _mm256_add_ps(fjx0,tx);
1193             fjy0             = _mm256_add_ps(fjy0,ty);
1194             fjz0             = _mm256_add_ps(fjz0,tz);
1195
1196             /**************************
1197              * CALCULATE INTERACTIONS *
1198              **************************/
1199
1200             r21              = _mm256_mul_ps(rsq21,rinv21);
1201             r21              = _mm256_andnot_ps(dummy_mask,r21);
1202
1203             /* EWALD ELECTROSTATICS */
1204             
1205             /* Analytical PME correction */
1206             zeta2            = _mm256_mul_ps(beta2,rsq21);
1207             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1208             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1209             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1210             felec            = _mm256_mul_ps(qq21,felec);
1211             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1212             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1213             velec            = _mm256_sub_ps(rinv21,pmecorrV);
1214             velec            = _mm256_mul_ps(qq21,velec);
1215             
1216             /* Update potential sum for this i atom from the interaction with this j atom. */
1217             velec            = _mm256_andnot_ps(dummy_mask,velec);
1218             velecsum         = _mm256_add_ps(velecsum,velec);
1219
1220             fscal            = felec;
1221
1222             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1223
1224             /* Calculate temporary vectorial force */
1225             tx               = _mm256_mul_ps(fscal,dx21);
1226             ty               = _mm256_mul_ps(fscal,dy21);
1227             tz               = _mm256_mul_ps(fscal,dz21);
1228
1229             /* Update vectorial force */
1230             fix2             = _mm256_add_ps(fix2,tx);
1231             fiy2             = _mm256_add_ps(fiy2,ty);
1232             fiz2             = _mm256_add_ps(fiz2,tz);
1233
1234             fjx1             = _mm256_add_ps(fjx1,tx);
1235             fjy1             = _mm256_add_ps(fjy1,ty);
1236             fjz1             = _mm256_add_ps(fjz1,tz);
1237
1238             /**************************
1239              * CALCULATE INTERACTIONS *
1240              **************************/
1241
1242             r22              = _mm256_mul_ps(rsq22,rinv22);
1243             r22              = _mm256_andnot_ps(dummy_mask,r22);
1244
1245             /* EWALD ELECTROSTATICS */
1246             
1247             /* Analytical PME correction */
1248             zeta2            = _mm256_mul_ps(beta2,rsq22);
1249             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1250             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1251             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1252             felec            = _mm256_mul_ps(qq22,felec);
1253             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1254             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1255             velec            = _mm256_sub_ps(rinv22,pmecorrV);
1256             velec            = _mm256_mul_ps(qq22,velec);
1257             
1258             /* Update potential sum for this i atom from the interaction with this j atom. */
1259             velec            = _mm256_andnot_ps(dummy_mask,velec);
1260             velecsum         = _mm256_add_ps(velecsum,velec);
1261
1262             fscal            = felec;
1263
1264             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1265
1266             /* Calculate temporary vectorial force */
1267             tx               = _mm256_mul_ps(fscal,dx22);
1268             ty               = _mm256_mul_ps(fscal,dy22);
1269             tz               = _mm256_mul_ps(fscal,dz22);
1270
1271             /* Update vectorial force */
1272             fix2             = _mm256_add_ps(fix2,tx);
1273             fiy2             = _mm256_add_ps(fiy2,ty);
1274             fiz2             = _mm256_add_ps(fiz2,tz);
1275
1276             fjx2             = _mm256_add_ps(fjx2,tx);
1277             fjy2             = _mm256_add_ps(fjy2,ty);
1278             fjz2             = _mm256_add_ps(fjz2,tz);
1279
1280             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1281             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1282             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1283             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1284             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1285             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1286             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1287             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1288
1289             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1290                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1291
1292             /* Inner loop uses 799 flops */
1293         }
1294
1295         /* End of innermost loop */
1296
1297         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1298                                                  f+i_coord_offset,fshift+i_shift_offset);
1299
1300         ggid                        = gid[iidx];
1301         /* Update potential energies */
1302         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1303         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1304
1305         /* Increment number of inner iterations */
1306         inneriter                  += j_index_end - j_index_start;
1307
1308         /* Outer loop uses 20 flops */
1309     }
1310
1311     /* Increment number of outer iterations */
1312     outeriter        += nri;
1313
1314     /* Update outer/inner flops */
1315
1316     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*799);
1317 }
1318 /*
1319  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_single
1320  * Electrostatics interaction: Ewald
1321  * VdW interaction:            CubicSplineTable
1322  * Geometry:                   Water3-Water3
1323  * Calculate force/pot:        Force
1324  */
1325 void
1326 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_single
1327                     (t_nblist                    * gmx_restrict       nlist,
1328                      rvec                        * gmx_restrict          xx,
1329                      rvec                        * gmx_restrict          ff,
1330                      t_forcerec                  * gmx_restrict          fr,
1331                      t_mdatoms                   * gmx_restrict     mdatoms,
1332                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1333                      t_nrnb                      * gmx_restrict        nrnb)
1334 {
1335     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1336      * just 0 for non-waters.
1337      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1338      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1339      */
1340     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1341     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1342     int              jnrA,jnrB,jnrC,jnrD;
1343     int              jnrE,jnrF,jnrG,jnrH;
1344     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1345     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1346     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1347     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1348     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1349     real             rcutoff_scalar;
1350     real             *shiftvec,*fshift,*x,*f;
1351     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1352     real             scratch[4*DIM];
1353     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1354     real *           vdwioffsetptr0;
1355     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1356     real *           vdwioffsetptr1;
1357     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1358     real *           vdwioffsetptr2;
1359     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1360     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1361     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1362     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1363     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1364     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1365     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1366     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1367     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1368     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1369     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1370     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1371     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1372     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1373     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1374     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1375     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1376     real             *charge;
1377     int              nvdwtype;
1378     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1379     int              *vdwtype;
1380     real             *vdwparam;
1381     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1382     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1383     __m256i          vfitab;
1384     __m128i          vfitab_lo,vfitab_hi;
1385     __m128i          ifour       = _mm_set1_epi32(4);
1386     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1387     real             *vftab;
1388     __m256i          ewitab;
1389     __m128i          ewitab_lo,ewitab_hi;
1390     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1391     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1392     real             *ewtab;
1393     __m256           dummy_mask,cutoff_mask;
1394     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1395     __m256           one     = _mm256_set1_ps(1.0);
1396     __m256           two     = _mm256_set1_ps(2.0);
1397     x                = xx[0];
1398     f                = ff[0];
1399
1400     nri              = nlist->nri;
1401     iinr             = nlist->iinr;
1402     jindex           = nlist->jindex;
1403     jjnr             = nlist->jjnr;
1404     shiftidx         = nlist->shift;
1405     gid              = nlist->gid;
1406     shiftvec         = fr->shift_vec[0];
1407     fshift           = fr->fshift[0];
1408     facel            = _mm256_set1_ps(fr->epsfac);
1409     charge           = mdatoms->chargeA;
1410     nvdwtype         = fr->ntype;
1411     vdwparam         = fr->nbfp;
1412     vdwtype          = mdatoms->typeA;
1413
1414     vftab            = kernel_data->table_vdw->data;
1415     vftabscale       = _mm256_set1_ps(kernel_data->table_vdw->scale);
1416
1417     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
1418     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff);
1419     beta2            = _mm256_mul_ps(beta,beta);
1420     beta3            = _mm256_mul_ps(beta,beta2);
1421
1422     ewtab            = fr->ic->tabq_coul_F;
1423     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
1424     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1425
1426     /* Setup water-specific parameters */
1427     inr              = nlist->iinr[0];
1428     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1429     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1430     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1431     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1432
1433     jq0              = _mm256_set1_ps(charge[inr+0]);
1434     jq1              = _mm256_set1_ps(charge[inr+1]);
1435     jq2              = _mm256_set1_ps(charge[inr+2]);
1436     vdwjidx0A        = 2*vdwtype[inr+0];
1437     qq00             = _mm256_mul_ps(iq0,jq0);
1438     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1439     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1440     qq01             = _mm256_mul_ps(iq0,jq1);
1441     qq02             = _mm256_mul_ps(iq0,jq2);
1442     qq10             = _mm256_mul_ps(iq1,jq0);
1443     qq11             = _mm256_mul_ps(iq1,jq1);
1444     qq12             = _mm256_mul_ps(iq1,jq2);
1445     qq20             = _mm256_mul_ps(iq2,jq0);
1446     qq21             = _mm256_mul_ps(iq2,jq1);
1447     qq22             = _mm256_mul_ps(iq2,jq2);
1448
1449     /* Avoid stupid compiler warnings */
1450     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1451     j_coord_offsetA = 0;
1452     j_coord_offsetB = 0;
1453     j_coord_offsetC = 0;
1454     j_coord_offsetD = 0;
1455     j_coord_offsetE = 0;
1456     j_coord_offsetF = 0;
1457     j_coord_offsetG = 0;
1458     j_coord_offsetH = 0;
1459
1460     outeriter        = 0;
1461     inneriter        = 0;
1462
1463     for(iidx=0;iidx<4*DIM;iidx++)
1464     {
1465         scratch[iidx] = 0.0;
1466     }
1467
1468     /* Start outer loop over neighborlists */
1469     for(iidx=0; iidx<nri; iidx++)
1470     {
1471         /* Load shift vector for this list */
1472         i_shift_offset   = DIM*shiftidx[iidx];
1473
1474         /* Load limits for loop over neighbors */
1475         j_index_start    = jindex[iidx];
1476         j_index_end      = jindex[iidx+1];
1477
1478         /* Get outer coordinate index */
1479         inr              = iinr[iidx];
1480         i_coord_offset   = DIM*inr;
1481
1482         /* Load i particle coords and add shift vector */
1483         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1484                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1485
1486         fix0             = _mm256_setzero_ps();
1487         fiy0             = _mm256_setzero_ps();
1488         fiz0             = _mm256_setzero_ps();
1489         fix1             = _mm256_setzero_ps();
1490         fiy1             = _mm256_setzero_ps();
1491         fiz1             = _mm256_setzero_ps();
1492         fix2             = _mm256_setzero_ps();
1493         fiy2             = _mm256_setzero_ps();
1494         fiz2             = _mm256_setzero_ps();
1495
1496         /* Start inner kernel loop */
1497         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1498         {
1499
1500             /* Get j neighbor index, and coordinate index */
1501             jnrA             = jjnr[jidx];
1502             jnrB             = jjnr[jidx+1];
1503             jnrC             = jjnr[jidx+2];
1504             jnrD             = jjnr[jidx+3];
1505             jnrE             = jjnr[jidx+4];
1506             jnrF             = jjnr[jidx+5];
1507             jnrG             = jjnr[jidx+6];
1508             jnrH             = jjnr[jidx+7];
1509             j_coord_offsetA  = DIM*jnrA;
1510             j_coord_offsetB  = DIM*jnrB;
1511             j_coord_offsetC  = DIM*jnrC;
1512             j_coord_offsetD  = DIM*jnrD;
1513             j_coord_offsetE  = DIM*jnrE;
1514             j_coord_offsetF  = DIM*jnrF;
1515             j_coord_offsetG  = DIM*jnrG;
1516             j_coord_offsetH  = DIM*jnrH;
1517
1518             /* load j atom coordinates */
1519             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1520                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1521                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1522                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1523                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1524
1525             /* Calculate displacement vector */
1526             dx00             = _mm256_sub_ps(ix0,jx0);
1527             dy00             = _mm256_sub_ps(iy0,jy0);
1528             dz00             = _mm256_sub_ps(iz0,jz0);
1529             dx01             = _mm256_sub_ps(ix0,jx1);
1530             dy01             = _mm256_sub_ps(iy0,jy1);
1531             dz01             = _mm256_sub_ps(iz0,jz1);
1532             dx02             = _mm256_sub_ps(ix0,jx2);
1533             dy02             = _mm256_sub_ps(iy0,jy2);
1534             dz02             = _mm256_sub_ps(iz0,jz2);
1535             dx10             = _mm256_sub_ps(ix1,jx0);
1536             dy10             = _mm256_sub_ps(iy1,jy0);
1537             dz10             = _mm256_sub_ps(iz1,jz0);
1538             dx11             = _mm256_sub_ps(ix1,jx1);
1539             dy11             = _mm256_sub_ps(iy1,jy1);
1540             dz11             = _mm256_sub_ps(iz1,jz1);
1541             dx12             = _mm256_sub_ps(ix1,jx2);
1542             dy12             = _mm256_sub_ps(iy1,jy2);
1543             dz12             = _mm256_sub_ps(iz1,jz2);
1544             dx20             = _mm256_sub_ps(ix2,jx0);
1545             dy20             = _mm256_sub_ps(iy2,jy0);
1546             dz20             = _mm256_sub_ps(iz2,jz0);
1547             dx21             = _mm256_sub_ps(ix2,jx1);
1548             dy21             = _mm256_sub_ps(iy2,jy1);
1549             dz21             = _mm256_sub_ps(iz2,jz1);
1550             dx22             = _mm256_sub_ps(ix2,jx2);
1551             dy22             = _mm256_sub_ps(iy2,jy2);
1552             dz22             = _mm256_sub_ps(iz2,jz2);
1553
1554             /* Calculate squared distance and things based on it */
1555             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1556             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1557             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1558             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1559             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1560             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1561             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1562             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1563             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1564
1565             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1566             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1567             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1568             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1569             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1570             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1571             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1572             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1573             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1574
1575             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1576             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1577             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1578             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1579             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1580             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1581             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1582             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1583             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1584
1585             fjx0             = _mm256_setzero_ps();
1586             fjy0             = _mm256_setzero_ps();
1587             fjz0             = _mm256_setzero_ps();
1588             fjx1             = _mm256_setzero_ps();
1589             fjy1             = _mm256_setzero_ps();
1590             fjz1             = _mm256_setzero_ps();
1591             fjx2             = _mm256_setzero_ps();
1592             fjy2             = _mm256_setzero_ps();
1593             fjz2             = _mm256_setzero_ps();
1594
1595             /**************************
1596              * CALCULATE INTERACTIONS *
1597              **************************/
1598
1599             r00              = _mm256_mul_ps(rsq00,rinv00);
1600
1601             /* Calculate table index by multiplying r with table scale and truncate to integer */
1602             rt               = _mm256_mul_ps(r00,vftabscale);
1603             vfitab           = _mm256_cvttps_epi32(rt);
1604             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1605             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1606             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1607             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1608             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
1609             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
1610
1611             /* EWALD ELECTROSTATICS */
1612             
1613             /* Analytical PME correction */
1614             zeta2            = _mm256_mul_ps(beta2,rsq00);
1615             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
1616             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1617             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1618             felec            = _mm256_mul_ps(qq00,felec);
1619             
1620             /* CUBIC SPLINE TABLE DISPERSION */
1621             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1622                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1623             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1624                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1625             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1626                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1627             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1628                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1629             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1630             Heps             = _mm256_mul_ps(vfeps,H);
1631             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1632             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1633             fvdw6            = _mm256_mul_ps(c6_00,FF);
1634
1635             /* CUBIC SPLINE TABLE REPULSION */
1636             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1637             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1638             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1639                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1640             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1641                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1642             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1643                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1644             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1645                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1646             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1647             Heps             = _mm256_mul_ps(vfeps,H);
1648             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1649             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1650             fvdw12           = _mm256_mul_ps(c12_00,FF);
1651             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1652
1653             fscal            = _mm256_add_ps(felec,fvdw);
1654
1655             /* Calculate temporary vectorial force */
1656             tx               = _mm256_mul_ps(fscal,dx00);
1657             ty               = _mm256_mul_ps(fscal,dy00);
1658             tz               = _mm256_mul_ps(fscal,dz00);
1659
1660             /* Update vectorial force */
1661             fix0             = _mm256_add_ps(fix0,tx);
1662             fiy0             = _mm256_add_ps(fiy0,ty);
1663             fiz0             = _mm256_add_ps(fiz0,tz);
1664
1665             fjx0             = _mm256_add_ps(fjx0,tx);
1666             fjy0             = _mm256_add_ps(fjy0,ty);
1667             fjz0             = _mm256_add_ps(fjz0,tz);
1668
1669             /**************************
1670              * CALCULATE INTERACTIONS *
1671              **************************/
1672
1673             r01              = _mm256_mul_ps(rsq01,rinv01);
1674
1675             /* EWALD ELECTROSTATICS */
1676             
1677             /* Analytical PME correction */
1678             zeta2            = _mm256_mul_ps(beta2,rsq01);
1679             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
1680             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1681             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1682             felec            = _mm256_mul_ps(qq01,felec);
1683             
1684             fscal            = felec;
1685
1686             /* Calculate temporary vectorial force */
1687             tx               = _mm256_mul_ps(fscal,dx01);
1688             ty               = _mm256_mul_ps(fscal,dy01);
1689             tz               = _mm256_mul_ps(fscal,dz01);
1690
1691             /* Update vectorial force */
1692             fix0             = _mm256_add_ps(fix0,tx);
1693             fiy0             = _mm256_add_ps(fiy0,ty);
1694             fiz0             = _mm256_add_ps(fiz0,tz);
1695
1696             fjx1             = _mm256_add_ps(fjx1,tx);
1697             fjy1             = _mm256_add_ps(fjy1,ty);
1698             fjz1             = _mm256_add_ps(fjz1,tz);
1699
1700             /**************************
1701              * CALCULATE INTERACTIONS *
1702              **************************/
1703
1704             r02              = _mm256_mul_ps(rsq02,rinv02);
1705
1706             /* EWALD ELECTROSTATICS */
1707             
1708             /* Analytical PME correction */
1709             zeta2            = _mm256_mul_ps(beta2,rsq02);
1710             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
1711             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1712             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1713             felec            = _mm256_mul_ps(qq02,felec);
1714             
1715             fscal            = felec;
1716
1717             /* Calculate temporary vectorial force */
1718             tx               = _mm256_mul_ps(fscal,dx02);
1719             ty               = _mm256_mul_ps(fscal,dy02);
1720             tz               = _mm256_mul_ps(fscal,dz02);
1721
1722             /* Update vectorial force */
1723             fix0             = _mm256_add_ps(fix0,tx);
1724             fiy0             = _mm256_add_ps(fiy0,ty);
1725             fiz0             = _mm256_add_ps(fiz0,tz);
1726
1727             fjx2             = _mm256_add_ps(fjx2,tx);
1728             fjy2             = _mm256_add_ps(fjy2,ty);
1729             fjz2             = _mm256_add_ps(fjz2,tz);
1730
1731             /**************************
1732              * CALCULATE INTERACTIONS *
1733              **************************/
1734
1735             r10              = _mm256_mul_ps(rsq10,rinv10);
1736
1737             /* EWALD ELECTROSTATICS */
1738             
1739             /* Analytical PME correction */
1740             zeta2            = _mm256_mul_ps(beta2,rsq10);
1741             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
1742             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1743             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1744             felec            = _mm256_mul_ps(qq10,felec);
1745             
1746             fscal            = felec;
1747
1748             /* Calculate temporary vectorial force */
1749             tx               = _mm256_mul_ps(fscal,dx10);
1750             ty               = _mm256_mul_ps(fscal,dy10);
1751             tz               = _mm256_mul_ps(fscal,dz10);
1752
1753             /* Update vectorial force */
1754             fix1             = _mm256_add_ps(fix1,tx);
1755             fiy1             = _mm256_add_ps(fiy1,ty);
1756             fiz1             = _mm256_add_ps(fiz1,tz);
1757
1758             fjx0             = _mm256_add_ps(fjx0,tx);
1759             fjy0             = _mm256_add_ps(fjy0,ty);
1760             fjz0             = _mm256_add_ps(fjz0,tz);
1761
1762             /**************************
1763              * CALCULATE INTERACTIONS *
1764              **************************/
1765
1766             r11              = _mm256_mul_ps(rsq11,rinv11);
1767
1768             /* EWALD ELECTROSTATICS */
1769             
1770             /* Analytical PME correction */
1771             zeta2            = _mm256_mul_ps(beta2,rsq11);
1772             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1773             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1774             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1775             felec            = _mm256_mul_ps(qq11,felec);
1776             
1777             fscal            = felec;
1778
1779             /* Calculate temporary vectorial force */
1780             tx               = _mm256_mul_ps(fscal,dx11);
1781             ty               = _mm256_mul_ps(fscal,dy11);
1782             tz               = _mm256_mul_ps(fscal,dz11);
1783
1784             /* Update vectorial force */
1785             fix1             = _mm256_add_ps(fix1,tx);
1786             fiy1             = _mm256_add_ps(fiy1,ty);
1787             fiz1             = _mm256_add_ps(fiz1,tz);
1788
1789             fjx1             = _mm256_add_ps(fjx1,tx);
1790             fjy1             = _mm256_add_ps(fjy1,ty);
1791             fjz1             = _mm256_add_ps(fjz1,tz);
1792
1793             /**************************
1794              * CALCULATE INTERACTIONS *
1795              **************************/
1796
1797             r12              = _mm256_mul_ps(rsq12,rinv12);
1798
1799             /* EWALD ELECTROSTATICS */
1800             
1801             /* Analytical PME correction */
1802             zeta2            = _mm256_mul_ps(beta2,rsq12);
1803             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1804             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1805             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1806             felec            = _mm256_mul_ps(qq12,felec);
1807             
1808             fscal            = felec;
1809
1810             /* Calculate temporary vectorial force */
1811             tx               = _mm256_mul_ps(fscal,dx12);
1812             ty               = _mm256_mul_ps(fscal,dy12);
1813             tz               = _mm256_mul_ps(fscal,dz12);
1814
1815             /* Update vectorial force */
1816             fix1             = _mm256_add_ps(fix1,tx);
1817             fiy1             = _mm256_add_ps(fiy1,ty);
1818             fiz1             = _mm256_add_ps(fiz1,tz);
1819
1820             fjx2             = _mm256_add_ps(fjx2,tx);
1821             fjy2             = _mm256_add_ps(fjy2,ty);
1822             fjz2             = _mm256_add_ps(fjz2,tz);
1823
1824             /**************************
1825              * CALCULATE INTERACTIONS *
1826              **************************/
1827
1828             r20              = _mm256_mul_ps(rsq20,rinv20);
1829
1830             /* EWALD ELECTROSTATICS */
1831             
1832             /* Analytical PME correction */
1833             zeta2            = _mm256_mul_ps(beta2,rsq20);
1834             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1835             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1836             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1837             felec            = _mm256_mul_ps(qq20,felec);
1838             
1839             fscal            = felec;
1840
1841             /* Calculate temporary vectorial force */
1842             tx               = _mm256_mul_ps(fscal,dx20);
1843             ty               = _mm256_mul_ps(fscal,dy20);
1844             tz               = _mm256_mul_ps(fscal,dz20);
1845
1846             /* Update vectorial force */
1847             fix2             = _mm256_add_ps(fix2,tx);
1848             fiy2             = _mm256_add_ps(fiy2,ty);
1849             fiz2             = _mm256_add_ps(fiz2,tz);
1850
1851             fjx0             = _mm256_add_ps(fjx0,tx);
1852             fjy0             = _mm256_add_ps(fjy0,ty);
1853             fjz0             = _mm256_add_ps(fjz0,tz);
1854
1855             /**************************
1856              * CALCULATE INTERACTIONS *
1857              **************************/
1858
1859             r21              = _mm256_mul_ps(rsq21,rinv21);
1860
1861             /* EWALD ELECTROSTATICS */
1862             
1863             /* Analytical PME correction */
1864             zeta2            = _mm256_mul_ps(beta2,rsq21);
1865             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1866             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1867             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1868             felec            = _mm256_mul_ps(qq21,felec);
1869             
1870             fscal            = felec;
1871
1872             /* Calculate temporary vectorial force */
1873             tx               = _mm256_mul_ps(fscal,dx21);
1874             ty               = _mm256_mul_ps(fscal,dy21);
1875             tz               = _mm256_mul_ps(fscal,dz21);
1876
1877             /* Update vectorial force */
1878             fix2             = _mm256_add_ps(fix2,tx);
1879             fiy2             = _mm256_add_ps(fiy2,ty);
1880             fiz2             = _mm256_add_ps(fiz2,tz);
1881
1882             fjx1             = _mm256_add_ps(fjx1,tx);
1883             fjy1             = _mm256_add_ps(fjy1,ty);
1884             fjz1             = _mm256_add_ps(fjz1,tz);
1885
1886             /**************************
1887              * CALCULATE INTERACTIONS *
1888              **************************/
1889
1890             r22              = _mm256_mul_ps(rsq22,rinv22);
1891
1892             /* EWALD ELECTROSTATICS */
1893             
1894             /* Analytical PME correction */
1895             zeta2            = _mm256_mul_ps(beta2,rsq22);
1896             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1897             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1898             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1899             felec            = _mm256_mul_ps(qq22,felec);
1900             
1901             fscal            = felec;
1902
1903             /* Calculate temporary vectorial force */
1904             tx               = _mm256_mul_ps(fscal,dx22);
1905             ty               = _mm256_mul_ps(fscal,dy22);
1906             tz               = _mm256_mul_ps(fscal,dz22);
1907
1908             /* Update vectorial force */
1909             fix2             = _mm256_add_ps(fix2,tx);
1910             fiy2             = _mm256_add_ps(fiy2,ty);
1911             fiz2             = _mm256_add_ps(fiz2,tz);
1912
1913             fjx2             = _mm256_add_ps(fjx2,tx);
1914             fjy2             = _mm256_add_ps(fjy2,ty);
1915             fjz2             = _mm256_add_ps(fjz2,tz);
1916
1917             fjptrA             = f+j_coord_offsetA;
1918             fjptrB             = f+j_coord_offsetB;
1919             fjptrC             = f+j_coord_offsetC;
1920             fjptrD             = f+j_coord_offsetD;
1921             fjptrE             = f+j_coord_offsetE;
1922             fjptrF             = f+j_coord_offsetF;
1923             fjptrG             = f+j_coord_offsetG;
1924             fjptrH             = f+j_coord_offsetH;
1925
1926             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1927                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1928
1929             /* Inner loop uses 530 flops */
1930         }
1931
1932         if(jidx<j_index_end)
1933         {
1934
1935             /* Get j neighbor index, and coordinate index */
1936             jnrlistA         = jjnr[jidx];
1937             jnrlistB         = jjnr[jidx+1];
1938             jnrlistC         = jjnr[jidx+2];
1939             jnrlistD         = jjnr[jidx+3];
1940             jnrlistE         = jjnr[jidx+4];
1941             jnrlistF         = jjnr[jidx+5];
1942             jnrlistG         = jjnr[jidx+6];
1943             jnrlistH         = jjnr[jidx+7];
1944             /* Sign of each element will be negative for non-real atoms.
1945              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1946              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1947              */
1948             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1949                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1950                                             
1951             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1952             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1953             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1954             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1955             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1956             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1957             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1958             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1959             j_coord_offsetA  = DIM*jnrA;
1960             j_coord_offsetB  = DIM*jnrB;
1961             j_coord_offsetC  = DIM*jnrC;
1962             j_coord_offsetD  = DIM*jnrD;
1963             j_coord_offsetE  = DIM*jnrE;
1964             j_coord_offsetF  = DIM*jnrF;
1965             j_coord_offsetG  = DIM*jnrG;
1966             j_coord_offsetH  = DIM*jnrH;
1967
1968             /* load j atom coordinates */
1969             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1970                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1971                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1972                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1973                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1974
1975             /* Calculate displacement vector */
1976             dx00             = _mm256_sub_ps(ix0,jx0);
1977             dy00             = _mm256_sub_ps(iy0,jy0);
1978             dz00             = _mm256_sub_ps(iz0,jz0);
1979             dx01             = _mm256_sub_ps(ix0,jx1);
1980             dy01             = _mm256_sub_ps(iy0,jy1);
1981             dz01             = _mm256_sub_ps(iz0,jz1);
1982             dx02             = _mm256_sub_ps(ix0,jx2);
1983             dy02             = _mm256_sub_ps(iy0,jy2);
1984             dz02             = _mm256_sub_ps(iz0,jz2);
1985             dx10             = _mm256_sub_ps(ix1,jx0);
1986             dy10             = _mm256_sub_ps(iy1,jy0);
1987             dz10             = _mm256_sub_ps(iz1,jz0);
1988             dx11             = _mm256_sub_ps(ix1,jx1);
1989             dy11             = _mm256_sub_ps(iy1,jy1);
1990             dz11             = _mm256_sub_ps(iz1,jz1);
1991             dx12             = _mm256_sub_ps(ix1,jx2);
1992             dy12             = _mm256_sub_ps(iy1,jy2);
1993             dz12             = _mm256_sub_ps(iz1,jz2);
1994             dx20             = _mm256_sub_ps(ix2,jx0);
1995             dy20             = _mm256_sub_ps(iy2,jy0);
1996             dz20             = _mm256_sub_ps(iz2,jz0);
1997             dx21             = _mm256_sub_ps(ix2,jx1);
1998             dy21             = _mm256_sub_ps(iy2,jy1);
1999             dz21             = _mm256_sub_ps(iz2,jz1);
2000             dx22             = _mm256_sub_ps(ix2,jx2);
2001             dy22             = _mm256_sub_ps(iy2,jy2);
2002             dz22             = _mm256_sub_ps(iz2,jz2);
2003
2004             /* Calculate squared distance and things based on it */
2005             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2006             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2007             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2008             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2009             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2010             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2011             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2012             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2013             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2014
2015             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
2016             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
2017             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
2018             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
2019             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
2020             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
2021             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
2022             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
2023             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
2024
2025             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
2026             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
2027             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
2028             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
2029             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
2030             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
2031             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
2032             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
2033             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
2034
2035             fjx0             = _mm256_setzero_ps();
2036             fjy0             = _mm256_setzero_ps();
2037             fjz0             = _mm256_setzero_ps();
2038             fjx1             = _mm256_setzero_ps();
2039             fjy1             = _mm256_setzero_ps();
2040             fjz1             = _mm256_setzero_ps();
2041             fjx2             = _mm256_setzero_ps();
2042             fjy2             = _mm256_setzero_ps();
2043             fjz2             = _mm256_setzero_ps();
2044
2045             /**************************
2046              * CALCULATE INTERACTIONS *
2047              **************************/
2048
2049             r00              = _mm256_mul_ps(rsq00,rinv00);
2050             r00              = _mm256_andnot_ps(dummy_mask,r00);
2051
2052             /* Calculate table index by multiplying r with table scale and truncate to integer */
2053             rt               = _mm256_mul_ps(r00,vftabscale);
2054             vfitab           = _mm256_cvttps_epi32(rt);
2055             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2056             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2057             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2058             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2059             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
2060             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
2061
2062             /* EWALD ELECTROSTATICS */
2063             
2064             /* Analytical PME correction */
2065             zeta2            = _mm256_mul_ps(beta2,rsq00);
2066             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
2067             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2068             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2069             felec            = _mm256_mul_ps(qq00,felec);
2070             
2071             /* CUBIC SPLINE TABLE DISPERSION */
2072             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2073                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2074             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2075                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2076             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2077                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2078             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2079                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2080             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2081             Heps             = _mm256_mul_ps(vfeps,H);
2082             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2083             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2084             fvdw6            = _mm256_mul_ps(c6_00,FF);
2085
2086             /* CUBIC SPLINE TABLE REPULSION */
2087             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
2088             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
2089             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2090                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2091             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2092                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2093             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2094                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2095             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2096                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2097             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2098             Heps             = _mm256_mul_ps(vfeps,H);
2099             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2100             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2101             fvdw12           = _mm256_mul_ps(c12_00,FF);
2102             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
2103
2104             fscal            = _mm256_add_ps(felec,fvdw);
2105
2106             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2107
2108             /* Calculate temporary vectorial force */
2109             tx               = _mm256_mul_ps(fscal,dx00);
2110             ty               = _mm256_mul_ps(fscal,dy00);
2111             tz               = _mm256_mul_ps(fscal,dz00);
2112
2113             /* Update vectorial force */
2114             fix0             = _mm256_add_ps(fix0,tx);
2115             fiy0             = _mm256_add_ps(fiy0,ty);
2116             fiz0             = _mm256_add_ps(fiz0,tz);
2117
2118             fjx0             = _mm256_add_ps(fjx0,tx);
2119             fjy0             = _mm256_add_ps(fjy0,ty);
2120             fjz0             = _mm256_add_ps(fjz0,tz);
2121
2122             /**************************
2123              * CALCULATE INTERACTIONS *
2124              **************************/
2125
2126             r01              = _mm256_mul_ps(rsq01,rinv01);
2127             r01              = _mm256_andnot_ps(dummy_mask,r01);
2128
2129             /* EWALD ELECTROSTATICS */
2130             
2131             /* Analytical PME correction */
2132             zeta2            = _mm256_mul_ps(beta2,rsq01);
2133             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
2134             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2135             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2136             felec            = _mm256_mul_ps(qq01,felec);
2137             
2138             fscal            = felec;
2139
2140             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2141
2142             /* Calculate temporary vectorial force */
2143             tx               = _mm256_mul_ps(fscal,dx01);
2144             ty               = _mm256_mul_ps(fscal,dy01);
2145             tz               = _mm256_mul_ps(fscal,dz01);
2146
2147             /* Update vectorial force */
2148             fix0             = _mm256_add_ps(fix0,tx);
2149             fiy0             = _mm256_add_ps(fiy0,ty);
2150             fiz0             = _mm256_add_ps(fiz0,tz);
2151
2152             fjx1             = _mm256_add_ps(fjx1,tx);
2153             fjy1             = _mm256_add_ps(fjy1,ty);
2154             fjz1             = _mm256_add_ps(fjz1,tz);
2155
2156             /**************************
2157              * CALCULATE INTERACTIONS *
2158              **************************/
2159
2160             r02              = _mm256_mul_ps(rsq02,rinv02);
2161             r02              = _mm256_andnot_ps(dummy_mask,r02);
2162
2163             /* EWALD ELECTROSTATICS */
2164             
2165             /* Analytical PME correction */
2166             zeta2            = _mm256_mul_ps(beta2,rsq02);
2167             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
2168             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2169             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2170             felec            = _mm256_mul_ps(qq02,felec);
2171             
2172             fscal            = felec;
2173
2174             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2175
2176             /* Calculate temporary vectorial force */
2177             tx               = _mm256_mul_ps(fscal,dx02);
2178             ty               = _mm256_mul_ps(fscal,dy02);
2179             tz               = _mm256_mul_ps(fscal,dz02);
2180
2181             /* Update vectorial force */
2182             fix0             = _mm256_add_ps(fix0,tx);
2183             fiy0             = _mm256_add_ps(fiy0,ty);
2184             fiz0             = _mm256_add_ps(fiz0,tz);
2185
2186             fjx2             = _mm256_add_ps(fjx2,tx);
2187             fjy2             = _mm256_add_ps(fjy2,ty);
2188             fjz2             = _mm256_add_ps(fjz2,tz);
2189
2190             /**************************
2191              * CALCULATE INTERACTIONS *
2192              **************************/
2193
2194             r10              = _mm256_mul_ps(rsq10,rinv10);
2195             r10              = _mm256_andnot_ps(dummy_mask,r10);
2196
2197             /* EWALD ELECTROSTATICS */
2198             
2199             /* Analytical PME correction */
2200             zeta2            = _mm256_mul_ps(beta2,rsq10);
2201             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
2202             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2203             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2204             felec            = _mm256_mul_ps(qq10,felec);
2205             
2206             fscal            = felec;
2207
2208             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2209
2210             /* Calculate temporary vectorial force */
2211             tx               = _mm256_mul_ps(fscal,dx10);
2212             ty               = _mm256_mul_ps(fscal,dy10);
2213             tz               = _mm256_mul_ps(fscal,dz10);
2214
2215             /* Update vectorial force */
2216             fix1             = _mm256_add_ps(fix1,tx);
2217             fiy1             = _mm256_add_ps(fiy1,ty);
2218             fiz1             = _mm256_add_ps(fiz1,tz);
2219
2220             fjx0             = _mm256_add_ps(fjx0,tx);
2221             fjy0             = _mm256_add_ps(fjy0,ty);
2222             fjz0             = _mm256_add_ps(fjz0,tz);
2223
2224             /**************************
2225              * CALCULATE INTERACTIONS *
2226              **************************/
2227
2228             r11              = _mm256_mul_ps(rsq11,rinv11);
2229             r11              = _mm256_andnot_ps(dummy_mask,r11);
2230
2231             /* EWALD ELECTROSTATICS */
2232             
2233             /* Analytical PME correction */
2234             zeta2            = _mm256_mul_ps(beta2,rsq11);
2235             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
2236             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2237             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2238             felec            = _mm256_mul_ps(qq11,felec);
2239             
2240             fscal            = felec;
2241
2242             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2243
2244             /* Calculate temporary vectorial force */
2245             tx               = _mm256_mul_ps(fscal,dx11);
2246             ty               = _mm256_mul_ps(fscal,dy11);
2247             tz               = _mm256_mul_ps(fscal,dz11);
2248
2249             /* Update vectorial force */
2250             fix1             = _mm256_add_ps(fix1,tx);
2251             fiy1             = _mm256_add_ps(fiy1,ty);
2252             fiz1             = _mm256_add_ps(fiz1,tz);
2253
2254             fjx1             = _mm256_add_ps(fjx1,tx);
2255             fjy1             = _mm256_add_ps(fjy1,ty);
2256             fjz1             = _mm256_add_ps(fjz1,tz);
2257
2258             /**************************
2259              * CALCULATE INTERACTIONS *
2260              **************************/
2261
2262             r12              = _mm256_mul_ps(rsq12,rinv12);
2263             r12              = _mm256_andnot_ps(dummy_mask,r12);
2264
2265             /* EWALD ELECTROSTATICS */
2266             
2267             /* Analytical PME correction */
2268             zeta2            = _mm256_mul_ps(beta2,rsq12);
2269             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
2270             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2271             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2272             felec            = _mm256_mul_ps(qq12,felec);
2273             
2274             fscal            = felec;
2275
2276             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2277
2278             /* Calculate temporary vectorial force */
2279             tx               = _mm256_mul_ps(fscal,dx12);
2280             ty               = _mm256_mul_ps(fscal,dy12);
2281             tz               = _mm256_mul_ps(fscal,dz12);
2282
2283             /* Update vectorial force */
2284             fix1             = _mm256_add_ps(fix1,tx);
2285             fiy1             = _mm256_add_ps(fiy1,ty);
2286             fiz1             = _mm256_add_ps(fiz1,tz);
2287
2288             fjx2             = _mm256_add_ps(fjx2,tx);
2289             fjy2             = _mm256_add_ps(fjy2,ty);
2290             fjz2             = _mm256_add_ps(fjz2,tz);
2291
2292             /**************************
2293              * CALCULATE INTERACTIONS *
2294              **************************/
2295
2296             r20              = _mm256_mul_ps(rsq20,rinv20);
2297             r20              = _mm256_andnot_ps(dummy_mask,r20);
2298
2299             /* EWALD ELECTROSTATICS */
2300             
2301             /* Analytical PME correction */
2302             zeta2            = _mm256_mul_ps(beta2,rsq20);
2303             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
2304             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2305             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2306             felec            = _mm256_mul_ps(qq20,felec);
2307             
2308             fscal            = felec;
2309
2310             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2311
2312             /* Calculate temporary vectorial force */
2313             tx               = _mm256_mul_ps(fscal,dx20);
2314             ty               = _mm256_mul_ps(fscal,dy20);
2315             tz               = _mm256_mul_ps(fscal,dz20);
2316
2317             /* Update vectorial force */
2318             fix2             = _mm256_add_ps(fix2,tx);
2319             fiy2             = _mm256_add_ps(fiy2,ty);
2320             fiz2             = _mm256_add_ps(fiz2,tz);
2321
2322             fjx0             = _mm256_add_ps(fjx0,tx);
2323             fjy0             = _mm256_add_ps(fjy0,ty);
2324             fjz0             = _mm256_add_ps(fjz0,tz);
2325
2326             /**************************
2327              * CALCULATE INTERACTIONS *
2328              **************************/
2329
2330             r21              = _mm256_mul_ps(rsq21,rinv21);
2331             r21              = _mm256_andnot_ps(dummy_mask,r21);
2332
2333             /* EWALD ELECTROSTATICS */
2334             
2335             /* Analytical PME correction */
2336             zeta2            = _mm256_mul_ps(beta2,rsq21);
2337             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
2338             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2339             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2340             felec            = _mm256_mul_ps(qq21,felec);
2341             
2342             fscal            = felec;
2343
2344             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2345
2346             /* Calculate temporary vectorial force */
2347             tx               = _mm256_mul_ps(fscal,dx21);
2348             ty               = _mm256_mul_ps(fscal,dy21);
2349             tz               = _mm256_mul_ps(fscal,dz21);
2350
2351             /* Update vectorial force */
2352             fix2             = _mm256_add_ps(fix2,tx);
2353             fiy2             = _mm256_add_ps(fiy2,ty);
2354             fiz2             = _mm256_add_ps(fiz2,tz);
2355
2356             fjx1             = _mm256_add_ps(fjx1,tx);
2357             fjy1             = _mm256_add_ps(fjy1,ty);
2358             fjz1             = _mm256_add_ps(fjz1,tz);
2359
2360             /**************************
2361              * CALCULATE INTERACTIONS *
2362              **************************/
2363
2364             r22              = _mm256_mul_ps(rsq22,rinv22);
2365             r22              = _mm256_andnot_ps(dummy_mask,r22);
2366
2367             /* EWALD ELECTROSTATICS */
2368             
2369             /* Analytical PME correction */
2370             zeta2            = _mm256_mul_ps(beta2,rsq22);
2371             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2372             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2373             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2374             felec            = _mm256_mul_ps(qq22,felec);
2375             
2376             fscal            = felec;
2377
2378             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2379
2380             /* Calculate temporary vectorial force */
2381             tx               = _mm256_mul_ps(fscal,dx22);
2382             ty               = _mm256_mul_ps(fscal,dy22);
2383             tz               = _mm256_mul_ps(fscal,dz22);
2384
2385             /* Update vectorial force */
2386             fix2             = _mm256_add_ps(fix2,tx);
2387             fiy2             = _mm256_add_ps(fiy2,ty);
2388             fiz2             = _mm256_add_ps(fiz2,tz);
2389
2390             fjx2             = _mm256_add_ps(fjx2,tx);
2391             fjy2             = _mm256_add_ps(fjy2,ty);
2392             fjz2             = _mm256_add_ps(fjz2,tz);
2393
2394             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2395             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2396             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2397             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2398             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2399             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2400             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2401             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2402
2403             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2404                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2405
2406             /* Inner loop uses 539 flops */
2407         }
2408
2409         /* End of innermost loop */
2410
2411         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2412                                                  f+i_coord_offset,fshift+i_shift_offset);
2413
2414         /* Increment number of inner iterations */
2415         inneriter                  += j_index_end - j_index_start;
2416
2417         /* Outer loop uses 18 flops */
2418     }
2419
2420     /* Increment number of outer iterations */
2421     outeriter        += nri;
2422
2423     /* Update outer/inner flops */
2424
2425     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*539);
2426 }