Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEw_VdwLJEw_GeomW3W3_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_256_double
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LJEwald
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_256_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82     real             scratch[4*DIM];
83     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84     real *           vdwioffsetptr0;
85     real *           vdwgridioffsetptr0;
86     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     real *           vdwioffsetptr1;
88     real *           vdwgridioffsetptr1;
89     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     real *           vdwgridioffsetptr2;
92     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
93     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
101     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
102     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
103     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
104     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
105     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
106     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
107     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
108     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
109     real             *charge;
110     int              nvdwtype;
111     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112     int              *vdwtype;
113     real             *vdwparam;
114     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
115     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
116     __m256d           c6grid_00;
117     __m256d           c6grid_01;
118     __m256d           c6grid_02;
119     __m256d           c6grid_10;
120     __m256d           c6grid_11;
121     __m256d           c6grid_12;
122     __m256d           c6grid_20;
123     __m256d           c6grid_21;
124     __m256d           c6grid_22;
125     real             *vdwgridparam;
126     __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
127     __m256d           one_half  = _mm256_set1_pd(0.5);
128     __m256d           minus_one = _mm256_set1_pd(-1.0);
129     __m128i          ewitab;
130     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
131     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
132     real             *ewtab;
133     __m256d          dummy_mask,cutoff_mask;
134     __m128           tmpmask0,tmpmask1;
135     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
136     __m256d          one     = _mm256_set1_pd(1.0);
137     __m256d          two     = _mm256_set1_pd(2.0);
138     x                = xx[0];
139     f                = ff[0];
140
141     nri              = nlist->nri;
142     iinr             = nlist->iinr;
143     jindex           = nlist->jindex;
144     jjnr             = nlist->jjnr;
145     shiftidx         = nlist->shift;
146     gid              = nlist->gid;
147     shiftvec         = fr->shift_vec[0];
148     fshift           = fr->fshift[0];
149     facel            = _mm256_set1_pd(fr->epsfac);
150     charge           = mdatoms->chargeA;
151     nvdwtype         = fr->ntype;
152     vdwparam         = fr->nbfp;
153     vdwtype          = mdatoms->typeA;
154     vdwgridparam     = fr->ljpme_c6grid;
155     sh_lj_ewald      = _mm256_set1_pd(fr->ic->sh_lj_ewald);
156     ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
157     ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
158
159     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
160     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
161     beta2            = _mm256_mul_pd(beta,beta);
162     beta3            = _mm256_mul_pd(beta,beta2);
163
164     ewtab            = fr->ic->tabq_coul_FDV0;
165     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
166     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
167
168     /* Setup water-specific parameters */
169     inr              = nlist->iinr[0];
170     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
171     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
172     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
173     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
174     vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
175
176     jq0              = _mm256_set1_pd(charge[inr+0]);
177     jq1              = _mm256_set1_pd(charge[inr+1]);
178     jq2              = _mm256_set1_pd(charge[inr+2]);
179     vdwjidx0A        = 2*vdwtype[inr+0];
180     qq00             = _mm256_mul_pd(iq0,jq0);
181     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
182     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
183     c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
184     qq01             = _mm256_mul_pd(iq0,jq1);
185     qq02             = _mm256_mul_pd(iq0,jq2);
186     qq10             = _mm256_mul_pd(iq1,jq0);
187     qq11             = _mm256_mul_pd(iq1,jq1);
188     qq12             = _mm256_mul_pd(iq1,jq2);
189     qq20             = _mm256_mul_pd(iq2,jq0);
190     qq21             = _mm256_mul_pd(iq2,jq1);
191     qq22             = _mm256_mul_pd(iq2,jq2);
192
193     /* Avoid stupid compiler warnings */
194     jnrA = jnrB = jnrC = jnrD = 0;
195     j_coord_offsetA = 0;
196     j_coord_offsetB = 0;
197     j_coord_offsetC = 0;
198     j_coord_offsetD = 0;
199
200     outeriter        = 0;
201     inneriter        = 0;
202
203     for(iidx=0;iidx<4*DIM;iidx++)
204     {
205         scratch[iidx] = 0.0;
206     }
207
208     /* Start outer loop over neighborlists */
209     for(iidx=0; iidx<nri; iidx++)
210     {
211         /* Load shift vector for this list */
212         i_shift_offset   = DIM*shiftidx[iidx];
213
214         /* Load limits for loop over neighbors */
215         j_index_start    = jindex[iidx];
216         j_index_end      = jindex[iidx+1];
217
218         /* Get outer coordinate index */
219         inr              = iinr[iidx];
220         i_coord_offset   = DIM*inr;
221
222         /* Load i particle coords and add shift vector */
223         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
224                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
225
226         fix0             = _mm256_setzero_pd();
227         fiy0             = _mm256_setzero_pd();
228         fiz0             = _mm256_setzero_pd();
229         fix1             = _mm256_setzero_pd();
230         fiy1             = _mm256_setzero_pd();
231         fiz1             = _mm256_setzero_pd();
232         fix2             = _mm256_setzero_pd();
233         fiy2             = _mm256_setzero_pd();
234         fiz2             = _mm256_setzero_pd();
235
236         /* Reset potential sums */
237         velecsum         = _mm256_setzero_pd();
238         vvdwsum          = _mm256_setzero_pd();
239
240         /* Start inner kernel loop */
241         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
242         {
243
244             /* Get j neighbor index, and coordinate index */
245             jnrA             = jjnr[jidx];
246             jnrB             = jjnr[jidx+1];
247             jnrC             = jjnr[jidx+2];
248             jnrD             = jjnr[jidx+3];
249             j_coord_offsetA  = DIM*jnrA;
250             j_coord_offsetB  = DIM*jnrB;
251             j_coord_offsetC  = DIM*jnrC;
252             j_coord_offsetD  = DIM*jnrD;
253
254             /* load j atom coordinates */
255             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
256                                                  x+j_coord_offsetC,x+j_coord_offsetD,
257                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
258
259             /* Calculate displacement vector */
260             dx00             = _mm256_sub_pd(ix0,jx0);
261             dy00             = _mm256_sub_pd(iy0,jy0);
262             dz00             = _mm256_sub_pd(iz0,jz0);
263             dx01             = _mm256_sub_pd(ix0,jx1);
264             dy01             = _mm256_sub_pd(iy0,jy1);
265             dz01             = _mm256_sub_pd(iz0,jz1);
266             dx02             = _mm256_sub_pd(ix0,jx2);
267             dy02             = _mm256_sub_pd(iy0,jy2);
268             dz02             = _mm256_sub_pd(iz0,jz2);
269             dx10             = _mm256_sub_pd(ix1,jx0);
270             dy10             = _mm256_sub_pd(iy1,jy0);
271             dz10             = _mm256_sub_pd(iz1,jz0);
272             dx11             = _mm256_sub_pd(ix1,jx1);
273             dy11             = _mm256_sub_pd(iy1,jy1);
274             dz11             = _mm256_sub_pd(iz1,jz1);
275             dx12             = _mm256_sub_pd(ix1,jx2);
276             dy12             = _mm256_sub_pd(iy1,jy2);
277             dz12             = _mm256_sub_pd(iz1,jz2);
278             dx20             = _mm256_sub_pd(ix2,jx0);
279             dy20             = _mm256_sub_pd(iy2,jy0);
280             dz20             = _mm256_sub_pd(iz2,jz0);
281             dx21             = _mm256_sub_pd(ix2,jx1);
282             dy21             = _mm256_sub_pd(iy2,jy1);
283             dz21             = _mm256_sub_pd(iz2,jz1);
284             dx22             = _mm256_sub_pd(ix2,jx2);
285             dy22             = _mm256_sub_pd(iy2,jy2);
286             dz22             = _mm256_sub_pd(iz2,jz2);
287
288             /* Calculate squared distance and things based on it */
289             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
290             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
291             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
292             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
293             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
294             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
295             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
296             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
297             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
298
299             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
300             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
301             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
302             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
303             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
304             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
305             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
306             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
307             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
308
309             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
310             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
311             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
312             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
313             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
314             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
315             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
316             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
317             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
318
319             fjx0             = _mm256_setzero_pd();
320             fjy0             = _mm256_setzero_pd();
321             fjz0             = _mm256_setzero_pd();
322             fjx1             = _mm256_setzero_pd();
323             fjy1             = _mm256_setzero_pd();
324             fjz1             = _mm256_setzero_pd();
325             fjx2             = _mm256_setzero_pd();
326             fjy2             = _mm256_setzero_pd();
327             fjz2             = _mm256_setzero_pd();
328
329             /**************************
330              * CALCULATE INTERACTIONS *
331              **************************/
332
333             r00              = _mm256_mul_pd(rsq00,rinv00);
334
335             /* EWALD ELECTROSTATICS */
336
337             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
338             ewrt             = _mm256_mul_pd(r00,ewtabscale);
339             ewitab           = _mm256_cvttpd_epi32(ewrt);
340             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
341             ewitab           = _mm_slli_epi32(ewitab,2);
342             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
343             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
344             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
345             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
346             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
347             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
348             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
349             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
350             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
351
352             /* Analytical LJ-PME */
353             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
354             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
355             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
356             exponent         = gmx_simd_exp_d(ewcljrsq);
357             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
358             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
359             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
360             vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
361             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
362             vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
363             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
364             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
365
366             /* Update potential sum for this i atom from the interaction with this j atom. */
367             velecsum         = _mm256_add_pd(velecsum,velec);
368             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
369
370             fscal            = _mm256_add_pd(felec,fvdw);
371
372             /* Calculate temporary vectorial force */
373             tx               = _mm256_mul_pd(fscal,dx00);
374             ty               = _mm256_mul_pd(fscal,dy00);
375             tz               = _mm256_mul_pd(fscal,dz00);
376
377             /* Update vectorial force */
378             fix0             = _mm256_add_pd(fix0,tx);
379             fiy0             = _mm256_add_pd(fiy0,ty);
380             fiz0             = _mm256_add_pd(fiz0,tz);
381
382             fjx0             = _mm256_add_pd(fjx0,tx);
383             fjy0             = _mm256_add_pd(fjy0,ty);
384             fjz0             = _mm256_add_pd(fjz0,tz);
385
386             /**************************
387              * CALCULATE INTERACTIONS *
388              **************************/
389
390             r01              = _mm256_mul_pd(rsq01,rinv01);
391
392             /* EWALD ELECTROSTATICS */
393
394             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
395             ewrt             = _mm256_mul_pd(r01,ewtabscale);
396             ewitab           = _mm256_cvttpd_epi32(ewrt);
397             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
398             ewitab           = _mm_slli_epi32(ewitab,2);
399             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
400             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
401             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
402             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
403             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
404             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
405             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
406             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
407             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
408
409             /* Update potential sum for this i atom from the interaction with this j atom. */
410             velecsum         = _mm256_add_pd(velecsum,velec);
411
412             fscal            = felec;
413
414             /* Calculate temporary vectorial force */
415             tx               = _mm256_mul_pd(fscal,dx01);
416             ty               = _mm256_mul_pd(fscal,dy01);
417             tz               = _mm256_mul_pd(fscal,dz01);
418
419             /* Update vectorial force */
420             fix0             = _mm256_add_pd(fix0,tx);
421             fiy0             = _mm256_add_pd(fiy0,ty);
422             fiz0             = _mm256_add_pd(fiz0,tz);
423
424             fjx1             = _mm256_add_pd(fjx1,tx);
425             fjy1             = _mm256_add_pd(fjy1,ty);
426             fjz1             = _mm256_add_pd(fjz1,tz);
427
428             /**************************
429              * CALCULATE INTERACTIONS *
430              **************************/
431
432             r02              = _mm256_mul_pd(rsq02,rinv02);
433
434             /* EWALD ELECTROSTATICS */
435
436             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
437             ewrt             = _mm256_mul_pd(r02,ewtabscale);
438             ewitab           = _mm256_cvttpd_epi32(ewrt);
439             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
440             ewitab           = _mm_slli_epi32(ewitab,2);
441             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
442             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
443             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
444             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
445             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
446             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
447             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
448             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
449             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
450
451             /* Update potential sum for this i atom from the interaction with this j atom. */
452             velecsum         = _mm256_add_pd(velecsum,velec);
453
454             fscal            = felec;
455
456             /* Calculate temporary vectorial force */
457             tx               = _mm256_mul_pd(fscal,dx02);
458             ty               = _mm256_mul_pd(fscal,dy02);
459             tz               = _mm256_mul_pd(fscal,dz02);
460
461             /* Update vectorial force */
462             fix0             = _mm256_add_pd(fix0,tx);
463             fiy0             = _mm256_add_pd(fiy0,ty);
464             fiz0             = _mm256_add_pd(fiz0,tz);
465
466             fjx2             = _mm256_add_pd(fjx2,tx);
467             fjy2             = _mm256_add_pd(fjy2,ty);
468             fjz2             = _mm256_add_pd(fjz2,tz);
469
470             /**************************
471              * CALCULATE INTERACTIONS *
472              **************************/
473
474             r10              = _mm256_mul_pd(rsq10,rinv10);
475
476             /* EWALD ELECTROSTATICS */
477
478             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
479             ewrt             = _mm256_mul_pd(r10,ewtabscale);
480             ewitab           = _mm256_cvttpd_epi32(ewrt);
481             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
482             ewitab           = _mm_slli_epi32(ewitab,2);
483             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
484             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
485             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
486             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
487             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
488             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
489             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
490             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
491             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
492
493             /* Update potential sum for this i atom from the interaction with this j atom. */
494             velecsum         = _mm256_add_pd(velecsum,velec);
495
496             fscal            = felec;
497
498             /* Calculate temporary vectorial force */
499             tx               = _mm256_mul_pd(fscal,dx10);
500             ty               = _mm256_mul_pd(fscal,dy10);
501             tz               = _mm256_mul_pd(fscal,dz10);
502
503             /* Update vectorial force */
504             fix1             = _mm256_add_pd(fix1,tx);
505             fiy1             = _mm256_add_pd(fiy1,ty);
506             fiz1             = _mm256_add_pd(fiz1,tz);
507
508             fjx0             = _mm256_add_pd(fjx0,tx);
509             fjy0             = _mm256_add_pd(fjy0,ty);
510             fjz0             = _mm256_add_pd(fjz0,tz);
511
512             /**************************
513              * CALCULATE INTERACTIONS *
514              **************************/
515
516             r11              = _mm256_mul_pd(rsq11,rinv11);
517
518             /* EWALD ELECTROSTATICS */
519
520             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
521             ewrt             = _mm256_mul_pd(r11,ewtabscale);
522             ewitab           = _mm256_cvttpd_epi32(ewrt);
523             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
524             ewitab           = _mm_slli_epi32(ewitab,2);
525             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
526             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
527             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
528             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
529             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
530             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
531             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
532             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
533             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
534
535             /* Update potential sum for this i atom from the interaction with this j atom. */
536             velecsum         = _mm256_add_pd(velecsum,velec);
537
538             fscal            = felec;
539
540             /* Calculate temporary vectorial force */
541             tx               = _mm256_mul_pd(fscal,dx11);
542             ty               = _mm256_mul_pd(fscal,dy11);
543             tz               = _mm256_mul_pd(fscal,dz11);
544
545             /* Update vectorial force */
546             fix1             = _mm256_add_pd(fix1,tx);
547             fiy1             = _mm256_add_pd(fiy1,ty);
548             fiz1             = _mm256_add_pd(fiz1,tz);
549
550             fjx1             = _mm256_add_pd(fjx1,tx);
551             fjy1             = _mm256_add_pd(fjy1,ty);
552             fjz1             = _mm256_add_pd(fjz1,tz);
553
554             /**************************
555              * CALCULATE INTERACTIONS *
556              **************************/
557
558             r12              = _mm256_mul_pd(rsq12,rinv12);
559
560             /* EWALD ELECTROSTATICS */
561
562             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
563             ewrt             = _mm256_mul_pd(r12,ewtabscale);
564             ewitab           = _mm256_cvttpd_epi32(ewrt);
565             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
566             ewitab           = _mm_slli_epi32(ewitab,2);
567             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
568             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
569             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
570             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
571             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
572             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
573             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
574             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
575             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
576
577             /* Update potential sum for this i atom from the interaction with this j atom. */
578             velecsum         = _mm256_add_pd(velecsum,velec);
579
580             fscal            = felec;
581
582             /* Calculate temporary vectorial force */
583             tx               = _mm256_mul_pd(fscal,dx12);
584             ty               = _mm256_mul_pd(fscal,dy12);
585             tz               = _mm256_mul_pd(fscal,dz12);
586
587             /* Update vectorial force */
588             fix1             = _mm256_add_pd(fix1,tx);
589             fiy1             = _mm256_add_pd(fiy1,ty);
590             fiz1             = _mm256_add_pd(fiz1,tz);
591
592             fjx2             = _mm256_add_pd(fjx2,tx);
593             fjy2             = _mm256_add_pd(fjy2,ty);
594             fjz2             = _mm256_add_pd(fjz2,tz);
595
596             /**************************
597              * CALCULATE INTERACTIONS *
598              **************************/
599
600             r20              = _mm256_mul_pd(rsq20,rinv20);
601
602             /* EWALD ELECTROSTATICS */
603
604             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
605             ewrt             = _mm256_mul_pd(r20,ewtabscale);
606             ewitab           = _mm256_cvttpd_epi32(ewrt);
607             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
608             ewitab           = _mm_slli_epi32(ewitab,2);
609             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
610             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
611             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
612             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
613             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
614             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
615             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
616             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
617             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
618
619             /* Update potential sum for this i atom from the interaction with this j atom. */
620             velecsum         = _mm256_add_pd(velecsum,velec);
621
622             fscal            = felec;
623
624             /* Calculate temporary vectorial force */
625             tx               = _mm256_mul_pd(fscal,dx20);
626             ty               = _mm256_mul_pd(fscal,dy20);
627             tz               = _mm256_mul_pd(fscal,dz20);
628
629             /* Update vectorial force */
630             fix2             = _mm256_add_pd(fix2,tx);
631             fiy2             = _mm256_add_pd(fiy2,ty);
632             fiz2             = _mm256_add_pd(fiz2,tz);
633
634             fjx0             = _mm256_add_pd(fjx0,tx);
635             fjy0             = _mm256_add_pd(fjy0,ty);
636             fjz0             = _mm256_add_pd(fjz0,tz);
637
638             /**************************
639              * CALCULATE INTERACTIONS *
640              **************************/
641
642             r21              = _mm256_mul_pd(rsq21,rinv21);
643
644             /* EWALD ELECTROSTATICS */
645
646             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
647             ewrt             = _mm256_mul_pd(r21,ewtabscale);
648             ewitab           = _mm256_cvttpd_epi32(ewrt);
649             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
650             ewitab           = _mm_slli_epi32(ewitab,2);
651             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
652             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
653             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
654             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
655             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
656             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
657             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
658             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
659             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
660
661             /* Update potential sum for this i atom from the interaction with this j atom. */
662             velecsum         = _mm256_add_pd(velecsum,velec);
663
664             fscal            = felec;
665
666             /* Calculate temporary vectorial force */
667             tx               = _mm256_mul_pd(fscal,dx21);
668             ty               = _mm256_mul_pd(fscal,dy21);
669             tz               = _mm256_mul_pd(fscal,dz21);
670
671             /* Update vectorial force */
672             fix2             = _mm256_add_pd(fix2,tx);
673             fiy2             = _mm256_add_pd(fiy2,ty);
674             fiz2             = _mm256_add_pd(fiz2,tz);
675
676             fjx1             = _mm256_add_pd(fjx1,tx);
677             fjy1             = _mm256_add_pd(fjy1,ty);
678             fjz1             = _mm256_add_pd(fjz1,tz);
679
680             /**************************
681              * CALCULATE INTERACTIONS *
682              **************************/
683
684             r22              = _mm256_mul_pd(rsq22,rinv22);
685
686             /* EWALD ELECTROSTATICS */
687
688             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
689             ewrt             = _mm256_mul_pd(r22,ewtabscale);
690             ewitab           = _mm256_cvttpd_epi32(ewrt);
691             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
692             ewitab           = _mm_slli_epi32(ewitab,2);
693             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
694             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
695             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
696             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
697             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
698             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
699             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
700             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
701             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
702
703             /* Update potential sum for this i atom from the interaction with this j atom. */
704             velecsum         = _mm256_add_pd(velecsum,velec);
705
706             fscal            = felec;
707
708             /* Calculate temporary vectorial force */
709             tx               = _mm256_mul_pd(fscal,dx22);
710             ty               = _mm256_mul_pd(fscal,dy22);
711             tz               = _mm256_mul_pd(fscal,dz22);
712
713             /* Update vectorial force */
714             fix2             = _mm256_add_pd(fix2,tx);
715             fiy2             = _mm256_add_pd(fiy2,ty);
716             fiz2             = _mm256_add_pd(fiz2,tz);
717
718             fjx2             = _mm256_add_pd(fjx2,tx);
719             fjy2             = _mm256_add_pd(fjy2,ty);
720             fjz2             = _mm256_add_pd(fjz2,tz);
721
722             fjptrA             = f+j_coord_offsetA;
723             fjptrB             = f+j_coord_offsetB;
724             fjptrC             = f+j_coord_offsetC;
725             fjptrD             = f+j_coord_offsetD;
726
727             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
728                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
729
730             /* Inner loop uses 400 flops */
731         }
732
733         if(jidx<j_index_end)
734         {
735
736             /* Get j neighbor index, and coordinate index */
737             jnrlistA         = jjnr[jidx];
738             jnrlistB         = jjnr[jidx+1];
739             jnrlistC         = jjnr[jidx+2];
740             jnrlistD         = jjnr[jidx+3];
741             /* Sign of each element will be negative for non-real atoms.
742              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
743              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
744              */
745             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
746
747             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
748             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
749             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
750
751             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
752             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
753             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
754             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
755             j_coord_offsetA  = DIM*jnrA;
756             j_coord_offsetB  = DIM*jnrB;
757             j_coord_offsetC  = DIM*jnrC;
758             j_coord_offsetD  = DIM*jnrD;
759
760             /* load j atom coordinates */
761             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
762                                                  x+j_coord_offsetC,x+j_coord_offsetD,
763                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
764
765             /* Calculate displacement vector */
766             dx00             = _mm256_sub_pd(ix0,jx0);
767             dy00             = _mm256_sub_pd(iy0,jy0);
768             dz00             = _mm256_sub_pd(iz0,jz0);
769             dx01             = _mm256_sub_pd(ix0,jx1);
770             dy01             = _mm256_sub_pd(iy0,jy1);
771             dz01             = _mm256_sub_pd(iz0,jz1);
772             dx02             = _mm256_sub_pd(ix0,jx2);
773             dy02             = _mm256_sub_pd(iy0,jy2);
774             dz02             = _mm256_sub_pd(iz0,jz2);
775             dx10             = _mm256_sub_pd(ix1,jx0);
776             dy10             = _mm256_sub_pd(iy1,jy0);
777             dz10             = _mm256_sub_pd(iz1,jz0);
778             dx11             = _mm256_sub_pd(ix1,jx1);
779             dy11             = _mm256_sub_pd(iy1,jy1);
780             dz11             = _mm256_sub_pd(iz1,jz1);
781             dx12             = _mm256_sub_pd(ix1,jx2);
782             dy12             = _mm256_sub_pd(iy1,jy2);
783             dz12             = _mm256_sub_pd(iz1,jz2);
784             dx20             = _mm256_sub_pd(ix2,jx0);
785             dy20             = _mm256_sub_pd(iy2,jy0);
786             dz20             = _mm256_sub_pd(iz2,jz0);
787             dx21             = _mm256_sub_pd(ix2,jx1);
788             dy21             = _mm256_sub_pd(iy2,jy1);
789             dz21             = _mm256_sub_pd(iz2,jz1);
790             dx22             = _mm256_sub_pd(ix2,jx2);
791             dy22             = _mm256_sub_pd(iy2,jy2);
792             dz22             = _mm256_sub_pd(iz2,jz2);
793
794             /* Calculate squared distance and things based on it */
795             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
796             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
797             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
798             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
799             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
800             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
801             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
802             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
803             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
804
805             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
806             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
807             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
808             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
809             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
810             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
811             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
812             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
813             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
814
815             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
816             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
817             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
818             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
819             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
820             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
821             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
822             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
823             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
824
825             fjx0             = _mm256_setzero_pd();
826             fjy0             = _mm256_setzero_pd();
827             fjz0             = _mm256_setzero_pd();
828             fjx1             = _mm256_setzero_pd();
829             fjy1             = _mm256_setzero_pd();
830             fjz1             = _mm256_setzero_pd();
831             fjx2             = _mm256_setzero_pd();
832             fjy2             = _mm256_setzero_pd();
833             fjz2             = _mm256_setzero_pd();
834
835             /**************************
836              * CALCULATE INTERACTIONS *
837              **************************/
838
839             r00              = _mm256_mul_pd(rsq00,rinv00);
840             r00              = _mm256_andnot_pd(dummy_mask,r00);
841
842             /* EWALD ELECTROSTATICS */
843
844             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
845             ewrt             = _mm256_mul_pd(r00,ewtabscale);
846             ewitab           = _mm256_cvttpd_epi32(ewrt);
847             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
848             ewitab           = _mm_slli_epi32(ewitab,2);
849             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
850             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
851             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
852             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
853             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
854             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
855             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
856             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
857             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
858
859             /* Analytical LJ-PME */
860             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
861             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
862             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
863             exponent         = gmx_simd_exp_d(ewcljrsq);
864             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
865             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
866             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
867             vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
868             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
869             vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
870             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
871             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
872
873             /* Update potential sum for this i atom from the interaction with this j atom. */
874             velec            = _mm256_andnot_pd(dummy_mask,velec);
875             velecsum         = _mm256_add_pd(velecsum,velec);
876             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
877             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
878
879             fscal            = _mm256_add_pd(felec,fvdw);
880
881             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
882
883             /* Calculate temporary vectorial force */
884             tx               = _mm256_mul_pd(fscal,dx00);
885             ty               = _mm256_mul_pd(fscal,dy00);
886             tz               = _mm256_mul_pd(fscal,dz00);
887
888             /* Update vectorial force */
889             fix0             = _mm256_add_pd(fix0,tx);
890             fiy0             = _mm256_add_pd(fiy0,ty);
891             fiz0             = _mm256_add_pd(fiz0,tz);
892
893             fjx0             = _mm256_add_pd(fjx0,tx);
894             fjy0             = _mm256_add_pd(fjy0,ty);
895             fjz0             = _mm256_add_pd(fjz0,tz);
896
897             /**************************
898              * CALCULATE INTERACTIONS *
899              **************************/
900
901             r01              = _mm256_mul_pd(rsq01,rinv01);
902             r01              = _mm256_andnot_pd(dummy_mask,r01);
903
904             /* EWALD ELECTROSTATICS */
905
906             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
907             ewrt             = _mm256_mul_pd(r01,ewtabscale);
908             ewitab           = _mm256_cvttpd_epi32(ewrt);
909             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
910             ewitab           = _mm_slli_epi32(ewitab,2);
911             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
912             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
913             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
914             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
915             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
916             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
917             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
918             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
919             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
920
921             /* Update potential sum for this i atom from the interaction with this j atom. */
922             velec            = _mm256_andnot_pd(dummy_mask,velec);
923             velecsum         = _mm256_add_pd(velecsum,velec);
924
925             fscal            = felec;
926
927             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
928
929             /* Calculate temporary vectorial force */
930             tx               = _mm256_mul_pd(fscal,dx01);
931             ty               = _mm256_mul_pd(fscal,dy01);
932             tz               = _mm256_mul_pd(fscal,dz01);
933
934             /* Update vectorial force */
935             fix0             = _mm256_add_pd(fix0,tx);
936             fiy0             = _mm256_add_pd(fiy0,ty);
937             fiz0             = _mm256_add_pd(fiz0,tz);
938
939             fjx1             = _mm256_add_pd(fjx1,tx);
940             fjy1             = _mm256_add_pd(fjy1,ty);
941             fjz1             = _mm256_add_pd(fjz1,tz);
942
943             /**************************
944              * CALCULATE INTERACTIONS *
945              **************************/
946
947             r02              = _mm256_mul_pd(rsq02,rinv02);
948             r02              = _mm256_andnot_pd(dummy_mask,r02);
949
950             /* EWALD ELECTROSTATICS */
951
952             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
953             ewrt             = _mm256_mul_pd(r02,ewtabscale);
954             ewitab           = _mm256_cvttpd_epi32(ewrt);
955             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
956             ewitab           = _mm_slli_epi32(ewitab,2);
957             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
958             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
959             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
960             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
961             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
962             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
963             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
964             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
965             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
966
967             /* Update potential sum for this i atom from the interaction with this j atom. */
968             velec            = _mm256_andnot_pd(dummy_mask,velec);
969             velecsum         = _mm256_add_pd(velecsum,velec);
970
971             fscal            = felec;
972
973             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
974
975             /* Calculate temporary vectorial force */
976             tx               = _mm256_mul_pd(fscal,dx02);
977             ty               = _mm256_mul_pd(fscal,dy02);
978             tz               = _mm256_mul_pd(fscal,dz02);
979
980             /* Update vectorial force */
981             fix0             = _mm256_add_pd(fix0,tx);
982             fiy0             = _mm256_add_pd(fiy0,ty);
983             fiz0             = _mm256_add_pd(fiz0,tz);
984
985             fjx2             = _mm256_add_pd(fjx2,tx);
986             fjy2             = _mm256_add_pd(fjy2,ty);
987             fjz2             = _mm256_add_pd(fjz2,tz);
988
989             /**************************
990              * CALCULATE INTERACTIONS *
991              **************************/
992
993             r10              = _mm256_mul_pd(rsq10,rinv10);
994             r10              = _mm256_andnot_pd(dummy_mask,r10);
995
996             /* EWALD ELECTROSTATICS */
997
998             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
999             ewrt             = _mm256_mul_pd(r10,ewtabscale);
1000             ewitab           = _mm256_cvttpd_epi32(ewrt);
1001             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1002             ewitab           = _mm_slli_epi32(ewitab,2);
1003             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1004             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1005             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1006             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1007             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1008             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1009             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1010             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
1011             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1012
1013             /* Update potential sum for this i atom from the interaction with this j atom. */
1014             velec            = _mm256_andnot_pd(dummy_mask,velec);
1015             velecsum         = _mm256_add_pd(velecsum,velec);
1016
1017             fscal            = felec;
1018
1019             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1020
1021             /* Calculate temporary vectorial force */
1022             tx               = _mm256_mul_pd(fscal,dx10);
1023             ty               = _mm256_mul_pd(fscal,dy10);
1024             tz               = _mm256_mul_pd(fscal,dz10);
1025
1026             /* Update vectorial force */
1027             fix1             = _mm256_add_pd(fix1,tx);
1028             fiy1             = _mm256_add_pd(fiy1,ty);
1029             fiz1             = _mm256_add_pd(fiz1,tz);
1030
1031             fjx0             = _mm256_add_pd(fjx0,tx);
1032             fjy0             = _mm256_add_pd(fjy0,ty);
1033             fjz0             = _mm256_add_pd(fjz0,tz);
1034
1035             /**************************
1036              * CALCULATE INTERACTIONS *
1037              **************************/
1038
1039             r11              = _mm256_mul_pd(rsq11,rinv11);
1040             r11              = _mm256_andnot_pd(dummy_mask,r11);
1041
1042             /* EWALD ELECTROSTATICS */
1043
1044             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1045             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1046             ewitab           = _mm256_cvttpd_epi32(ewrt);
1047             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1048             ewitab           = _mm_slli_epi32(ewitab,2);
1049             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1050             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1051             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1052             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1053             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1054             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1055             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1056             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1057             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1058
1059             /* Update potential sum for this i atom from the interaction with this j atom. */
1060             velec            = _mm256_andnot_pd(dummy_mask,velec);
1061             velecsum         = _mm256_add_pd(velecsum,velec);
1062
1063             fscal            = felec;
1064
1065             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1066
1067             /* Calculate temporary vectorial force */
1068             tx               = _mm256_mul_pd(fscal,dx11);
1069             ty               = _mm256_mul_pd(fscal,dy11);
1070             tz               = _mm256_mul_pd(fscal,dz11);
1071
1072             /* Update vectorial force */
1073             fix1             = _mm256_add_pd(fix1,tx);
1074             fiy1             = _mm256_add_pd(fiy1,ty);
1075             fiz1             = _mm256_add_pd(fiz1,tz);
1076
1077             fjx1             = _mm256_add_pd(fjx1,tx);
1078             fjy1             = _mm256_add_pd(fjy1,ty);
1079             fjz1             = _mm256_add_pd(fjz1,tz);
1080
1081             /**************************
1082              * CALCULATE INTERACTIONS *
1083              **************************/
1084
1085             r12              = _mm256_mul_pd(rsq12,rinv12);
1086             r12              = _mm256_andnot_pd(dummy_mask,r12);
1087
1088             /* EWALD ELECTROSTATICS */
1089
1090             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1091             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1092             ewitab           = _mm256_cvttpd_epi32(ewrt);
1093             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1094             ewitab           = _mm_slli_epi32(ewitab,2);
1095             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1096             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1097             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1098             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1099             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1100             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1101             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1102             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1103             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1104
1105             /* Update potential sum for this i atom from the interaction with this j atom. */
1106             velec            = _mm256_andnot_pd(dummy_mask,velec);
1107             velecsum         = _mm256_add_pd(velecsum,velec);
1108
1109             fscal            = felec;
1110
1111             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1112
1113             /* Calculate temporary vectorial force */
1114             tx               = _mm256_mul_pd(fscal,dx12);
1115             ty               = _mm256_mul_pd(fscal,dy12);
1116             tz               = _mm256_mul_pd(fscal,dz12);
1117
1118             /* Update vectorial force */
1119             fix1             = _mm256_add_pd(fix1,tx);
1120             fiy1             = _mm256_add_pd(fiy1,ty);
1121             fiz1             = _mm256_add_pd(fiz1,tz);
1122
1123             fjx2             = _mm256_add_pd(fjx2,tx);
1124             fjy2             = _mm256_add_pd(fjy2,ty);
1125             fjz2             = _mm256_add_pd(fjz2,tz);
1126
1127             /**************************
1128              * CALCULATE INTERACTIONS *
1129              **************************/
1130
1131             r20              = _mm256_mul_pd(rsq20,rinv20);
1132             r20              = _mm256_andnot_pd(dummy_mask,r20);
1133
1134             /* EWALD ELECTROSTATICS */
1135
1136             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1137             ewrt             = _mm256_mul_pd(r20,ewtabscale);
1138             ewitab           = _mm256_cvttpd_epi32(ewrt);
1139             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1140             ewitab           = _mm_slli_epi32(ewitab,2);
1141             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1142             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1143             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1144             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1145             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1146             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1147             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1148             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1149             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1150
1151             /* Update potential sum for this i atom from the interaction with this j atom. */
1152             velec            = _mm256_andnot_pd(dummy_mask,velec);
1153             velecsum         = _mm256_add_pd(velecsum,velec);
1154
1155             fscal            = felec;
1156
1157             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1158
1159             /* Calculate temporary vectorial force */
1160             tx               = _mm256_mul_pd(fscal,dx20);
1161             ty               = _mm256_mul_pd(fscal,dy20);
1162             tz               = _mm256_mul_pd(fscal,dz20);
1163
1164             /* Update vectorial force */
1165             fix2             = _mm256_add_pd(fix2,tx);
1166             fiy2             = _mm256_add_pd(fiy2,ty);
1167             fiz2             = _mm256_add_pd(fiz2,tz);
1168
1169             fjx0             = _mm256_add_pd(fjx0,tx);
1170             fjy0             = _mm256_add_pd(fjy0,ty);
1171             fjz0             = _mm256_add_pd(fjz0,tz);
1172
1173             /**************************
1174              * CALCULATE INTERACTIONS *
1175              **************************/
1176
1177             r21              = _mm256_mul_pd(rsq21,rinv21);
1178             r21              = _mm256_andnot_pd(dummy_mask,r21);
1179
1180             /* EWALD ELECTROSTATICS */
1181
1182             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1183             ewrt             = _mm256_mul_pd(r21,ewtabscale);
1184             ewitab           = _mm256_cvttpd_epi32(ewrt);
1185             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1186             ewitab           = _mm_slli_epi32(ewitab,2);
1187             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1188             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1189             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1190             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1191             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1192             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1193             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1194             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1195             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1196
1197             /* Update potential sum for this i atom from the interaction with this j atom. */
1198             velec            = _mm256_andnot_pd(dummy_mask,velec);
1199             velecsum         = _mm256_add_pd(velecsum,velec);
1200
1201             fscal            = felec;
1202
1203             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1204
1205             /* Calculate temporary vectorial force */
1206             tx               = _mm256_mul_pd(fscal,dx21);
1207             ty               = _mm256_mul_pd(fscal,dy21);
1208             tz               = _mm256_mul_pd(fscal,dz21);
1209
1210             /* Update vectorial force */
1211             fix2             = _mm256_add_pd(fix2,tx);
1212             fiy2             = _mm256_add_pd(fiy2,ty);
1213             fiz2             = _mm256_add_pd(fiz2,tz);
1214
1215             fjx1             = _mm256_add_pd(fjx1,tx);
1216             fjy1             = _mm256_add_pd(fjy1,ty);
1217             fjz1             = _mm256_add_pd(fjz1,tz);
1218
1219             /**************************
1220              * CALCULATE INTERACTIONS *
1221              **************************/
1222
1223             r22              = _mm256_mul_pd(rsq22,rinv22);
1224             r22              = _mm256_andnot_pd(dummy_mask,r22);
1225
1226             /* EWALD ELECTROSTATICS */
1227
1228             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1229             ewrt             = _mm256_mul_pd(r22,ewtabscale);
1230             ewitab           = _mm256_cvttpd_epi32(ewrt);
1231             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1232             ewitab           = _mm_slli_epi32(ewitab,2);
1233             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1234             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1235             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1236             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1237             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1238             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1239             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1240             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1241             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1242
1243             /* Update potential sum for this i atom from the interaction with this j atom. */
1244             velec            = _mm256_andnot_pd(dummy_mask,velec);
1245             velecsum         = _mm256_add_pd(velecsum,velec);
1246
1247             fscal            = felec;
1248
1249             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1250
1251             /* Calculate temporary vectorial force */
1252             tx               = _mm256_mul_pd(fscal,dx22);
1253             ty               = _mm256_mul_pd(fscal,dy22);
1254             tz               = _mm256_mul_pd(fscal,dz22);
1255
1256             /* Update vectorial force */
1257             fix2             = _mm256_add_pd(fix2,tx);
1258             fiy2             = _mm256_add_pd(fiy2,ty);
1259             fiz2             = _mm256_add_pd(fiz2,tz);
1260
1261             fjx2             = _mm256_add_pd(fjx2,tx);
1262             fjy2             = _mm256_add_pd(fjy2,ty);
1263             fjz2             = _mm256_add_pd(fjz2,tz);
1264
1265             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1266             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1267             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1268             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1269
1270             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1271                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1272
1273             /* Inner loop uses 409 flops */
1274         }
1275
1276         /* End of innermost loop */
1277
1278         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1279                                                  f+i_coord_offset,fshift+i_shift_offset);
1280
1281         ggid                        = gid[iidx];
1282         /* Update potential energies */
1283         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1284         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1285
1286         /* Increment number of inner iterations */
1287         inneriter                  += j_index_end - j_index_start;
1288
1289         /* Outer loop uses 20 flops */
1290     }
1291
1292     /* Increment number of outer iterations */
1293     outeriter        += nri;
1294
1295     /* Update outer/inner flops */
1296
1297     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1298 }
1299 /*
1300  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_256_double
1301  * Electrostatics interaction: Ewald
1302  * VdW interaction:            LJEwald
1303  * Geometry:                   Water3-Water3
1304  * Calculate force/pot:        Force
1305  */
1306 void
1307 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_256_double
1308                     (t_nblist                    * gmx_restrict       nlist,
1309                      rvec                        * gmx_restrict          xx,
1310                      rvec                        * gmx_restrict          ff,
1311                      t_forcerec                  * gmx_restrict          fr,
1312                      t_mdatoms                   * gmx_restrict     mdatoms,
1313                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1314                      t_nrnb                      * gmx_restrict        nrnb)
1315 {
1316     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1317      * just 0 for non-waters.
1318      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1319      * jnr indices corresponding to data put in the four positions in the SIMD register.
1320      */
1321     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1322     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1323     int              jnrA,jnrB,jnrC,jnrD;
1324     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1325     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1326     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1327     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1328     real             rcutoff_scalar;
1329     real             *shiftvec,*fshift,*x,*f;
1330     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1331     real             scratch[4*DIM];
1332     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1333     real *           vdwioffsetptr0;
1334     real *           vdwgridioffsetptr0;
1335     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1336     real *           vdwioffsetptr1;
1337     real *           vdwgridioffsetptr1;
1338     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1339     real *           vdwioffsetptr2;
1340     real *           vdwgridioffsetptr2;
1341     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1342     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1343     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1344     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1345     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1346     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1347     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1348     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1349     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1350     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1351     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1352     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1353     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1354     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1355     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1356     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1357     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1358     real             *charge;
1359     int              nvdwtype;
1360     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1361     int              *vdwtype;
1362     real             *vdwparam;
1363     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1364     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1365     __m256d           c6grid_00;
1366     __m256d           c6grid_01;
1367     __m256d           c6grid_02;
1368     __m256d           c6grid_10;
1369     __m256d           c6grid_11;
1370     __m256d           c6grid_12;
1371     __m256d           c6grid_20;
1372     __m256d           c6grid_21;
1373     __m256d           c6grid_22;
1374     real             *vdwgridparam;
1375     __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1376     __m256d           one_half  = _mm256_set1_pd(0.5);
1377     __m256d           minus_one = _mm256_set1_pd(-1.0);
1378     __m128i          ewitab;
1379     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1380     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1381     real             *ewtab;
1382     __m256d          dummy_mask,cutoff_mask;
1383     __m128           tmpmask0,tmpmask1;
1384     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1385     __m256d          one     = _mm256_set1_pd(1.0);
1386     __m256d          two     = _mm256_set1_pd(2.0);
1387     x                = xx[0];
1388     f                = ff[0];
1389
1390     nri              = nlist->nri;
1391     iinr             = nlist->iinr;
1392     jindex           = nlist->jindex;
1393     jjnr             = nlist->jjnr;
1394     shiftidx         = nlist->shift;
1395     gid              = nlist->gid;
1396     shiftvec         = fr->shift_vec[0];
1397     fshift           = fr->fshift[0];
1398     facel            = _mm256_set1_pd(fr->epsfac);
1399     charge           = mdatoms->chargeA;
1400     nvdwtype         = fr->ntype;
1401     vdwparam         = fr->nbfp;
1402     vdwtype          = mdatoms->typeA;
1403     vdwgridparam     = fr->ljpme_c6grid;
1404     sh_lj_ewald      = _mm256_set1_pd(fr->ic->sh_lj_ewald);
1405     ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
1406     ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
1407
1408     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
1409     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1410     beta2            = _mm256_mul_pd(beta,beta);
1411     beta3            = _mm256_mul_pd(beta,beta2);
1412
1413     ewtab            = fr->ic->tabq_coul_F;
1414     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
1415     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1416
1417     /* Setup water-specific parameters */
1418     inr              = nlist->iinr[0];
1419     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1420     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1421     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1422     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1423     vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
1424
1425     jq0              = _mm256_set1_pd(charge[inr+0]);
1426     jq1              = _mm256_set1_pd(charge[inr+1]);
1427     jq2              = _mm256_set1_pd(charge[inr+2]);
1428     vdwjidx0A        = 2*vdwtype[inr+0];
1429     qq00             = _mm256_mul_pd(iq0,jq0);
1430     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1431     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1432     c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
1433     qq01             = _mm256_mul_pd(iq0,jq1);
1434     qq02             = _mm256_mul_pd(iq0,jq2);
1435     qq10             = _mm256_mul_pd(iq1,jq0);
1436     qq11             = _mm256_mul_pd(iq1,jq1);
1437     qq12             = _mm256_mul_pd(iq1,jq2);
1438     qq20             = _mm256_mul_pd(iq2,jq0);
1439     qq21             = _mm256_mul_pd(iq2,jq1);
1440     qq22             = _mm256_mul_pd(iq2,jq2);
1441
1442     /* Avoid stupid compiler warnings */
1443     jnrA = jnrB = jnrC = jnrD = 0;
1444     j_coord_offsetA = 0;
1445     j_coord_offsetB = 0;
1446     j_coord_offsetC = 0;
1447     j_coord_offsetD = 0;
1448
1449     outeriter        = 0;
1450     inneriter        = 0;
1451
1452     for(iidx=0;iidx<4*DIM;iidx++)
1453     {
1454         scratch[iidx] = 0.0;
1455     }
1456
1457     /* Start outer loop over neighborlists */
1458     for(iidx=0; iidx<nri; iidx++)
1459     {
1460         /* Load shift vector for this list */
1461         i_shift_offset   = DIM*shiftidx[iidx];
1462
1463         /* Load limits for loop over neighbors */
1464         j_index_start    = jindex[iidx];
1465         j_index_end      = jindex[iidx+1];
1466
1467         /* Get outer coordinate index */
1468         inr              = iinr[iidx];
1469         i_coord_offset   = DIM*inr;
1470
1471         /* Load i particle coords and add shift vector */
1472         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1473                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1474
1475         fix0             = _mm256_setzero_pd();
1476         fiy0             = _mm256_setzero_pd();
1477         fiz0             = _mm256_setzero_pd();
1478         fix1             = _mm256_setzero_pd();
1479         fiy1             = _mm256_setzero_pd();
1480         fiz1             = _mm256_setzero_pd();
1481         fix2             = _mm256_setzero_pd();
1482         fiy2             = _mm256_setzero_pd();
1483         fiz2             = _mm256_setzero_pd();
1484
1485         /* Start inner kernel loop */
1486         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1487         {
1488
1489             /* Get j neighbor index, and coordinate index */
1490             jnrA             = jjnr[jidx];
1491             jnrB             = jjnr[jidx+1];
1492             jnrC             = jjnr[jidx+2];
1493             jnrD             = jjnr[jidx+3];
1494             j_coord_offsetA  = DIM*jnrA;
1495             j_coord_offsetB  = DIM*jnrB;
1496             j_coord_offsetC  = DIM*jnrC;
1497             j_coord_offsetD  = DIM*jnrD;
1498
1499             /* load j atom coordinates */
1500             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1501                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1502                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1503
1504             /* Calculate displacement vector */
1505             dx00             = _mm256_sub_pd(ix0,jx0);
1506             dy00             = _mm256_sub_pd(iy0,jy0);
1507             dz00             = _mm256_sub_pd(iz0,jz0);
1508             dx01             = _mm256_sub_pd(ix0,jx1);
1509             dy01             = _mm256_sub_pd(iy0,jy1);
1510             dz01             = _mm256_sub_pd(iz0,jz1);
1511             dx02             = _mm256_sub_pd(ix0,jx2);
1512             dy02             = _mm256_sub_pd(iy0,jy2);
1513             dz02             = _mm256_sub_pd(iz0,jz2);
1514             dx10             = _mm256_sub_pd(ix1,jx0);
1515             dy10             = _mm256_sub_pd(iy1,jy0);
1516             dz10             = _mm256_sub_pd(iz1,jz0);
1517             dx11             = _mm256_sub_pd(ix1,jx1);
1518             dy11             = _mm256_sub_pd(iy1,jy1);
1519             dz11             = _mm256_sub_pd(iz1,jz1);
1520             dx12             = _mm256_sub_pd(ix1,jx2);
1521             dy12             = _mm256_sub_pd(iy1,jy2);
1522             dz12             = _mm256_sub_pd(iz1,jz2);
1523             dx20             = _mm256_sub_pd(ix2,jx0);
1524             dy20             = _mm256_sub_pd(iy2,jy0);
1525             dz20             = _mm256_sub_pd(iz2,jz0);
1526             dx21             = _mm256_sub_pd(ix2,jx1);
1527             dy21             = _mm256_sub_pd(iy2,jy1);
1528             dz21             = _mm256_sub_pd(iz2,jz1);
1529             dx22             = _mm256_sub_pd(ix2,jx2);
1530             dy22             = _mm256_sub_pd(iy2,jy2);
1531             dz22             = _mm256_sub_pd(iz2,jz2);
1532
1533             /* Calculate squared distance and things based on it */
1534             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1535             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1536             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1537             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1538             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1539             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1540             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1541             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1542             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1543
1544             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1545             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
1546             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
1547             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
1548             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1549             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1550             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
1551             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1552             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1553
1554             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1555             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
1556             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
1557             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1558             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1559             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1560             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1561             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1562             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1563
1564             fjx0             = _mm256_setzero_pd();
1565             fjy0             = _mm256_setzero_pd();
1566             fjz0             = _mm256_setzero_pd();
1567             fjx1             = _mm256_setzero_pd();
1568             fjy1             = _mm256_setzero_pd();
1569             fjz1             = _mm256_setzero_pd();
1570             fjx2             = _mm256_setzero_pd();
1571             fjy2             = _mm256_setzero_pd();
1572             fjz2             = _mm256_setzero_pd();
1573
1574             /**************************
1575              * CALCULATE INTERACTIONS *
1576              **************************/
1577
1578             r00              = _mm256_mul_pd(rsq00,rinv00);
1579
1580             /* EWALD ELECTROSTATICS */
1581
1582             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1583             ewrt             = _mm256_mul_pd(r00,ewtabscale);
1584             ewitab           = _mm256_cvttpd_epi32(ewrt);
1585             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1586             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1587                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1588                                             &ewtabF,&ewtabFn);
1589             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1590             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1591
1592             /* Analytical LJ-PME */
1593             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1594             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
1595             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
1596             exponent         = gmx_simd_exp_d(ewcljrsq);
1597             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1598             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1599             /* f6A = 6 * C6grid * (1 - poly) */
1600             f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
1601             /* f6B = C6grid * exponent * beta^6 */
1602             f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
1603             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1604             fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1605
1606             fscal            = _mm256_add_pd(felec,fvdw);
1607
1608             /* Calculate temporary vectorial force */
1609             tx               = _mm256_mul_pd(fscal,dx00);
1610             ty               = _mm256_mul_pd(fscal,dy00);
1611             tz               = _mm256_mul_pd(fscal,dz00);
1612
1613             /* Update vectorial force */
1614             fix0             = _mm256_add_pd(fix0,tx);
1615             fiy0             = _mm256_add_pd(fiy0,ty);
1616             fiz0             = _mm256_add_pd(fiz0,tz);
1617
1618             fjx0             = _mm256_add_pd(fjx0,tx);
1619             fjy0             = _mm256_add_pd(fjy0,ty);
1620             fjz0             = _mm256_add_pd(fjz0,tz);
1621
1622             /**************************
1623              * CALCULATE INTERACTIONS *
1624              **************************/
1625
1626             r01              = _mm256_mul_pd(rsq01,rinv01);
1627
1628             /* EWALD ELECTROSTATICS */
1629
1630             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1631             ewrt             = _mm256_mul_pd(r01,ewtabscale);
1632             ewitab           = _mm256_cvttpd_epi32(ewrt);
1633             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1634             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1635                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1636                                             &ewtabF,&ewtabFn);
1637             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1638             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1639
1640             fscal            = felec;
1641
1642             /* Calculate temporary vectorial force */
1643             tx               = _mm256_mul_pd(fscal,dx01);
1644             ty               = _mm256_mul_pd(fscal,dy01);
1645             tz               = _mm256_mul_pd(fscal,dz01);
1646
1647             /* Update vectorial force */
1648             fix0             = _mm256_add_pd(fix0,tx);
1649             fiy0             = _mm256_add_pd(fiy0,ty);
1650             fiz0             = _mm256_add_pd(fiz0,tz);
1651
1652             fjx1             = _mm256_add_pd(fjx1,tx);
1653             fjy1             = _mm256_add_pd(fjy1,ty);
1654             fjz1             = _mm256_add_pd(fjz1,tz);
1655
1656             /**************************
1657              * CALCULATE INTERACTIONS *
1658              **************************/
1659
1660             r02              = _mm256_mul_pd(rsq02,rinv02);
1661
1662             /* EWALD ELECTROSTATICS */
1663
1664             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1665             ewrt             = _mm256_mul_pd(r02,ewtabscale);
1666             ewitab           = _mm256_cvttpd_epi32(ewrt);
1667             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1668             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1669                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1670                                             &ewtabF,&ewtabFn);
1671             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1672             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1673
1674             fscal            = felec;
1675
1676             /* Calculate temporary vectorial force */
1677             tx               = _mm256_mul_pd(fscal,dx02);
1678             ty               = _mm256_mul_pd(fscal,dy02);
1679             tz               = _mm256_mul_pd(fscal,dz02);
1680
1681             /* Update vectorial force */
1682             fix0             = _mm256_add_pd(fix0,tx);
1683             fiy0             = _mm256_add_pd(fiy0,ty);
1684             fiz0             = _mm256_add_pd(fiz0,tz);
1685
1686             fjx2             = _mm256_add_pd(fjx2,tx);
1687             fjy2             = _mm256_add_pd(fjy2,ty);
1688             fjz2             = _mm256_add_pd(fjz2,tz);
1689
1690             /**************************
1691              * CALCULATE INTERACTIONS *
1692              **************************/
1693
1694             r10              = _mm256_mul_pd(rsq10,rinv10);
1695
1696             /* EWALD ELECTROSTATICS */
1697
1698             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1699             ewrt             = _mm256_mul_pd(r10,ewtabscale);
1700             ewitab           = _mm256_cvttpd_epi32(ewrt);
1701             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1702             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1703                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1704                                             &ewtabF,&ewtabFn);
1705             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1706             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1707
1708             fscal            = felec;
1709
1710             /* Calculate temporary vectorial force */
1711             tx               = _mm256_mul_pd(fscal,dx10);
1712             ty               = _mm256_mul_pd(fscal,dy10);
1713             tz               = _mm256_mul_pd(fscal,dz10);
1714
1715             /* Update vectorial force */
1716             fix1             = _mm256_add_pd(fix1,tx);
1717             fiy1             = _mm256_add_pd(fiy1,ty);
1718             fiz1             = _mm256_add_pd(fiz1,tz);
1719
1720             fjx0             = _mm256_add_pd(fjx0,tx);
1721             fjy0             = _mm256_add_pd(fjy0,ty);
1722             fjz0             = _mm256_add_pd(fjz0,tz);
1723
1724             /**************************
1725              * CALCULATE INTERACTIONS *
1726              **************************/
1727
1728             r11              = _mm256_mul_pd(rsq11,rinv11);
1729
1730             /* EWALD ELECTROSTATICS */
1731
1732             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1733             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1734             ewitab           = _mm256_cvttpd_epi32(ewrt);
1735             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1736             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1737                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1738                                             &ewtabF,&ewtabFn);
1739             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1740             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1741
1742             fscal            = felec;
1743
1744             /* Calculate temporary vectorial force */
1745             tx               = _mm256_mul_pd(fscal,dx11);
1746             ty               = _mm256_mul_pd(fscal,dy11);
1747             tz               = _mm256_mul_pd(fscal,dz11);
1748
1749             /* Update vectorial force */
1750             fix1             = _mm256_add_pd(fix1,tx);
1751             fiy1             = _mm256_add_pd(fiy1,ty);
1752             fiz1             = _mm256_add_pd(fiz1,tz);
1753
1754             fjx1             = _mm256_add_pd(fjx1,tx);
1755             fjy1             = _mm256_add_pd(fjy1,ty);
1756             fjz1             = _mm256_add_pd(fjz1,tz);
1757
1758             /**************************
1759              * CALCULATE INTERACTIONS *
1760              **************************/
1761
1762             r12              = _mm256_mul_pd(rsq12,rinv12);
1763
1764             /* EWALD ELECTROSTATICS */
1765
1766             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1767             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1768             ewitab           = _mm256_cvttpd_epi32(ewrt);
1769             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1770             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1771                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1772                                             &ewtabF,&ewtabFn);
1773             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1774             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1775
1776             fscal            = felec;
1777
1778             /* Calculate temporary vectorial force */
1779             tx               = _mm256_mul_pd(fscal,dx12);
1780             ty               = _mm256_mul_pd(fscal,dy12);
1781             tz               = _mm256_mul_pd(fscal,dz12);
1782
1783             /* Update vectorial force */
1784             fix1             = _mm256_add_pd(fix1,tx);
1785             fiy1             = _mm256_add_pd(fiy1,ty);
1786             fiz1             = _mm256_add_pd(fiz1,tz);
1787
1788             fjx2             = _mm256_add_pd(fjx2,tx);
1789             fjy2             = _mm256_add_pd(fjy2,ty);
1790             fjz2             = _mm256_add_pd(fjz2,tz);
1791
1792             /**************************
1793              * CALCULATE INTERACTIONS *
1794              **************************/
1795
1796             r20              = _mm256_mul_pd(rsq20,rinv20);
1797
1798             /* EWALD ELECTROSTATICS */
1799
1800             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1801             ewrt             = _mm256_mul_pd(r20,ewtabscale);
1802             ewitab           = _mm256_cvttpd_epi32(ewrt);
1803             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1804             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1805                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1806                                             &ewtabF,&ewtabFn);
1807             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1808             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1809
1810             fscal            = felec;
1811
1812             /* Calculate temporary vectorial force */
1813             tx               = _mm256_mul_pd(fscal,dx20);
1814             ty               = _mm256_mul_pd(fscal,dy20);
1815             tz               = _mm256_mul_pd(fscal,dz20);
1816
1817             /* Update vectorial force */
1818             fix2             = _mm256_add_pd(fix2,tx);
1819             fiy2             = _mm256_add_pd(fiy2,ty);
1820             fiz2             = _mm256_add_pd(fiz2,tz);
1821
1822             fjx0             = _mm256_add_pd(fjx0,tx);
1823             fjy0             = _mm256_add_pd(fjy0,ty);
1824             fjz0             = _mm256_add_pd(fjz0,tz);
1825
1826             /**************************
1827              * CALCULATE INTERACTIONS *
1828              **************************/
1829
1830             r21              = _mm256_mul_pd(rsq21,rinv21);
1831
1832             /* EWALD ELECTROSTATICS */
1833
1834             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1835             ewrt             = _mm256_mul_pd(r21,ewtabscale);
1836             ewitab           = _mm256_cvttpd_epi32(ewrt);
1837             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1838             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1839                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1840                                             &ewtabF,&ewtabFn);
1841             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1842             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1843
1844             fscal            = felec;
1845
1846             /* Calculate temporary vectorial force */
1847             tx               = _mm256_mul_pd(fscal,dx21);
1848             ty               = _mm256_mul_pd(fscal,dy21);
1849             tz               = _mm256_mul_pd(fscal,dz21);
1850
1851             /* Update vectorial force */
1852             fix2             = _mm256_add_pd(fix2,tx);
1853             fiy2             = _mm256_add_pd(fiy2,ty);
1854             fiz2             = _mm256_add_pd(fiz2,tz);
1855
1856             fjx1             = _mm256_add_pd(fjx1,tx);
1857             fjy1             = _mm256_add_pd(fjy1,ty);
1858             fjz1             = _mm256_add_pd(fjz1,tz);
1859
1860             /**************************
1861              * CALCULATE INTERACTIONS *
1862              **************************/
1863
1864             r22              = _mm256_mul_pd(rsq22,rinv22);
1865
1866             /* EWALD ELECTROSTATICS */
1867
1868             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1869             ewrt             = _mm256_mul_pd(r22,ewtabscale);
1870             ewitab           = _mm256_cvttpd_epi32(ewrt);
1871             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1872             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1873                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1874                                             &ewtabF,&ewtabFn);
1875             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1876             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1877
1878             fscal            = felec;
1879
1880             /* Calculate temporary vectorial force */
1881             tx               = _mm256_mul_pd(fscal,dx22);
1882             ty               = _mm256_mul_pd(fscal,dy22);
1883             tz               = _mm256_mul_pd(fscal,dz22);
1884
1885             /* Update vectorial force */
1886             fix2             = _mm256_add_pd(fix2,tx);
1887             fiy2             = _mm256_add_pd(fiy2,ty);
1888             fiz2             = _mm256_add_pd(fiz2,tz);
1889
1890             fjx2             = _mm256_add_pd(fjx2,tx);
1891             fjy2             = _mm256_add_pd(fjy2,ty);
1892             fjz2             = _mm256_add_pd(fjz2,tz);
1893
1894             fjptrA             = f+j_coord_offsetA;
1895             fjptrB             = f+j_coord_offsetB;
1896             fjptrC             = f+j_coord_offsetC;
1897             fjptrD             = f+j_coord_offsetD;
1898
1899             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1900                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1901
1902             /* Inner loop uses 347 flops */
1903         }
1904
1905         if(jidx<j_index_end)
1906         {
1907
1908             /* Get j neighbor index, and coordinate index */
1909             jnrlistA         = jjnr[jidx];
1910             jnrlistB         = jjnr[jidx+1];
1911             jnrlistC         = jjnr[jidx+2];
1912             jnrlistD         = jjnr[jidx+3];
1913             /* Sign of each element will be negative for non-real atoms.
1914              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1915              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1916              */
1917             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1918
1919             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1920             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1921             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1922
1923             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1924             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1925             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1926             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1927             j_coord_offsetA  = DIM*jnrA;
1928             j_coord_offsetB  = DIM*jnrB;
1929             j_coord_offsetC  = DIM*jnrC;
1930             j_coord_offsetD  = DIM*jnrD;
1931
1932             /* load j atom coordinates */
1933             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1934                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1935                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1936
1937             /* Calculate displacement vector */
1938             dx00             = _mm256_sub_pd(ix0,jx0);
1939             dy00             = _mm256_sub_pd(iy0,jy0);
1940             dz00             = _mm256_sub_pd(iz0,jz0);
1941             dx01             = _mm256_sub_pd(ix0,jx1);
1942             dy01             = _mm256_sub_pd(iy0,jy1);
1943             dz01             = _mm256_sub_pd(iz0,jz1);
1944             dx02             = _mm256_sub_pd(ix0,jx2);
1945             dy02             = _mm256_sub_pd(iy0,jy2);
1946             dz02             = _mm256_sub_pd(iz0,jz2);
1947             dx10             = _mm256_sub_pd(ix1,jx0);
1948             dy10             = _mm256_sub_pd(iy1,jy0);
1949             dz10             = _mm256_sub_pd(iz1,jz0);
1950             dx11             = _mm256_sub_pd(ix1,jx1);
1951             dy11             = _mm256_sub_pd(iy1,jy1);
1952             dz11             = _mm256_sub_pd(iz1,jz1);
1953             dx12             = _mm256_sub_pd(ix1,jx2);
1954             dy12             = _mm256_sub_pd(iy1,jy2);
1955             dz12             = _mm256_sub_pd(iz1,jz2);
1956             dx20             = _mm256_sub_pd(ix2,jx0);
1957             dy20             = _mm256_sub_pd(iy2,jy0);
1958             dz20             = _mm256_sub_pd(iz2,jz0);
1959             dx21             = _mm256_sub_pd(ix2,jx1);
1960             dy21             = _mm256_sub_pd(iy2,jy1);
1961             dz21             = _mm256_sub_pd(iz2,jz1);
1962             dx22             = _mm256_sub_pd(ix2,jx2);
1963             dy22             = _mm256_sub_pd(iy2,jy2);
1964             dz22             = _mm256_sub_pd(iz2,jz2);
1965
1966             /* Calculate squared distance and things based on it */
1967             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1968             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1969             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1970             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1971             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1972             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1973             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1974             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1975             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1976
1977             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1978             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
1979             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
1980             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
1981             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1982             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1983             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
1984             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1985             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1986
1987             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1988             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
1989             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
1990             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1991             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1992             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1993             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1994             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1995             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1996
1997             fjx0             = _mm256_setzero_pd();
1998             fjy0             = _mm256_setzero_pd();
1999             fjz0             = _mm256_setzero_pd();
2000             fjx1             = _mm256_setzero_pd();
2001             fjy1             = _mm256_setzero_pd();
2002             fjz1             = _mm256_setzero_pd();
2003             fjx2             = _mm256_setzero_pd();
2004             fjy2             = _mm256_setzero_pd();
2005             fjz2             = _mm256_setzero_pd();
2006
2007             /**************************
2008              * CALCULATE INTERACTIONS *
2009              **************************/
2010
2011             r00              = _mm256_mul_pd(rsq00,rinv00);
2012             r00              = _mm256_andnot_pd(dummy_mask,r00);
2013
2014             /* EWALD ELECTROSTATICS */
2015
2016             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2017             ewrt             = _mm256_mul_pd(r00,ewtabscale);
2018             ewitab           = _mm256_cvttpd_epi32(ewrt);
2019             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2020             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2021                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2022                                             &ewtabF,&ewtabFn);
2023             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2024             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2025
2026             /* Analytical LJ-PME */
2027             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2028             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
2029             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
2030             exponent         = gmx_simd_exp_d(ewcljrsq);
2031             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2032             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
2033             /* f6A = 6 * C6grid * (1 - poly) */
2034             f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
2035             /* f6B = C6grid * exponent * beta^6 */
2036             f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
2037             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2038             fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2039
2040             fscal            = _mm256_add_pd(felec,fvdw);
2041
2042             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2043
2044             /* Calculate temporary vectorial force */
2045             tx               = _mm256_mul_pd(fscal,dx00);
2046             ty               = _mm256_mul_pd(fscal,dy00);
2047             tz               = _mm256_mul_pd(fscal,dz00);
2048
2049             /* Update vectorial force */
2050             fix0             = _mm256_add_pd(fix0,tx);
2051             fiy0             = _mm256_add_pd(fiy0,ty);
2052             fiz0             = _mm256_add_pd(fiz0,tz);
2053
2054             fjx0             = _mm256_add_pd(fjx0,tx);
2055             fjy0             = _mm256_add_pd(fjy0,ty);
2056             fjz0             = _mm256_add_pd(fjz0,tz);
2057
2058             /**************************
2059              * CALCULATE INTERACTIONS *
2060              **************************/
2061
2062             r01              = _mm256_mul_pd(rsq01,rinv01);
2063             r01              = _mm256_andnot_pd(dummy_mask,r01);
2064
2065             /* EWALD ELECTROSTATICS */
2066
2067             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2068             ewrt             = _mm256_mul_pd(r01,ewtabscale);
2069             ewitab           = _mm256_cvttpd_epi32(ewrt);
2070             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2071             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2072                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2073                                             &ewtabF,&ewtabFn);
2074             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2075             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2076
2077             fscal            = felec;
2078
2079             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2080
2081             /* Calculate temporary vectorial force */
2082             tx               = _mm256_mul_pd(fscal,dx01);
2083             ty               = _mm256_mul_pd(fscal,dy01);
2084             tz               = _mm256_mul_pd(fscal,dz01);
2085
2086             /* Update vectorial force */
2087             fix0             = _mm256_add_pd(fix0,tx);
2088             fiy0             = _mm256_add_pd(fiy0,ty);
2089             fiz0             = _mm256_add_pd(fiz0,tz);
2090
2091             fjx1             = _mm256_add_pd(fjx1,tx);
2092             fjy1             = _mm256_add_pd(fjy1,ty);
2093             fjz1             = _mm256_add_pd(fjz1,tz);
2094
2095             /**************************
2096              * CALCULATE INTERACTIONS *
2097              **************************/
2098
2099             r02              = _mm256_mul_pd(rsq02,rinv02);
2100             r02              = _mm256_andnot_pd(dummy_mask,r02);
2101
2102             /* EWALD ELECTROSTATICS */
2103
2104             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2105             ewrt             = _mm256_mul_pd(r02,ewtabscale);
2106             ewitab           = _mm256_cvttpd_epi32(ewrt);
2107             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2108             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2109                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2110                                             &ewtabF,&ewtabFn);
2111             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2112             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2113
2114             fscal            = felec;
2115
2116             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2117
2118             /* Calculate temporary vectorial force */
2119             tx               = _mm256_mul_pd(fscal,dx02);
2120             ty               = _mm256_mul_pd(fscal,dy02);
2121             tz               = _mm256_mul_pd(fscal,dz02);
2122
2123             /* Update vectorial force */
2124             fix0             = _mm256_add_pd(fix0,tx);
2125             fiy0             = _mm256_add_pd(fiy0,ty);
2126             fiz0             = _mm256_add_pd(fiz0,tz);
2127
2128             fjx2             = _mm256_add_pd(fjx2,tx);
2129             fjy2             = _mm256_add_pd(fjy2,ty);
2130             fjz2             = _mm256_add_pd(fjz2,tz);
2131
2132             /**************************
2133              * CALCULATE INTERACTIONS *
2134              **************************/
2135
2136             r10              = _mm256_mul_pd(rsq10,rinv10);
2137             r10              = _mm256_andnot_pd(dummy_mask,r10);
2138
2139             /* EWALD ELECTROSTATICS */
2140
2141             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2142             ewrt             = _mm256_mul_pd(r10,ewtabscale);
2143             ewitab           = _mm256_cvttpd_epi32(ewrt);
2144             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2145             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2146                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2147                                             &ewtabF,&ewtabFn);
2148             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2149             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2150
2151             fscal            = felec;
2152
2153             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2154
2155             /* Calculate temporary vectorial force */
2156             tx               = _mm256_mul_pd(fscal,dx10);
2157             ty               = _mm256_mul_pd(fscal,dy10);
2158             tz               = _mm256_mul_pd(fscal,dz10);
2159
2160             /* Update vectorial force */
2161             fix1             = _mm256_add_pd(fix1,tx);
2162             fiy1             = _mm256_add_pd(fiy1,ty);
2163             fiz1             = _mm256_add_pd(fiz1,tz);
2164
2165             fjx0             = _mm256_add_pd(fjx0,tx);
2166             fjy0             = _mm256_add_pd(fjy0,ty);
2167             fjz0             = _mm256_add_pd(fjz0,tz);
2168
2169             /**************************
2170              * CALCULATE INTERACTIONS *
2171              **************************/
2172
2173             r11              = _mm256_mul_pd(rsq11,rinv11);
2174             r11              = _mm256_andnot_pd(dummy_mask,r11);
2175
2176             /* EWALD ELECTROSTATICS */
2177
2178             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2179             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2180             ewitab           = _mm256_cvttpd_epi32(ewrt);
2181             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2182             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2183                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2184                                             &ewtabF,&ewtabFn);
2185             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2186             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2187
2188             fscal            = felec;
2189
2190             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2191
2192             /* Calculate temporary vectorial force */
2193             tx               = _mm256_mul_pd(fscal,dx11);
2194             ty               = _mm256_mul_pd(fscal,dy11);
2195             tz               = _mm256_mul_pd(fscal,dz11);
2196
2197             /* Update vectorial force */
2198             fix1             = _mm256_add_pd(fix1,tx);
2199             fiy1             = _mm256_add_pd(fiy1,ty);
2200             fiz1             = _mm256_add_pd(fiz1,tz);
2201
2202             fjx1             = _mm256_add_pd(fjx1,tx);
2203             fjy1             = _mm256_add_pd(fjy1,ty);
2204             fjz1             = _mm256_add_pd(fjz1,tz);
2205
2206             /**************************
2207              * CALCULATE INTERACTIONS *
2208              **************************/
2209
2210             r12              = _mm256_mul_pd(rsq12,rinv12);
2211             r12              = _mm256_andnot_pd(dummy_mask,r12);
2212
2213             /* EWALD ELECTROSTATICS */
2214
2215             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2216             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2217             ewitab           = _mm256_cvttpd_epi32(ewrt);
2218             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2219             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2220                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2221                                             &ewtabF,&ewtabFn);
2222             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2223             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2224
2225             fscal            = felec;
2226
2227             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2228
2229             /* Calculate temporary vectorial force */
2230             tx               = _mm256_mul_pd(fscal,dx12);
2231             ty               = _mm256_mul_pd(fscal,dy12);
2232             tz               = _mm256_mul_pd(fscal,dz12);
2233
2234             /* Update vectorial force */
2235             fix1             = _mm256_add_pd(fix1,tx);
2236             fiy1             = _mm256_add_pd(fiy1,ty);
2237             fiz1             = _mm256_add_pd(fiz1,tz);
2238
2239             fjx2             = _mm256_add_pd(fjx2,tx);
2240             fjy2             = _mm256_add_pd(fjy2,ty);
2241             fjz2             = _mm256_add_pd(fjz2,tz);
2242
2243             /**************************
2244              * CALCULATE INTERACTIONS *
2245              **************************/
2246
2247             r20              = _mm256_mul_pd(rsq20,rinv20);
2248             r20              = _mm256_andnot_pd(dummy_mask,r20);
2249
2250             /* EWALD ELECTROSTATICS */
2251
2252             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2253             ewrt             = _mm256_mul_pd(r20,ewtabscale);
2254             ewitab           = _mm256_cvttpd_epi32(ewrt);
2255             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2256             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2257                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2258                                             &ewtabF,&ewtabFn);
2259             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2260             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2261
2262             fscal            = felec;
2263
2264             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2265
2266             /* Calculate temporary vectorial force */
2267             tx               = _mm256_mul_pd(fscal,dx20);
2268             ty               = _mm256_mul_pd(fscal,dy20);
2269             tz               = _mm256_mul_pd(fscal,dz20);
2270
2271             /* Update vectorial force */
2272             fix2             = _mm256_add_pd(fix2,tx);
2273             fiy2             = _mm256_add_pd(fiy2,ty);
2274             fiz2             = _mm256_add_pd(fiz2,tz);
2275
2276             fjx0             = _mm256_add_pd(fjx0,tx);
2277             fjy0             = _mm256_add_pd(fjy0,ty);
2278             fjz0             = _mm256_add_pd(fjz0,tz);
2279
2280             /**************************
2281              * CALCULATE INTERACTIONS *
2282              **************************/
2283
2284             r21              = _mm256_mul_pd(rsq21,rinv21);
2285             r21              = _mm256_andnot_pd(dummy_mask,r21);
2286
2287             /* EWALD ELECTROSTATICS */
2288
2289             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2290             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2291             ewitab           = _mm256_cvttpd_epi32(ewrt);
2292             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2293             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2294                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2295                                             &ewtabF,&ewtabFn);
2296             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2297             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2298
2299             fscal            = felec;
2300
2301             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2302
2303             /* Calculate temporary vectorial force */
2304             tx               = _mm256_mul_pd(fscal,dx21);
2305             ty               = _mm256_mul_pd(fscal,dy21);
2306             tz               = _mm256_mul_pd(fscal,dz21);
2307
2308             /* Update vectorial force */
2309             fix2             = _mm256_add_pd(fix2,tx);
2310             fiy2             = _mm256_add_pd(fiy2,ty);
2311             fiz2             = _mm256_add_pd(fiz2,tz);
2312
2313             fjx1             = _mm256_add_pd(fjx1,tx);
2314             fjy1             = _mm256_add_pd(fjy1,ty);
2315             fjz1             = _mm256_add_pd(fjz1,tz);
2316
2317             /**************************
2318              * CALCULATE INTERACTIONS *
2319              **************************/
2320
2321             r22              = _mm256_mul_pd(rsq22,rinv22);
2322             r22              = _mm256_andnot_pd(dummy_mask,r22);
2323
2324             /* EWALD ELECTROSTATICS */
2325
2326             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2327             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2328             ewitab           = _mm256_cvttpd_epi32(ewrt);
2329             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2330             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2331                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2332                                             &ewtabF,&ewtabFn);
2333             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2334             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2335
2336             fscal            = felec;
2337
2338             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2339
2340             /* Calculate temporary vectorial force */
2341             tx               = _mm256_mul_pd(fscal,dx22);
2342             ty               = _mm256_mul_pd(fscal,dy22);
2343             tz               = _mm256_mul_pd(fscal,dz22);
2344
2345             /* Update vectorial force */
2346             fix2             = _mm256_add_pd(fix2,tx);
2347             fiy2             = _mm256_add_pd(fiy2,ty);
2348             fiz2             = _mm256_add_pd(fiz2,tz);
2349
2350             fjx2             = _mm256_add_pd(fjx2,tx);
2351             fjy2             = _mm256_add_pd(fjy2,ty);
2352             fjz2             = _mm256_add_pd(fjz2,tz);
2353
2354             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2355             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2356             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2357             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2358
2359             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2360                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2361
2362             /* Inner loop uses 356 flops */
2363         }
2364
2365         /* End of innermost loop */
2366
2367         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2368                                                  f+i_coord_offset,fshift+i_shift_offset);
2369
2370         /* Increment number of inner iterations */
2371         inneriter                  += j_index_end - j_index_start;
2372
2373         /* Outer loop uses 18 flops */
2374     }
2375
2376     /* Increment number of outer iterations */
2377     outeriter        += nri;
2378
2379     /* Update outer/inner flops */
2380
2381     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*356);
2382 }