Use full path for legacyheaders
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_double
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LJEwald
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82     real             scratch[4*DIM];
83     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84     real *           vdwioffsetptr0;
85     real *           vdwgridioffsetptr0;
86     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     real *           vdwioffsetptr1;
88     real *           vdwgridioffsetptr1;
89     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     real *           vdwgridioffsetptr2;
92     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
93     real *           vdwioffsetptr3;
94     real *           vdwgridioffsetptr3;
95     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
96     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
97     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
98     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
99     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
100     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
101     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
102     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
103     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
104     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
105     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
106     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
107     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
108     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
109     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
110     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
111     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
112     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
113     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
114     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
115     real             *charge;
116     int              nvdwtype;
117     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
118     int              *vdwtype;
119     real             *vdwparam;
120     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
121     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
122     __m256d           c6grid_00;
123     __m256d           c6grid_11;
124     __m256d           c6grid_12;
125     __m256d           c6grid_13;
126     __m256d           c6grid_21;
127     __m256d           c6grid_22;
128     __m256d           c6grid_23;
129     __m256d           c6grid_31;
130     __m256d           c6grid_32;
131     __m256d           c6grid_33;
132     real             *vdwgridparam;
133     __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
134     __m256d           one_half  = _mm256_set1_pd(0.5);
135     __m256d           minus_one = _mm256_set1_pd(-1.0);
136     __m128i          ewitab;
137     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
138     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
139     real             *ewtab;
140     __m256d          dummy_mask,cutoff_mask;
141     __m128           tmpmask0,tmpmask1;
142     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
143     __m256d          one     = _mm256_set1_pd(1.0);
144     __m256d          two     = _mm256_set1_pd(2.0);
145     x                = xx[0];
146     f                = ff[0];
147
148     nri              = nlist->nri;
149     iinr             = nlist->iinr;
150     jindex           = nlist->jindex;
151     jjnr             = nlist->jjnr;
152     shiftidx         = nlist->shift;
153     gid              = nlist->gid;
154     shiftvec         = fr->shift_vec[0];
155     fshift           = fr->fshift[0];
156     facel            = _mm256_set1_pd(fr->epsfac);
157     charge           = mdatoms->chargeA;
158     nvdwtype         = fr->ntype;
159     vdwparam         = fr->nbfp;
160     vdwtype          = mdatoms->typeA;
161     vdwgridparam     = fr->ljpme_c6grid;
162     sh_lj_ewald      = _mm256_set1_pd(fr->ic->sh_lj_ewald);
163     ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
164     ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
165
166     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
167     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
168     beta2            = _mm256_mul_pd(beta,beta);
169     beta3            = _mm256_mul_pd(beta,beta2);
170
171     ewtab            = fr->ic->tabq_coul_FDV0;
172     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
173     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
174
175     /* Setup water-specific parameters */
176     inr              = nlist->iinr[0];
177     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
178     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
179     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
180     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
181     vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
182
183     jq1              = _mm256_set1_pd(charge[inr+1]);
184     jq2              = _mm256_set1_pd(charge[inr+2]);
185     jq3              = _mm256_set1_pd(charge[inr+3]);
186     vdwjidx0A        = 2*vdwtype[inr+0];
187     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
188     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
189     c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
190     qq11             = _mm256_mul_pd(iq1,jq1);
191     qq12             = _mm256_mul_pd(iq1,jq2);
192     qq13             = _mm256_mul_pd(iq1,jq3);
193     qq21             = _mm256_mul_pd(iq2,jq1);
194     qq22             = _mm256_mul_pd(iq2,jq2);
195     qq23             = _mm256_mul_pd(iq2,jq3);
196     qq31             = _mm256_mul_pd(iq3,jq1);
197     qq32             = _mm256_mul_pd(iq3,jq2);
198     qq33             = _mm256_mul_pd(iq3,jq3);
199
200     /* Avoid stupid compiler warnings */
201     jnrA = jnrB = jnrC = jnrD = 0;
202     j_coord_offsetA = 0;
203     j_coord_offsetB = 0;
204     j_coord_offsetC = 0;
205     j_coord_offsetD = 0;
206
207     outeriter        = 0;
208     inneriter        = 0;
209
210     for(iidx=0;iidx<4*DIM;iidx++)
211     {
212         scratch[iidx] = 0.0;
213     }
214
215     /* Start outer loop over neighborlists */
216     for(iidx=0; iidx<nri; iidx++)
217     {
218         /* Load shift vector for this list */
219         i_shift_offset   = DIM*shiftidx[iidx];
220
221         /* Load limits for loop over neighbors */
222         j_index_start    = jindex[iidx];
223         j_index_end      = jindex[iidx+1];
224
225         /* Get outer coordinate index */
226         inr              = iinr[iidx];
227         i_coord_offset   = DIM*inr;
228
229         /* Load i particle coords and add shift vector */
230         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
231                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
232
233         fix0             = _mm256_setzero_pd();
234         fiy0             = _mm256_setzero_pd();
235         fiz0             = _mm256_setzero_pd();
236         fix1             = _mm256_setzero_pd();
237         fiy1             = _mm256_setzero_pd();
238         fiz1             = _mm256_setzero_pd();
239         fix2             = _mm256_setzero_pd();
240         fiy2             = _mm256_setzero_pd();
241         fiz2             = _mm256_setzero_pd();
242         fix3             = _mm256_setzero_pd();
243         fiy3             = _mm256_setzero_pd();
244         fiz3             = _mm256_setzero_pd();
245
246         /* Reset potential sums */
247         velecsum         = _mm256_setzero_pd();
248         vvdwsum          = _mm256_setzero_pd();
249
250         /* Start inner kernel loop */
251         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
252         {
253
254             /* Get j neighbor index, and coordinate index */
255             jnrA             = jjnr[jidx];
256             jnrB             = jjnr[jidx+1];
257             jnrC             = jjnr[jidx+2];
258             jnrD             = jjnr[jidx+3];
259             j_coord_offsetA  = DIM*jnrA;
260             j_coord_offsetB  = DIM*jnrB;
261             j_coord_offsetC  = DIM*jnrC;
262             j_coord_offsetD  = DIM*jnrD;
263
264             /* load j atom coordinates */
265             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
266                                                  x+j_coord_offsetC,x+j_coord_offsetD,
267                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
268                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
269
270             /* Calculate displacement vector */
271             dx00             = _mm256_sub_pd(ix0,jx0);
272             dy00             = _mm256_sub_pd(iy0,jy0);
273             dz00             = _mm256_sub_pd(iz0,jz0);
274             dx11             = _mm256_sub_pd(ix1,jx1);
275             dy11             = _mm256_sub_pd(iy1,jy1);
276             dz11             = _mm256_sub_pd(iz1,jz1);
277             dx12             = _mm256_sub_pd(ix1,jx2);
278             dy12             = _mm256_sub_pd(iy1,jy2);
279             dz12             = _mm256_sub_pd(iz1,jz2);
280             dx13             = _mm256_sub_pd(ix1,jx3);
281             dy13             = _mm256_sub_pd(iy1,jy3);
282             dz13             = _mm256_sub_pd(iz1,jz3);
283             dx21             = _mm256_sub_pd(ix2,jx1);
284             dy21             = _mm256_sub_pd(iy2,jy1);
285             dz21             = _mm256_sub_pd(iz2,jz1);
286             dx22             = _mm256_sub_pd(ix2,jx2);
287             dy22             = _mm256_sub_pd(iy2,jy2);
288             dz22             = _mm256_sub_pd(iz2,jz2);
289             dx23             = _mm256_sub_pd(ix2,jx3);
290             dy23             = _mm256_sub_pd(iy2,jy3);
291             dz23             = _mm256_sub_pd(iz2,jz3);
292             dx31             = _mm256_sub_pd(ix3,jx1);
293             dy31             = _mm256_sub_pd(iy3,jy1);
294             dz31             = _mm256_sub_pd(iz3,jz1);
295             dx32             = _mm256_sub_pd(ix3,jx2);
296             dy32             = _mm256_sub_pd(iy3,jy2);
297             dz32             = _mm256_sub_pd(iz3,jz2);
298             dx33             = _mm256_sub_pd(ix3,jx3);
299             dy33             = _mm256_sub_pd(iy3,jy3);
300             dz33             = _mm256_sub_pd(iz3,jz3);
301
302             /* Calculate squared distance and things based on it */
303             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
304             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
305             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
306             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
307             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
308             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
309             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
310             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
311             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
312             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
313
314             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
315             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
316             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
317             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
318             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
319             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
320             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
321             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
322             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
323             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
324
325             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
326             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
327             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
328             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
329             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
330             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
331             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
332             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
333             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
334             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
335
336             fjx0             = _mm256_setzero_pd();
337             fjy0             = _mm256_setzero_pd();
338             fjz0             = _mm256_setzero_pd();
339             fjx1             = _mm256_setzero_pd();
340             fjy1             = _mm256_setzero_pd();
341             fjz1             = _mm256_setzero_pd();
342             fjx2             = _mm256_setzero_pd();
343             fjy2             = _mm256_setzero_pd();
344             fjz2             = _mm256_setzero_pd();
345             fjx3             = _mm256_setzero_pd();
346             fjy3             = _mm256_setzero_pd();
347             fjz3             = _mm256_setzero_pd();
348
349             /**************************
350              * CALCULATE INTERACTIONS *
351              **************************/
352
353             r00              = _mm256_mul_pd(rsq00,rinv00);
354
355             /* Analytical LJ-PME */
356             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
357             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
358             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
359             exponent         = gmx_simd_exp_d(ewcljrsq);
360             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
361             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
362             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
363             vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
364             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
365             vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
366             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
367             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
368
369             /* Update potential sum for this i atom from the interaction with this j atom. */
370             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
371
372             fscal            = fvdw;
373
374             /* Calculate temporary vectorial force */
375             tx               = _mm256_mul_pd(fscal,dx00);
376             ty               = _mm256_mul_pd(fscal,dy00);
377             tz               = _mm256_mul_pd(fscal,dz00);
378
379             /* Update vectorial force */
380             fix0             = _mm256_add_pd(fix0,tx);
381             fiy0             = _mm256_add_pd(fiy0,ty);
382             fiz0             = _mm256_add_pd(fiz0,tz);
383
384             fjx0             = _mm256_add_pd(fjx0,tx);
385             fjy0             = _mm256_add_pd(fjy0,ty);
386             fjz0             = _mm256_add_pd(fjz0,tz);
387
388             /**************************
389              * CALCULATE INTERACTIONS *
390              **************************/
391
392             r11              = _mm256_mul_pd(rsq11,rinv11);
393
394             /* EWALD ELECTROSTATICS */
395
396             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
397             ewrt             = _mm256_mul_pd(r11,ewtabscale);
398             ewitab           = _mm256_cvttpd_epi32(ewrt);
399             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
400             ewitab           = _mm_slli_epi32(ewitab,2);
401             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
402             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
403             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
404             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
405             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
406             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
407             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
408             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
409             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
410
411             /* Update potential sum for this i atom from the interaction with this j atom. */
412             velecsum         = _mm256_add_pd(velecsum,velec);
413
414             fscal            = felec;
415
416             /* Calculate temporary vectorial force */
417             tx               = _mm256_mul_pd(fscal,dx11);
418             ty               = _mm256_mul_pd(fscal,dy11);
419             tz               = _mm256_mul_pd(fscal,dz11);
420
421             /* Update vectorial force */
422             fix1             = _mm256_add_pd(fix1,tx);
423             fiy1             = _mm256_add_pd(fiy1,ty);
424             fiz1             = _mm256_add_pd(fiz1,tz);
425
426             fjx1             = _mm256_add_pd(fjx1,tx);
427             fjy1             = _mm256_add_pd(fjy1,ty);
428             fjz1             = _mm256_add_pd(fjz1,tz);
429
430             /**************************
431              * CALCULATE INTERACTIONS *
432              **************************/
433
434             r12              = _mm256_mul_pd(rsq12,rinv12);
435
436             /* EWALD ELECTROSTATICS */
437
438             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
439             ewrt             = _mm256_mul_pd(r12,ewtabscale);
440             ewitab           = _mm256_cvttpd_epi32(ewrt);
441             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
442             ewitab           = _mm_slli_epi32(ewitab,2);
443             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
444             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
445             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
446             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
447             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
448             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
449             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
450             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
451             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
452
453             /* Update potential sum for this i atom from the interaction with this j atom. */
454             velecsum         = _mm256_add_pd(velecsum,velec);
455
456             fscal            = felec;
457
458             /* Calculate temporary vectorial force */
459             tx               = _mm256_mul_pd(fscal,dx12);
460             ty               = _mm256_mul_pd(fscal,dy12);
461             tz               = _mm256_mul_pd(fscal,dz12);
462
463             /* Update vectorial force */
464             fix1             = _mm256_add_pd(fix1,tx);
465             fiy1             = _mm256_add_pd(fiy1,ty);
466             fiz1             = _mm256_add_pd(fiz1,tz);
467
468             fjx2             = _mm256_add_pd(fjx2,tx);
469             fjy2             = _mm256_add_pd(fjy2,ty);
470             fjz2             = _mm256_add_pd(fjz2,tz);
471
472             /**************************
473              * CALCULATE INTERACTIONS *
474              **************************/
475
476             r13              = _mm256_mul_pd(rsq13,rinv13);
477
478             /* EWALD ELECTROSTATICS */
479
480             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
481             ewrt             = _mm256_mul_pd(r13,ewtabscale);
482             ewitab           = _mm256_cvttpd_epi32(ewrt);
483             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
484             ewitab           = _mm_slli_epi32(ewitab,2);
485             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
486             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
487             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
488             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
489             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
490             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
491             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
492             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
493             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
494
495             /* Update potential sum for this i atom from the interaction with this j atom. */
496             velecsum         = _mm256_add_pd(velecsum,velec);
497
498             fscal            = felec;
499
500             /* Calculate temporary vectorial force */
501             tx               = _mm256_mul_pd(fscal,dx13);
502             ty               = _mm256_mul_pd(fscal,dy13);
503             tz               = _mm256_mul_pd(fscal,dz13);
504
505             /* Update vectorial force */
506             fix1             = _mm256_add_pd(fix1,tx);
507             fiy1             = _mm256_add_pd(fiy1,ty);
508             fiz1             = _mm256_add_pd(fiz1,tz);
509
510             fjx3             = _mm256_add_pd(fjx3,tx);
511             fjy3             = _mm256_add_pd(fjy3,ty);
512             fjz3             = _mm256_add_pd(fjz3,tz);
513
514             /**************************
515              * CALCULATE INTERACTIONS *
516              **************************/
517
518             r21              = _mm256_mul_pd(rsq21,rinv21);
519
520             /* EWALD ELECTROSTATICS */
521
522             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
523             ewrt             = _mm256_mul_pd(r21,ewtabscale);
524             ewitab           = _mm256_cvttpd_epi32(ewrt);
525             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
526             ewitab           = _mm_slli_epi32(ewitab,2);
527             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
528             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
529             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
530             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
531             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
532             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
533             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
534             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
535             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
536
537             /* Update potential sum for this i atom from the interaction with this j atom. */
538             velecsum         = _mm256_add_pd(velecsum,velec);
539
540             fscal            = felec;
541
542             /* Calculate temporary vectorial force */
543             tx               = _mm256_mul_pd(fscal,dx21);
544             ty               = _mm256_mul_pd(fscal,dy21);
545             tz               = _mm256_mul_pd(fscal,dz21);
546
547             /* Update vectorial force */
548             fix2             = _mm256_add_pd(fix2,tx);
549             fiy2             = _mm256_add_pd(fiy2,ty);
550             fiz2             = _mm256_add_pd(fiz2,tz);
551
552             fjx1             = _mm256_add_pd(fjx1,tx);
553             fjy1             = _mm256_add_pd(fjy1,ty);
554             fjz1             = _mm256_add_pd(fjz1,tz);
555
556             /**************************
557              * CALCULATE INTERACTIONS *
558              **************************/
559
560             r22              = _mm256_mul_pd(rsq22,rinv22);
561
562             /* EWALD ELECTROSTATICS */
563
564             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
565             ewrt             = _mm256_mul_pd(r22,ewtabscale);
566             ewitab           = _mm256_cvttpd_epi32(ewrt);
567             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
568             ewitab           = _mm_slli_epi32(ewitab,2);
569             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
570             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
571             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
572             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
573             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
574             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
575             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
576             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
577             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
578
579             /* Update potential sum for this i atom from the interaction with this j atom. */
580             velecsum         = _mm256_add_pd(velecsum,velec);
581
582             fscal            = felec;
583
584             /* Calculate temporary vectorial force */
585             tx               = _mm256_mul_pd(fscal,dx22);
586             ty               = _mm256_mul_pd(fscal,dy22);
587             tz               = _mm256_mul_pd(fscal,dz22);
588
589             /* Update vectorial force */
590             fix2             = _mm256_add_pd(fix2,tx);
591             fiy2             = _mm256_add_pd(fiy2,ty);
592             fiz2             = _mm256_add_pd(fiz2,tz);
593
594             fjx2             = _mm256_add_pd(fjx2,tx);
595             fjy2             = _mm256_add_pd(fjy2,ty);
596             fjz2             = _mm256_add_pd(fjz2,tz);
597
598             /**************************
599              * CALCULATE INTERACTIONS *
600              **************************/
601
602             r23              = _mm256_mul_pd(rsq23,rinv23);
603
604             /* EWALD ELECTROSTATICS */
605
606             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
607             ewrt             = _mm256_mul_pd(r23,ewtabscale);
608             ewitab           = _mm256_cvttpd_epi32(ewrt);
609             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
610             ewitab           = _mm_slli_epi32(ewitab,2);
611             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
612             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
613             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
614             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
615             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
616             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
617             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
618             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
619             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
620
621             /* Update potential sum for this i atom from the interaction with this j atom. */
622             velecsum         = _mm256_add_pd(velecsum,velec);
623
624             fscal            = felec;
625
626             /* Calculate temporary vectorial force */
627             tx               = _mm256_mul_pd(fscal,dx23);
628             ty               = _mm256_mul_pd(fscal,dy23);
629             tz               = _mm256_mul_pd(fscal,dz23);
630
631             /* Update vectorial force */
632             fix2             = _mm256_add_pd(fix2,tx);
633             fiy2             = _mm256_add_pd(fiy2,ty);
634             fiz2             = _mm256_add_pd(fiz2,tz);
635
636             fjx3             = _mm256_add_pd(fjx3,tx);
637             fjy3             = _mm256_add_pd(fjy3,ty);
638             fjz3             = _mm256_add_pd(fjz3,tz);
639
640             /**************************
641              * CALCULATE INTERACTIONS *
642              **************************/
643
644             r31              = _mm256_mul_pd(rsq31,rinv31);
645
646             /* EWALD ELECTROSTATICS */
647
648             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
649             ewrt             = _mm256_mul_pd(r31,ewtabscale);
650             ewitab           = _mm256_cvttpd_epi32(ewrt);
651             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
652             ewitab           = _mm_slli_epi32(ewitab,2);
653             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
654             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
655             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
656             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
657             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
658             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
659             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
660             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
661             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
662
663             /* Update potential sum for this i atom from the interaction with this j atom. */
664             velecsum         = _mm256_add_pd(velecsum,velec);
665
666             fscal            = felec;
667
668             /* Calculate temporary vectorial force */
669             tx               = _mm256_mul_pd(fscal,dx31);
670             ty               = _mm256_mul_pd(fscal,dy31);
671             tz               = _mm256_mul_pd(fscal,dz31);
672
673             /* Update vectorial force */
674             fix3             = _mm256_add_pd(fix3,tx);
675             fiy3             = _mm256_add_pd(fiy3,ty);
676             fiz3             = _mm256_add_pd(fiz3,tz);
677
678             fjx1             = _mm256_add_pd(fjx1,tx);
679             fjy1             = _mm256_add_pd(fjy1,ty);
680             fjz1             = _mm256_add_pd(fjz1,tz);
681
682             /**************************
683              * CALCULATE INTERACTIONS *
684              **************************/
685
686             r32              = _mm256_mul_pd(rsq32,rinv32);
687
688             /* EWALD ELECTROSTATICS */
689
690             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
691             ewrt             = _mm256_mul_pd(r32,ewtabscale);
692             ewitab           = _mm256_cvttpd_epi32(ewrt);
693             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
694             ewitab           = _mm_slli_epi32(ewitab,2);
695             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
696             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
697             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
698             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
699             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
700             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
701             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
702             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
703             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
704
705             /* Update potential sum for this i atom from the interaction with this j atom. */
706             velecsum         = _mm256_add_pd(velecsum,velec);
707
708             fscal            = felec;
709
710             /* Calculate temporary vectorial force */
711             tx               = _mm256_mul_pd(fscal,dx32);
712             ty               = _mm256_mul_pd(fscal,dy32);
713             tz               = _mm256_mul_pd(fscal,dz32);
714
715             /* Update vectorial force */
716             fix3             = _mm256_add_pd(fix3,tx);
717             fiy3             = _mm256_add_pd(fiy3,ty);
718             fiz3             = _mm256_add_pd(fiz3,tz);
719
720             fjx2             = _mm256_add_pd(fjx2,tx);
721             fjy2             = _mm256_add_pd(fjy2,ty);
722             fjz2             = _mm256_add_pd(fjz2,tz);
723
724             /**************************
725              * CALCULATE INTERACTIONS *
726              **************************/
727
728             r33              = _mm256_mul_pd(rsq33,rinv33);
729
730             /* EWALD ELECTROSTATICS */
731
732             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
733             ewrt             = _mm256_mul_pd(r33,ewtabscale);
734             ewitab           = _mm256_cvttpd_epi32(ewrt);
735             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
736             ewitab           = _mm_slli_epi32(ewitab,2);
737             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
738             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
739             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
740             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
741             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
742             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
743             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
744             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
745             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
746
747             /* Update potential sum for this i atom from the interaction with this j atom. */
748             velecsum         = _mm256_add_pd(velecsum,velec);
749
750             fscal            = felec;
751
752             /* Calculate temporary vectorial force */
753             tx               = _mm256_mul_pd(fscal,dx33);
754             ty               = _mm256_mul_pd(fscal,dy33);
755             tz               = _mm256_mul_pd(fscal,dz33);
756
757             /* Update vectorial force */
758             fix3             = _mm256_add_pd(fix3,tx);
759             fiy3             = _mm256_add_pd(fiy3,ty);
760             fiz3             = _mm256_add_pd(fiz3,tz);
761
762             fjx3             = _mm256_add_pd(fjx3,tx);
763             fjy3             = _mm256_add_pd(fjy3,ty);
764             fjz3             = _mm256_add_pd(fjz3,tz);
765
766             fjptrA             = f+j_coord_offsetA;
767             fjptrB             = f+j_coord_offsetB;
768             fjptrC             = f+j_coord_offsetC;
769             fjptrD             = f+j_coord_offsetD;
770
771             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
772                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
773                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
774
775             /* Inner loop uses 426 flops */
776         }
777
778         if(jidx<j_index_end)
779         {
780
781             /* Get j neighbor index, and coordinate index */
782             jnrlistA         = jjnr[jidx];
783             jnrlistB         = jjnr[jidx+1];
784             jnrlistC         = jjnr[jidx+2];
785             jnrlistD         = jjnr[jidx+3];
786             /* Sign of each element will be negative for non-real atoms.
787              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
788              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
789              */
790             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
791
792             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
793             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
794             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
795
796             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
797             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
798             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
799             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
800             j_coord_offsetA  = DIM*jnrA;
801             j_coord_offsetB  = DIM*jnrB;
802             j_coord_offsetC  = DIM*jnrC;
803             j_coord_offsetD  = DIM*jnrD;
804
805             /* load j atom coordinates */
806             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
807                                                  x+j_coord_offsetC,x+j_coord_offsetD,
808                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
809                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
810
811             /* Calculate displacement vector */
812             dx00             = _mm256_sub_pd(ix0,jx0);
813             dy00             = _mm256_sub_pd(iy0,jy0);
814             dz00             = _mm256_sub_pd(iz0,jz0);
815             dx11             = _mm256_sub_pd(ix1,jx1);
816             dy11             = _mm256_sub_pd(iy1,jy1);
817             dz11             = _mm256_sub_pd(iz1,jz1);
818             dx12             = _mm256_sub_pd(ix1,jx2);
819             dy12             = _mm256_sub_pd(iy1,jy2);
820             dz12             = _mm256_sub_pd(iz1,jz2);
821             dx13             = _mm256_sub_pd(ix1,jx3);
822             dy13             = _mm256_sub_pd(iy1,jy3);
823             dz13             = _mm256_sub_pd(iz1,jz3);
824             dx21             = _mm256_sub_pd(ix2,jx1);
825             dy21             = _mm256_sub_pd(iy2,jy1);
826             dz21             = _mm256_sub_pd(iz2,jz1);
827             dx22             = _mm256_sub_pd(ix2,jx2);
828             dy22             = _mm256_sub_pd(iy2,jy2);
829             dz22             = _mm256_sub_pd(iz2,jz2);
830             dx23             = _mm256_sub_pd(ix2,jx3);
831             dy23             = _mm256_sub_pd(iy2,jy3);
832             dz23             = _mm256_sub_pd(iz2,jz3);
833             dx31             = _mm256_sub_pd(ix3,jx1);
834             dy31             = _mm256_sub_pd(iy3,jy1);
835             dz31             = _mm256_sub_pd(iz3,jz1);
836             dx32             = _mm256_sub_pd(ix3,jx2);
837             dy32             = _mm256_sub_pd(iy3,jy2);
838             dz32             = _mm256_sub_pd(iz3,jz2);
839             dx33             = _mm256_sub_pd(ix3,jx3);
840             dy33             = _mm256_sub_pd(iy3,jy3);
841             dz33             = _mm256_sub_pd(iz3,jz3);
842
843             /* Calculate squared distance and things based on it */
844             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
845             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
846             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
847             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
848             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
849             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
850             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
851             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
852             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
853             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
854
855             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
856             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
857             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
858             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
859             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
860             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
861             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
862             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
863             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
864             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
865
866             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
867             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
868             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
869             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
870             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
871             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
872             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
873             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
874             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
875             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
876
877             fjx0             = _mm256_setzero_pd();
878             fjy0             = _mm256_setzero_pd();
879             fjz0             = _mm256_setzero_pd();
880             fjx1             = _mm256_setzero_pd();
881             fjy1             = _mm256_setzero_pd();
882             fjz1             = _mm256_setzero_pd();
883             fjx2             = _mm256_setzero_pd();
884             fjy2             = _mm256_setzero_pd();
885             fjz2             = _mm256_setzero_pd();
886             fjx3             = _mm256_setzero_pd();
887             fjy3             = _mm256_setzero_pd();
888             fjz3             = _mm256_setzero_pd();
889
890             /**************************
891              * CALCULATE INTERACTIONS *
892              **************************/
893
894             r00              = _mm256_mul_pd(rsq00,rinv00);
895             r00              = _mm256_andnot_pd(dummy_mask,r00);
896
897             /* Analytical LJ-PME */
898             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
899             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
900             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
901             exponent         = gmx_simd_exp_d(ewcljrsq);
902             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
903             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
904             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
905             vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
906             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
907             vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
908             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
909             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
910
911             /* Update potential sum for this i atom from the interaction with this j atom. */
912             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
913             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
914
915             fscal            = fvdw;
916
917             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
918
919             /* Calculate temporary vectorial force */
920             tx               = _mm256_mul_pd(fscal,dx00);
921             ty               = _mm256_mul_pd(fscal,dy00);
922             tz               = _mm256_mul_pd(fscal,dz00);
923
924             /* Update vectorial force */
925             fix0             = _mm256_add_pd(fix0,tx);
926             fiy0             = _mm256_add_pd(fiy0,ty);
927             fiz0             = _mm256_add_pd(fiz0,tz);
928
929             fjx0             = _mm256_add_pd(fjx0,tx);
930             fjy0             = _mm256_add_pd(fjy0,ty);
931             fjz0             = _mm256_add_pd(fjz0,tz);
932
933             /**************************
934              * CALCULATE INTERACTIONS *
935              **************************/
936
937             r11              = _mm256_mul_pd(rsq11,rinv11);
938             r11              = _mm256_andnot_pd(dummy_mask,r11);
939
940             /* EWALD ELECTROSTATICS */
941
942             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
943             ewrt             = _mm256_mul_pd(r11,ewtabscale);
944             ewitab           = _mm256_cvttpd_epi32(ewrt);
945             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
946             ewitab           = _mm_slli_epi32(ewitab,2);
947             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
948             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
949             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
950             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
951             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
952             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
953             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
954             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
955             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
956
957             /* Update potential sum for this i atom from the interaction with this j atom. */
958             velec            = _mm256_andnot_pd(dummy_mask,velec);
959             velecsum         = _mm256_add_pd(velecsum,velec);
960
961             fscal            = felec;
962
963             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
964
965             /* Calculate temporary vectorial force */
966             tx               = _mm256_mul_pd(fscal,dx11);
967             ty               = _mm256_mul_pd(fscal,dy11);
968             tz               = _mm256_mul_pd(fscal,dz11);
969
970             /* Update vectorial force */
971             fix1             = _mm256_add_pd(fix1,tx);
972             fiy1             = _mm256_add_pd(fiy1,ty);
973             fiz1             = _mm256_add_pd(fiz1,tz);
974
975             fjx1             = _mm256_add_pd(fjx1,tx);
976             fjy1             = _mm256_add_pd(fjy1,ty);
977             fjz1             = _mm256_add_pd(fjz1,tz);
978
979             /**************************
980              * CALCULATE INTERACTIONS *
981              **************************/
982
983             r12              = _mm256_mul_pd(rsq12,rinv12);
984             r12              = _mm256_andnot_pd(dummy_mask,r12);
985
986             /* EWALD ELECTROSTATICS */
987
988             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
989             ewrt             = _mm256_mul_pd(r12,ewtabscale);
990             ewitab           = _mm256_cvttpd_epi32(ewrt);
991             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
992             ewitab           = _mm_slli_epi32(ewitab,2);
993             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
994             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
995             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
996             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
997             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
998             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
999             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1000             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1001             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1002
1003             /* Update potential sum for this i atom from the interaction with this j atom. */
1004             velec            = _mm256_andnot_pd(dummy_mask,velec);
1005             velecsum         = _mm256_add_pd(velecsum,velec);
1006
1007             fscal            = felec;
1008
1009             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1010
1011             /* Calculate temporary vectorial force */
1012             tx               = _mm256_mul_pd(fscal,dx12);
1013             ty               = _mm256_mul_pd(fscal,dy12);
1014             tz               = _mm256_mul_pd(fscal,dz12);
1015
1016             /* Update vectorial force */
1017             fix1             = _mm256_add_pd(fix1,tx);
1018             fiy1             = _mm256_add_pd(fiy1,ty);
1019             fiz1             = _mm256_add_pd(fiz1,tz);
1020
1021             fjx2             = _mm256_add_pd(fjx2,tx);
1022             fjy2             = _mm256_add_pd(fjy2,ty);
1023             fjz2             = _mm256_add_pd(fjz2,tz);
1024
1025             /**************************
1026              * CALCULATE INTERACTIONS *
1027              **************************/
1028
1029             r13              = _mm256_mul_pd(rsq13,rinv13);
1030             r13              = _mm256_andnot_pd(dummy_mask,r13);
1031
1032             /* EWALD ELECTROSTATICS */
1033
1034             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1035             ewrt             = _mm256_mul_pd(r13,ewtabscale);
1036             ewitab           = _mm256_cvttpd_epi32(ewrt);
1037             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1038             ewitab           = _mm_slli_epi32(ewitab,2);
1039             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1040             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1041             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1042             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1043             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1044             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1045             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1046             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
1047             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1048
1049             /* Update potential sum for this i atom from the interaction with this j atom. */
1050             velec            = _mm256_andnot_pd(dummy_mask,velec);
1051             velecsum         = _mm256_add_pd(velecsum,velec);
1052
1053             fscal            = felec;
1054
1055             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1056
1057             /* Calculate temporary vectorial force */
1058             tx               = _mm256_mul_pd(fscal,dx13);
1059             ty               = _mm256_mul_pd(fscal,dy13);
1060             tz               = _mm256_mul_pd(fscal,dz13);
1061
1062             /* Update vectorial force */
1063             fix1             = _mm256_add_pd(fix1,tx);
1064             fiy1             = _mm256_add_pd(fiy1,ty);
1065             fiz1             = _mm256_add_pd(fiz1,tz);
1066
1067             fjx3             = _mm256_add_pd(fjx3,tx);
1068             fjy3             = _mm256_add_pd(fjy3,ty);
1069             fjz3             = _mm256_add_pd(fjz3,tz);
1070
1071             /**************************
1072              * CALCULATE INTERACTIONS *
1073              **************************/
1074
1075             r21              = _mm256_mul_pd(rsq21,rinv21);
1076             r21              = _mm256_andnot_pd(dummy_mask,r21);
1077
1078             /* EWALD ELECTROSTATICS */
1079
1080             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1081             ewrt             = _mm256_mul_pd(r21,ewtabscale);
1082             ewitab           = _mm256_cvttpd_epi32(ewrt);
1083             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1084             ewitab           = _mm_slli_epi32(ewitab,2);
1085             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1086             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1087             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1088             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1089             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1090             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1091             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1092             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1093             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1094
1095             /* Update potential sum for this i atom from the interaction with this j atom. */
1096             velec            = _mm256_andnot_pd(dummy_mask,velec);
1097             velecsum         = _mm256_add_pd(velecsum,velec);
1098
1099             fscal            = felec;
1100
1101             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1102
1103             /* Calculate temporary vectorial force */
1104             tx               = _mm256_mul_pd(fscal,dx21);
1105             ty               = _mm256_mul_pd(fscal,dy21);
1106             tz               = _mm256_mul_pd(fscal,dz21);
1107
1108             /* Update vectorial force */
1109             fix2             = _mm256_add_pd(fix2,tx);
1110             fiy2             = _mm256_add_pd(fiy2,ty);
1111             fiz2             = _mm256_add_pd(fiz2,tz);
1112
1113             fjx1             = _mm256_add_pd(fjx1,tx);
1114             fjy1             = _mm256_add_pd(fjy1,ty);
1115             fjz1             = _mm256_add_pd(fjz1,tz);
1116
1117             /**************************
1118              * CALCULATE INTERACTIONS *
1119              **************************/
1120
1121             r22              = _mm256_mul_pd(rsq22,rinv22);
1122             r22              = _mm256_andnot_pd(dummy_mask,r22);
1123
1124             /* EWALD ELECTROSTATICS */
1125
1126             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1127             ewrt             = _mm256_mul_pd(r22,ewtabscale);
1128             ewitab           = _mm256_cvttpd_epi32(ewrt);
1129             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1130             ewitab           = _mm_slli_epi32(ewitab,2);
1131             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1132             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1133             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1134             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1135             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1136             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1137             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1138             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1139             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1140
1141             /* Update potential sum for this i atom from the interaction with this j atom. */
1142             velec            = _mm256_andnot_pd(dummy_mask,velec);
1143             velecsum         = _mm256_add_pd(velecsum,velec);
1144
1145             fscal            = felec;
1146
1147             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1148
1149             /* Calculate temporary vectorial force */
1150             tx               = _mm256_mul_pd(fscal,dx22);
1151             ty               = _mm256_mul_pd(fscal,dy22);
1152             tz               = _mm256_mul_pd(fscal,dz22);
1153
1154             /* Update vectorial force */
1155             fix2             = _mm256_add_pd(fix2,tx);
1156             fiy2             = _mm256_add_pd(fiy2,ty);
1157             fiz2             = _mm256_add_pd(fiz2,tz);
1158
1159             fjx2             = _mm256_add_pd(fjx2,tx);
1160             fjy2             = _mm256_add_pd(fjy2,ty);
1161             fjz2             = _mm256_add_pd(fjz2,tz);
1162
1163             /**************************
1164              * CALCULATE INTERACTIONS *
1165              **************************/
1166
1167             r23              = _mm256_mul_pd(rsq23,rinv23);
1168             r23              = _mm256_andnot_pd(dummy_mask,r23);
1169
1170             /* EWALD ELECTROSTATICS */
1171
1172             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1173             ewrt             = _mm256_mul_pd(r23,ewtabscale);
1174             ewitab           = _mm256_cvttpd_epi32(ewrt);
1175             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1176             ewitab           = _mm_slli_epi32(ewitab,2);
1177             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1178             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1179             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1180             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1181             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1182             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1183             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1184             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1185             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1186
1187             /* Update potential sum for this i atom from the interaction with this j atom. */
1188             velec            = _mm256_andnot_pd(dummy_mask,velec);
1189             velecsum         = _mm256_add_pd(velecsum,velec);
1190
1191             fscal            = felec;
1192
1193             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1194
1195             /* Calculate temporary vectorial force */
1196             tx               = _mm256_mul_pd(fscal,dx23);
1197             ty               = _mm256_mul_pd(fscal,dy23);
1198             tz               = _mm256_mul_pd(fscal,dz23);
1199
1200             /* Update vectorial force */
1201             fix2             = _mm256_add_pd(fix2,tx);
1202             fiy2             = _mm256_add_pd(fiy2,ty);
1203             fiz2             = _mm256_add_pd(fiz2,tz);
1204
1205             fjx3             = _mm256_add_pd(fjx3,tx);
1206             fjy3             = _mm256_add_pd(fjy3,ty);
1207             fjz3             = _mm256_add_pd(fjz3,tz);
1208
1209             /**************************
1210              * CALCULATE INTERACTIONS *
1211              **************************/
1212
1213             r31              = _mm256_mul_pd(rsq31,rinv31);
1214             r31              = _mm256_andnot_pd(dummy_mask,r31);
1215
1216             /* EWALD ELECTROSTATICS */
1217
1218             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1219             ewrt             = _mm256_mul_pd(r31,ewtabscale);
1220             ewitab           = _mm256_cvttpd_epi32(ewrt);
1221             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1222             ewitab           = _mm_slli_epi32(ewitab,2);
1223             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1224             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1225             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1226             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1227             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1228             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1229             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1230             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1231             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1232
1233             /* Update potential sum for this i atom from the interaction with this j atom. */
1234             velec            = _mm256_andnot_pd(dummy_mask,velec);
1235             velecsum         = _mm256_add_pd(velecsum,velec);
1236
1237             fscal            = felec;
1238
1239             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1240
1241             /* Calculate temporary vectorial force */
1242             tx               = _mm256_mul_pd(fscal,dx31);
1243             ty               = _mm256_mul_pd(fscal,dy31);
1244             tz               = _mm256_mul_pd(fscal,dz31);
1245
1246             /* Update vectorial force */
1247             fix3             = _mm256_add_pd(fix3,tx);
1248             fiy3             = _mm256_add_pd(fiy3,ty);
1249             fiz3             = _mm256_add_pd(fiz3,tz);
1250
1251             fjx1             = _mm256_add_pd(fjx1,tx);
1252             fjy1             = _mm256_add_pd(fjy1,ty);
1253             fjz1             = _mm256_add_pd(fjz1,tz);
1254
1255             /**************************
1256              * CALCULATE INTERACTIONS *
1257              **************************/
1258
1259             r32              = _mm256_mul_pd(rsq32,rinv32);
1260             r32              = _mm256_andnot_pd(dummy_mask,r32);
1261
1262             /* EWALD ELECTROSTATICS */
1263
1264             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1265             ewrt             = _mm256_mul_pd(r32,ewtabscale);
1266             ewitab           = _mm256_cvttpd_epi32(ewrt);
1267             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1268             ewitab           = _mm_slli_epi32(ewitab,2);
1269             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1270             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1271             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1272             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1273             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1274             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1275             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1276             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1277             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1278
1279             /* Update potential sum for this i atom from the interaction with this j atom. */
1280             velec            = _mm256_andnot_pd(dummy_mask,velec);
1281             velecsum         = _mm256_add_pd(velecsum,velec);
1282
1283             fscal            = felec;
1284
1285             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1286
1287             /* Calculate temporary vectorial force */
1288             tx               = _mm256_mul_pd(fscal,dx32);
1289             ty               = _mm256_mul_pd(fscal,dy32);
1290             tz               = _mm256_mul_pd(fscal,dz32);
1291
1292             /* Update vectorial force */
1293             fix3             = _mm256_add_pd(fix3,tx);
1294             fiy3             = _mm256_add_pd(fiy3,ty);
1295             fiz3             = _mm256_add_pd(fiz3,tz);
1296
1297             fjx2             = _mm256_add_pd(fjx2,tx);
1298             fjy2             = _mm256_add_pd(fjy2,ty);
1299             fjz2             = _mm256_add_pd(fjz2,tz);
1300
1301             /**************************
1302              * CALCULATE INTERACTIONS *
1303              **************************/
1304
1305             r33              = _mm256_mul_pd(rsq33,rinv33);
1306             r33              = _mm256_andnot_pd(dummy_mask,r33);
1307
1308             /* EWALD ELECTROSTATICS */
1309
1310             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1311             ewrt             = _mm256_mul_pd(r33,ewtabscale);
1312             ewitab           = _mm256_cvttpd_epi32(ewrt);
1313             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1314             ewitab           = _mm_slli_epi32(ewitab,2);
1315             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1316             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1317             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1318             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1319             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1320             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1321             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1322             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1323             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1324
1325             /* Update potential sum for this i atom from the interaction with this j atom. */
1326             velec            = _mm256_andnot_pd(dummy_mask,velec);
1327             velecsum         = _mm256_add_pd(velecsum,velec);
1328
1329             fscal            = felec;
1330
1331             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1332
1333             /* Calculate temporary vectorial force */
1334             tx               = _mm256_mul_pd(fscal,dx33);
1335             ty               = _mm256_mul_pd(fscal,dy33);
1336             tz               = _mm256_mul_pd(fscal,dz33);
1337
1338             /* Update vectorial force */
1339             fix3             = _mm256_add_pd(fix3,tx);
1340             fiy3             = _mm256_add_pd(fiy3,ty);
1341             fiz3             = _mm256_add_pd(fiz3,tz);
1342
1343             fjx3             = _mm256_add_pd(fjx3,tx);
1344             fjy3             = _mm256_add_pd(fjy3,ty);
1345             fjz3             = _mm256_add_pd(fjz3,tz);
1346
1347             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1348             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1349             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1350             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1351
1352             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1353                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1354                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1355
1356             /* Inner loop uses 436 flops */
1357         }
1358
1359         /* End of innermost loop */
1360
1361         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1362                                                  f+i_coord_offset,fshift+i_shift_offset);
1363
1364         ggid                        = gid[iidx];
1365         /* Update potential energies */
1366         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1367         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1368
1369         /* Increment number of inner iterations */
1370         inneriter                  += j_index_end - j_index_start;
1371
1372         /* Outer loop uses 26 flops */
1373     }
1374
1375     /* Increment number of outer iterations */
1376     outeriter        += nri;
1377
1378     /* Update outer/inner flops */
1379
1380     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*436);
1381 }
1382 /*
1383  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_double
1384  * Electrostatics interaction: Ewald
1385  * VdW interaction:            LJEwald
1386  * Geometry:                   Water4-Water4
1387  * Calculate force/pot:        Force
1388  */
1389 void
1390 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_double
1391                     (t_nblist                    * gmx_restrict       nlist,
1392                      rvec                        * gmx_restrict          xx,
1393                      rvec                        * gmx_restrict          ff,
1394                      t_forcerec                  * gmx_restrict          fr,
1395                      t_mdatoms                   * gmx_restrict     mdatoms,
1396                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1397                      t_nrnb                      * gmx_restrict        nrnb)
1398 {
1399     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1400      * just 0 for non-waters.
1401      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1402      * jnr indices corresponding to data put in the four positions in the SIMD register.
1403      */
1404     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1405     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1406     int              jnrA,jnrB,jnrC,jnrD;
1407     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1408     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1409     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1410     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1411     real             rcutoff_scalar;
1412     real             *shiftvec,*fshift,*x,*f;
1413     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1414     real             scratch[4*DIM];
1415     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1416     real *           vdwioffsetptr0;
1417     real *           vdwgridioffsetptr0;
1418     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1419     real *           vdwioffsetptr1;
1420     real *           vdwgridioffsetptr1;
1421     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1422     real *           vdwioffsetptr2;
1423     real *           vdwgridioffsetptr2;
1424     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1425     real *           vdwioffsetptr3;
1426     real *           vdwgridioffsetptr3;
1427     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1428     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1429     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1430     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1431     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1432     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1433     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1434     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1435     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1436     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1437     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1438     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1439     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1440     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1441     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1442     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1443     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1444     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1445     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1446     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1447     real             *charge;
1448     int              nvdwtype;
1449     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1450     int              *vdwtype;
1451     real             *vdwparam;
1452     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1453     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1454     __m256d           c6grid_00;
1455     __m256d           c6grid_11;
1456     __m256d           c6grid_12;
1457     __m256d           c6grid_13;
1458     __m256d           c6grid_21;
1459     __m256d           c6grid_22;
1460     __m256d           c6grid_23;
1461     __m256d           c6grid_31;
1462     __m256d           c6grid_32;
1463     __m256d           c6grid_33;
1464     real             *vdwgridparam;
1465     __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1466     __m256d           one_half  = _mm256_set1_pd(0.5);
1467     __m256d           minus_one = _mm256_set1_pd(-1.0);
1468     __m128i          ewitab;
1469     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1470     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1471     real             *ewtab;
1472     __m256d          dummy_mask,cutoff_mask;
1473     __m128           tmpmask0,tmpmask1;
1474     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1475     __m256d          one     = _mm256_set1_pd(1.0);
1476     __m256d          two     = _mm256_set1_pd(2.0);
1477     x                = xx[0];
1478     f                = ff[0];
1479
1480     nri              = nlist->nri;
1481     iinr             = nlist->iinr;
1482     jindex           = nlist->jindex;
1483     jjnr             = nlist->jjnr;
1484     shiftidx         = nlist->shift;
1485     gid              = nlist->gid;
1486     shiftvec         = fr->shift_vec[0];
1487     fshift           = fr->fshift[0];
1488     facel            = _mm256_set1_pd(fr->epsfac);
1489     charge           = mdatoms->chargeA;
1490     nvdwtype         = fr->ntype;
1491     vdwparam         = fr->nbfp;
1492     vdwtype          = mdatoms->typeA;
1493     vdwgridparam     = fr->ljpme_c6grid;
1494     sh_lj_ewald      = _mm256_set1_pd(fr->ic->sh_lj_ewald);
1495     ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
1496     ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
1497
1498     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
1499     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1500     beta2            = _mm256_mul_pd(beta,beta);
1501     beta3            = _mm256_mul_pd(beta,beta2);
1502
1503     ewtab            = fr->ic->tabq_coul_F;
1504     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
1505     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1506
1507     /* Setup water-specific parameters */
1508     inr              = nlist->iinr[0];
1509     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1510     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1511     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1512     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1513     vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
1514
1515     jq1              = _mm256_set1_pd(charge[inr+1]);
1516     jq2              = _mm256_set1_pd(charge[inr+2]);
1517     jq3              = _mm256_set1_pd(charge[inr+3]);
1518     vdwjidx0A        = 2*vdwtype[inr+0];
1519     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1520     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1521     c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
1522     qq11             = _mm256_mul_pd(iq1,jq1);
1523     qq12             = _mm256_mul_pd(iq1,jq2);
1524     qq13             = _mm256_mul_pd(iq1,jq3);
1525     qq21             = _mm256_mul_pd(iq2,jq1);
1526     qq22             = _mm256_mul_pd(iq2,jq2);
1527     qq23             = _mm256_mul_pd(iq2,jq3);
1528     qq31             = _mm256_mul_pd(iq3,jq1);
1529     qq32             = _mm256_mul_pd(iq3,jq2);
1530     qq33             = _mm256_mul_pd(iq3,jq3);
1531
1532     /* Avoid stupid compiler warnings */
1533     jnrA = jnrB = jnrC = jnrD = 0;
1534     j_coord_offsetA = 0;
1535     j_coord_offsetB = 0;
1536     j_coord_offsetC = 0;
1537     j_coord_offsetD = 0;
1538
1539     outeriter        = 0;
1540     inneriter        = 0;
1541
1542     for(iidx=0;iidx<4*DIM;iidx++)
1543     {
1544         scratch[iidx] = 0.0;
1545     }
1546
1547     /* Start outer loop over neighborlists */
1548     for(iidx=0; iidx<nri; iidx++)
1549     {
1550         /* Load shift vector for this list */
1551         i_shift_offset   = DIM*shiftidx[iidx];
1552
1553         /* Load limits for loop over neighbors */
1554         j_index_start    = jindex[iidx];
1555         j_index_end      = jindex[iidx+1];
1556
1557         /* Get outer coordinate index */
1558         inr              = iinr[iidx];
1559         i_coord_offset   = DIM*inr;
1560
1561         /* Load i particle coords and add shift vector */
1562         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1563                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1564
1565         fix0             = _mm256_setzero_pd();
1566         fiy0             = _mm256_setzero_pd();
1567         fiz0             = _mm256_setzero_pd();
1568         fix1             = _mm256_setzero_pd();
1569         fiy1             = _mm256_setzero_pd();
1570         fiz1             = _mm256_setzero_pd();
1571         fix2             = _mm256_setzero_pd();
1572         fiy2             = _mm256_setzero_pd();
1573         fiz2             = _mm256_setzero_pd();
1574         fix3             = _mm256_setzero_pd();
1575         fiy3             = _mm256_setzero_pd();
1576         fiz3             = _mm256_setzero_pd();
1577
1578         /* Start inner kernel loop */
1579         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1580         {
1581
1582             /* Get j neighbor index, and coordinate index */
1583             jnrA             = jjnr[jidx];
1584             jnrB             = jjnr[jidx+1];
1585             jnrC             = jjnr[jidx+2];
1586             jnrD             = jjnr[jidx+3];
1587             j_coord_offsetA  = DIM*jnrA;
1588             j_coord_offsetB  = DIM*jnrB;
1589             j_coord_offsetC  = DIM*jnrC;
1590             j_coord_offsetD  = DIM*jnrD;
1591
1592             /* load j atom coordinates */
1593             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1594                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1595                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1596                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1597
1598             /* Calculate displacement vector */
1599             dx00             = _mm256_sub_pd(ix0,jx0);
1600             dy00             = _mm256_sub_pd(iy0,jy0);
1601             dz00             = _mm256_sub_pd(iz0,jz0);
1602             dx11             = _mm256_sub_pd(ix1,jx1);
1603             dy11             = _mm256_sub_pd(iy1,jy1);
1604             dz11             = _mm256_sub_pd(iz1,jz1);
1605             dx12             = _mm256_sub_pd(ix1,jx2);
1606             dy12             = _mm256_sub_pd(iy1,jy2);
1607             dz12             = _mm256_sub_pd(iz1,jz2);
1608             dx13             = _mm256_sub_pd(ix1,jx3);
1609             dy13             = _mm256_sub_pd(iy1,jy3);
1610             dz13             = _mm256_sub_pd(iz1,jz3);
1611             dx21             = _mm256_sub_pd(ix2,jx1);
1612             dy21             = _mm256_sub_pd(iy2,jy1);
1613             dz21             = _mm256_sub_pd(iz2,jz1);
1614             dx22             = _mm256_sub_pd(ix2,jx2);
1615             dy22             = _mm256_sub_pd(iy2,jy2);
1616             dz22             = _mm256_sub_pd(iz2,jz2);
1617             dx23             = _mm256_sub_pd(ix2,jx3);
1618             dy23             = _mm256_sub_pd(iy2,jy3);
1619             dz23             = _mm256_sub_pd(iz2,jz3);
1620             dx31             = _mm256_sub_pd(ix3,jx1);
1621             dy31             = _mm256_sub_pd(iy3,jy1);
1622             dz31             = _mm256_sub_pd(iz3,jz1);
1623             dx32             = _mm256_sub_pd(ix3,jx2);
1624             dy32             = _mm256_sub_pd(iy3,jy2);
1625             dz32             = _mm256_sub_pd(iz3,jz2);
1626             dx33             = _mm256_sub_pd(ix3,jx3);
1627             dy33             = _mm256_sub_pd(iy3,jy3);
1628             dz33             = _mm256_sub_pd(iz3,jz3);
1629
1630             /* Calculate squared distance and things based on it */
1631             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1632             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1633             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1634             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1635             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1636             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1637             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1638             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1639             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1640             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1641
1642             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1643             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1644             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1645             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
1646             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1647             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1648             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
1649             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
1650             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
1651             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
1652
1653             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1654             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1655             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1656             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1657             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1658             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1659             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1660             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1661             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1662             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
1663
1664             fjx0             = _mm256_setzero_pd();
1665             fjy0             = _mm256_setzero_pd();
1666             fjz0             = _mm256_setzero_pd();
1667             fjx1             = _mm256_setzero_pd();
1668             fjy1             = _mm256_setzero_pd();
1669             fjz1             = _mm256_setzero_pd();
1670             fjx2             = _mm256_setzero_pd();
1671             fjy2             = _mm256_setzero_pd();
1672             fjz2             = _mm256_setzero_pd();
1673             fjx3             = _mm256_setzero_pd();
1674             fjy3             = _mm256_setzero_pd();
1675             fjz3             = _mm256_setzero_pd();
1676
1677             /**************************
1678              * CALCULATE INTERACTIONS *
1679              **************************/
1680
1681             r00              = _mm256_mul_pd(rsq00,rinv00);
1682
1683             /* Analytical LJ-PME */
1684             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1685             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
1686             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
1687             exponent         = gmx_simd_exp_d(ewcljrsq);
1688             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1689             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1690             /* f6A = 6 * C6grid * (1 - poly) */
1691             f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
1692             /* f6B = C6grid * exponent * beta^6 */
1693             f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
1694             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1695             fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1696
1697             fscal            = fvdw;
1698
1699             /* Calculate temporary vectorial force */
1700             tx               = _mm256_mul_pd(fscal,dx00);
1701             ty               = _mm256_mul_pd(fscal,dy00);
1702             tz               = _mm256_mul_pd(fscal,dz00);
1703
1704             /* Update vectorial force */
1705             fix0             = _mm256_add_pd(fix0,tx);
1706             fiy0             = _mm256_add_pd(fiy0,ty);
1707             fiz0             = _mm256_add_pd(fiz0,tz);
1708
1709             fjx0             = _mm256_add_pd(fjx0,tx);
1710             fjy0             = _mm256_add_pd(fjy0,ty);
1711             fjz0             = _mm256_add_pd(fjz0,tz);
1712
1713             /**************************
1714              * CALCULATE INTERACTIONS *
1715              **************************/
1716
1717             r11              = _mm256_mul_pd(rsq11,rinv11);
1718
1719             /* EWALD ELECTROSTATICS */
1720
1721             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1722             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1723             ewitab           = _mm256_cvttpd_epi32(ewrt);
1724             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1725             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1726                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1727                                             &ewtabF,&ewtabFn);
1728             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1729             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1730
1731             fscal            = felec;
1732
1733             /* Calculate temporary vectorial force */
1734             tx               = _mm256_mul_pd(fscal,dx11);
1735             ty               = _mm256_mul_pd(fscal,dy11);
1736             tz               = _mm256_mul_pd(fscal,dz11);
1737
1738             /* Update vectorial force */
1739             fix1             = _mm256_add_pd(fix1,tx);
1740             fiy1             = _mm256_add_pd(fiy1,ty);
1741             fiz1             = _mm256_add_pd(fiz1,tz);
1742
1743             fjx1             = _mm256_add_pd(fjx1,tx);
1744             fjy1             = _mm256_add_pd(fjy1,ty);
1745             fjz1             = _mm256_add_pd(fjz1,tz);
1746
1747             /**************************
1748              * CALCULATE INTERACTIONS *
1749              **************************/
1750
1751             r12              = _mm256_mul_pd(rsq12,rinv12);
1752
1753             /* EWALD ELECTROSTATICS */
1754
1755             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1756             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1757             ewitab           = _mm256_cvttpd_epi32(ewrt);
1758             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1759             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1760                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1761                                             &ewtabF,&ewtabFn);
1762             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1763             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1764
1765             fscal            = felec;
1766
1767             /* Calculate temporary vectorial force */
1768             tx               = _mm256_mul_pd(fscal,dx12);
1769             ty               = _mm256_mul_pd(fscal,dy12);
1770             tz               = _mm256_mul_pd(fscal,dz12);
1771
1772             /* Update vectorial force */
1773             fix1             = _mm256_add_pd(fix1,tx);
1774             fiy1             = _mm256_add_pd(fiy1,ty);
1775             fiz1             = _mm256_add_pd(fiz1,tz);
1776
1777             fjx2             = _mm256_add_pd(fjx2,tx);
1778             fjy2             = _mm256_add_pd(fjy2,ty);
1779             fjz2             = _mm256_add_pd(fjz2,tz);
1780
1781             /**************************
1782              * CALCULATE INTERACTIONS *
1783              **************************/
1784
1785             r13              = _mm256_mul_pd(rsq13,rinv13);
1786
1787             /* EWALD ELECTROSTATICS */
1788
1789             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1790             ewrt             = _mm256_mul_pd(r13,ewtabscale);
1791             ewitab           = _mm256_cvttpd_epi32(ewrt);
1792             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1793             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1794                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1795                                             &ewtabF,&ewtabFn);
1796             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1797             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1798
1799             fscal            = felec;
1800
1801             /* Calculate temporary vectorial force */
1802             tx               = _mm256_mul_pd(fscal,dx13);
1803             ty               = _mm256_mul_pd(fscal,dy13);
1804             tz               = _mm256_mul_pd(fscal,dz13);
1805
1806             /* Update vectorial force */
1807             fix1             = _mm256_add_pd(fix1,tx);
1808             fiy1             = _mm256_add_pd(fiy1,ty);
1809             fiz1             = _mm256_add_pd(fiz1,tz);
1810
1811             fjx3             = _mm256_add_pd(fjx3,tx);
1812             fjy3             = _mm256_add_pd(fjy3,ty);
1813             fjz3             = _mm256_add_pd(fjz3,tz);
1814
1815             /**************************
1816              * CALCULATE INTERACTIONS *
1817              **************************/
1818
1819             r21              = _mm256_mul_pd(rsq21,rinv21);
1820
1821             /* EWALD ELECTROSTATICS */
1822
1823             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1824             ewrt             = _mm256_mul_pd(r21,ewtabscale);
1825             ewitab           = _mm256_cvttpd_epi32(ewrt);
1826             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1827             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1828                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1829                                             &ewtabF,&ewtabFn);
1830             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1831             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1832
1833             fscal            = felec;
1834
1835             /* Calculate temporary vectorial force */
1836             tx               = _mm256_mul_pd(fscal,dx21);
1837             ty               = _mm256_mul_pd(fscal,dy21);
1838             tz               = _mm256_mul_pd(fscal,dz21);
1839
1840             /* Update vectorial force */
1841             fix2             = _mm256_add_pd(fix2,tx);
1842             fiy2             = _mm256_add_pd(fiy2,ty);
1843             fiz2             = _mm256_add_pd(fiz2,tz);
1844
1845             fjx1             = _mm256_add_pd(fjx1,tx);
1846             fjy1             = _mm256_add_pd(fjy1,ty);
1847             fjz1             = _mm256_add_pd(fjz1,tz);
1848
1849             /**************************
1850              * CALCULATE INTERACTIONS *
1851              **************************/
1852
1853             r22              = _mm256_mul_pd(rsq22,rinv22);
1854
1855             /* EWALD ELECTROSTATICS */
1856
1857             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1858             ewrt             = _mm256_mul_pd(r22,ewtabscale);
1859             ewitab           = _mm256_cvttpd_epi32(ewrt);
1860             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1861             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1862                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1863                                             &ewtabF,&ewtabFn);
1864             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1865             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1866
1867             fscal            = felec;
1868
1869             /* Calculate temporary vectorial force */
1870             tx               = _mm256_mul_pd(fscal,dx22);
1871             ty               = _mm256_mul_pd(fscal,dy22);
1872             tz               = _mm256_mul_pd(fscal,dz22);
1873
1874             /* Update vectorial force */
1875             fix2             = _mm256_add_pd(fix2,tx);
1876             fiy2             = _mm256_add_pd(fiy2,ty);
1877             fiz2             = _mm256_add_pd(fiz2,tz);
1878
1879             fjx2             = _mm256_add_pd(fjx2,tx);
1880             fjy2             = _mm256_add_pd(fjy2,ty);
1881             fjz2             = _mm256_add_pd(fjz2,tz);
1882
1883             /**************************
1884              * CALCULATE INTERACTIONS *
1885              **************************/
1886
1887             r23              = _mm256_mul_pd(rsq23,rinv23);
1888
1889             /* EWALD ELECTROSTATICS */
1890
1891             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1892             ewrt             = _mm256_mul_pd(r23,ewtabscale);
1893             ewitab           = _mm256_cvttpd_epi32(ewrt);
1894             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1895             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1896                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1897                                             &ewtabF,&ewtabFn);
1898             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1899             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1900
1901             fscal            = felec;
1902
1903             /* Calculate temporary vectorial force */
1904             tx               = _mm256_mul_pd(fscal,dx23);
1905             ty               = _mm256_mul_pd(fscal,dy23);
1906             tz               = _mm256_mul_pd(fscal,dz23);
1907
1908             /* Update vectorial force */
1909             fix2             = _mm256_add_pd(fix2,tx);
1910             fiy2             = _mm256_add_pd(fiy2,ty);
1911             fiz2             = _mm256_add_pd(fiz2,tz);
1912
1913             fjx3             = _mm256_add_pd(fjx3,tx);
1914             fjy3             = _mm256_add_pd(fjy3,ty);
1915             fjz3             = _mm256_add_pd(fjz3,tz);
1916
1917             /**************************
1918              * CALCULATE INTERACTIONS *
1919              **************************/
1920
1921             r31              = _mm256_mul_pd(rsq31,rinv31);
1922
1923             /* EWALD ELECTROSTATICS */
1924
1925             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1926             ewrt             = _mm256_mul_pd(r31,ewtabscale);
1927             ewitab           = _mm256_cvttpd_epi32(ewrt);
1928             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1929             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1930                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1931                                             &ewtabF,&ewtabFn);
1932             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1933             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1934
1935             fscal            = felec;
1936
1937             /* Calculate temporary vectorial force */
1938             tx               = _mm256_mul_pd(fscal,dx31);
1939             ty               = _mm256_mul_pd(fscal,dy31);
1940             tz               = _mm256_mul_pd(fscal,dz31);
1941
1942             /* Update vectorial force */
1943             fix3             = _mm256_add_pd(fix3,tx);
1944             fiy3             = _mm256_add_pd(fiy3,ty);
1945             fiz3             = _mm256_add_pd(fiz3,tz);
1946
1947             fjx1             = _mm256_add_pd(fjx1,tx);
1948             fjy1             = _mm256_add_pd(fjy1,ty);
1949             fjz1             = _mm256_add_pd(fjz1,tz);
1950
1951             /**************************
1952              * CALCULATE INTERACTIONS *
1953              **************************/
1954
1955             r32              = _mm256_mul_pd(rsq32,rinv32);
1956
1957             /* EWALD ELECTROSTATICS */
1958
1959             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1960             ewrt             = _mm256_mul_pd(r32,ewtabscale);
1961             ewitab           = _mm256_cvttpd_epi32(ewrt);
1962             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1963             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1964                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1965                                             &ewtabF,&ewtabFn);
1966             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1967             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1968
1969             fscal            = felec;
1970
1971             /* Calculate temporary vectorial force */
1972             tx               = _mm256_mul_pd(fscal,dx32);
1973             ty               = _mm256_mul_pd(fscal,dy32);
1974             tz               = _mm256_mul_pd(fscal,dz32);
1975
1976             /* Update vectorial force */
1977             fix3             = _mm256_add_pd(fix3,tx);
1978             fiy3             = _mm256_add_pd(fiy3,ty);
1979             fiz3             = _mm256_add_pd(fiz3,tz);
1980
1981             fjx2             = _mm256_add_pd(fjx2,tx);
1982             fjy2             = _mm256_add_pd(fjy2,ty);
1983             fjz2             = _mm256_add_pd(fjz2,tz);
1984
1985             /**************************
1986              * CALCULATE INTERACTIONS *
1987              **************************/
1988
1989             r33              = _mm256_mul_pd(rsq33,rinv33);
1990
1991             /* EWALD ELECTROSTATICS */
1992
1993             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1994             ewrt             = _mm256_mul_pd(r33,ewtabscale);
1995             ewitab           = _mm256_cvttpd_epi32(ewrt);
1996             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1997             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1998                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1999                                             &ewtabF,&ewtabFn);
2000             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2001             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2002
2003             fscal            = felec;
2004
2005             /* Calculate temporary vectorial force */
2006             tx               = _mm256_mul_pd(fscal,dx33);
2007             ty               = _mm256_mul_pd(fscal,dy33);
2008             tz               = _mm256_mul_pd(fscal,dz33);
2009
2010             /* Update vectorial force */
2011             fix3             = _mm256_add_pd(fix3,tx);
2012             fiy3             = _mm256_add_pd(fiy3,ty);
2013             fiz3             = _mm256_add_pd(fiz3,tz);
2014
2015             fjx3             = _mm256_add_pd(fjx3,tx);
2016             fjy3             = _mm256_add_pd(fjy3,ty);
2017             fjz3             = _mm256_add_pd(fjz3,tz);
2018
2019             fjptrA             = f+j_coord_offsetA;
2020             fjptrB             = f+j_coord_offsetB;
2021             fjptrC             = f+j_coord_offsetC;
2022             fjptrD             = f+j_coord_offsetD;
2023
2024             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2025                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2026                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2027
2028             /* Inner loop uses 373 flops */
2029         }
2030
2031         if(jidx<j_index_end)
2032         {
2033
2034             /* Get j neighbor index, and coordinate index */
2035             jnrlistA         = jjnr[jidx];
2036             jnrlistB         = jjnr[jidx+1];
2037             jnrlistC         = jjnr[jidx+2];
2038             jnrlistD         = jjnr[jidx+3];
2039             /* Sign of each element will be negative for non-real atoms.
2040              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2041              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2042              */
2043             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2044
2045             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2046             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2047             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2048
2049             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2050             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2051             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2052             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2053             j_coord_offsetA  = DIM*jnrA;
2054             j_coord_offsetB  = DIM*jnrB;
2055             j_coord_offsetC  = DIM*jnrC;
2056             j_coord_offsetD  = DIM*jnrD;
2057
2058             /* load j atom coordinates */
2059             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2060                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2061                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2062                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
2063
2064             /* Calculate displacement vector */
2065             dx00             = _mm256_sub_pd(ix0,jx0);
2066             dy00             = _mm256_sub_pd(iy0,jy0);
2067             dz00             = _mm256_sub_pd(iz0,jz0);
2068             dx11             = _mm256_sub_pd(ix1,jx1);
2069             dy11             = _mm256_sub_pd(iy1,jy1);
2070             dz11             = _mm256_sub_pd(iz1,jz1);
2071             dx12             = _mm256_sub_pd(ix1,jx2);
2072             dy12             = _mm256_sub_pd(iy1,jy2);
2073             dz12             = _mm256_sub_pd(iz1,jz2);
2074             dx13             = _mm256_sub_pd(ix1,jx3);
2075             dy13             = _mm256_sub_pd(iy1,jy3);
2076             dz13             = _mm256_sub_pd(iz1,jz3);
2077             dx21             = _mm256_sub_pd(ix2,jx1);
2078             dy21             = _mm256_sub_pd(iy2,jy1);
2079             dz21             = _mm256_sub_pd(iz2,jz1);
2080             dx22             = _mm256_sub_pd(ix2,jx2);
2081             dy22             = _mm256_sub_pd(iy2,jy2);
2082             dz22             = _mm256_sub_pd(iz2,jz2);
2083             dx23             = _mm256_sub_pd(ix2,jx3);
2084             dy23             = _mm256_sub_pd(iy2,jy3);
2085             dz23             = _mm256_sub_pd(iz2,jz3);
2086             dx31             = _mm256_sub_pd(ix3,jx1);
2087             dy31             = _mm256_sub_pd(iy3,jy1);
2088             dz31             = _mm256_sub_pd(iz3,jz1);
2089             dx32             = _mm256_sub_pd(ix3,jx2);
2090             dy32             = _mm256_sub_pd(iy3,jy2);
2091             dz32             = _mm256_sub_pd(iz3,jz2);
2092             dx33             = _mm256_sub_pd(ix3,jx3);
2093             dy33             = _mm256_sub_pd(iy3,jy3);
2094             dz33             = _mm256_sub_pd(iz3,jz3);
2095
2096             /* Calculate squared distance and things based on it */
2097             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2098             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2099             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2100             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2101             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2102             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2103             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2104             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2105             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2106             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2107
2108             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
2109             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
2110             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
2111             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
2112             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
2113             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
2114             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
2115             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
2116             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
2117             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
2118
2119             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
2120             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
2121             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
2122             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
2123             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
2124             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
2125             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
2126             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
2127             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
2128             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
2129
2130             fjx0             = _mm256_setzero_pd();
2131             fjy0             = _mm256_setzero_pd();
2132             fjz0             = _mm256_setzero_pd();
2133             fjx1             = _mm256_setzero_pd();
2134             fjy1             = _mm256_setzero_pd();
2135             fjz1             = _mm256_setzero_pd();
2136             fjx2             = _mm256_setzero_pd();
2137             fjy2             = _mm256_setzero_pd();
2138             fjz2             = _mm256_setzero_pd();
2139             fjx3             = _mm256_setzero_pd();
2140             fjy3             = _mm256_setzero_pd();
2141             fjz3             = _mm256_setzero_pd();
2142
2143             /**************************
2144              * CALCULATE INTERACTIONS *
2145              **************************/
2146
2147             r00              = _mm256_mul_pd(rsq00,rinv00);
2148             r00              = _mm256_andnot_pd(dummy_mask,r00);
2149
2150             /* Analytical LJ-PME */
2151             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2152             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
2153             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
2154             exponent         = gmx_simd_exp_d(ewcljrsq);
2155             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2156             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
2157             /* f6A = 6 * C6grid * (1 - poly) */
2158             f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
2159             /* f6B = C6grid * exponent * beta^6 */
2160             f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
2161             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2162             fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2163
2164             fscal            = fvdw;
2165
2166             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2167
2168             /* Calculate temporary vectorial force */
2169             tx               = _mm256_mul_pd(fscal,dx00);
2170             ty               = _mm256_mul_pd(fscal,dy00);
2171             tz               = _mm256_mul_pd(fscal,dz00);
2172
2173             /* Update vectorial force */
2174             fix0             = _mm256_add_pd(fix0,tx);
2175             fiy0             = _mm256_add_pd(fiy0,ty);
2176             fiz0             = _mm256_add_pd(fiz0,tz);
2177
2178             fjx0             = _mm256_add_pd(fjx0,tx);
2179             fjy0             = _mm256_add_pd(fjy0,ty);
2180             fjz0             = _mm256_add_pd(fjz0,tz);
2181
2182             /**************************
2183              * CALCULATE INTERACTIONS *
2184              **************************/
2185
2186             r11              = _mm256_mul_pd(rsq11,rinv11);
2187             r11              = _mm256_andnot_pd(dummy_mask,r11);
2188
2189             /* EWALD ELECTROSTATICS */
2190
2191             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2192             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2193             ewitab           = _mm256_cvttpd_epi32(ewrt);
2194             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2195             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2196                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2197                                             &ewtabF,&ewtabFn);
2198             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2199             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2200
2201             fscal            = felec;
2202
2203             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2204
2205             /* Calculate temporary vectorial force */
2206             tx               = _mm256_mul_pd(fscal,dx11);
2207             ty               = _mm256_mul_pd(fscal,dy11);
2208             tz               = _mm256_mul_pd(fscal,dz11);
2209
2210             /* Update vectorial force */
2211             fix1             = _mm256_add_pd(fix1,tx);
2212             fiy1             = _mm256_add_pd(fiy1,ty);
2213             fiz1             = _mm256_add_pd(fiz1,tz);
2214
2215             fjx1             = _mm256_add_pd(fjx1,tx);
2216             fjy1             = _mm256_add_pd(fjy1,ty);
2217             fjz1             = _mm256_add_pd(fjz1,tz);
2218
2219             /**************************
2220              * CALCULATE INTERACTIONS *
2221              **************************/
2222
2223             r12              = _mm256_mul_pd(rsq12,rinv12);
2224             r12              = _mm256_andnot_pd(dummy_mask,r12);
2225
2226             /* EWALD ELECTROSTATICS */
2227
2228             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2229             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2230             ewitab           = _mm256_cvttpd_epi32(ewrt);
2231             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2232             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2233                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2234                                             &ewtabF,&ewtabFn);
2235             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2236             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2237
2238             fscal            = felec;
2239
2240             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2241
2242             /* Calculate temporary vectorial force */
2243             tx               = _mm256_mul_pd(fscal,dx12);
2244             ty               = _mm256_mul_pd(fscal,dy12);
2245             tz               = _mm256_mul_pd(fscal,dz12);
2246
2247             /* Update vectorial force */
2248             fix1             = _mm256_add_pd(fix1,tx);
2249             fiy1             = _mm256_add_pd(fiy1,ty);
2250             fiz1             = _mm256_add_pd(fiz1,tz);
2251
2252             fjx2             = _mm256_add_pd(fjx2,tx);
2253             fjy2             = _mm256_add_pd(fjy2,ty);
2254             fjz2             = _mm256_add_pd(fjz2,tz);
2255
2256             /**************************
2257              * CALCULATE INTERACTIONS *
2258              **************************/
2259
2260             r13              = _mm256_mul_pd(rsq13,rinv13);
2261             r13              = _mm256_andnot_pd(dummy_mask,r13);
2262
2263             /* EWALD ELECTROSTATICS */
2264
2265             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2266             ewrt             = _mm256_mul_pd(r13,ewtabscale);
2267             ewitab           = _mm256_cvttpd_epi32(ewrt);
2268             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2269             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2270                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2271                                             &ewtabF,&ewtabFn);
2272             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2273             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2274
2275             fscal            = felec;
2276
2277             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2278
2279             /* Calculate temporary vectorial force */
2280             tx               = _mm256_mul_pd(fscal,dx13);
2281             ty               = _mm256_mul_pd(fscal,dy13);
2282             tz               = _mm256_mul_pd(fscal,dz13);
2283
2284             /* Update vectorial force */
2285             fix1             = _mm256_add_pd(fix1,tx);
2286             fiy1             = _mm256_add_pd(fiy1,ty);
2287             fiz1             = _mm256_add_pd(fiz1,tz);
2288
2289             fjx3             = _mm256_add_pd(fjx3,tx);
2290             fjy3             = _mm256_add_pd(fjy3,ty);
2291             fjz3             = _mm256_add_pd(fjz3,tz);
2292
2293             /**************************
2294              * CALCULATE INTERACTIONS *
2295              **************************/
2296
2297             r21              = _mm256_mul_pd(rsq21,rinv21);
2298             r21              = _mm256_andnot_pd(dummy_mask,r21);
2299
2300             /* EWALD ELECTROSTATICS */
2301
2302             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2303             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2304             ewitab           = _mm256_cvttpd_epi32(ewrt);
2305             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2306             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2307                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2308                                             &ewtabF,&ewtabFn);
2309             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2310             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2311
2312             fscal            = felec;
2313
2314             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2315
2316             /* Calculate temporary vectorial force */
2317             tx               = _mm256_mul_pd(fscal,dx21);
2318             ty               = _mm256_mul_pd(fscal,dy21);
2319             tz               = _mm256_mul_pd(fscal,dz21);
2320
2321             /* Update vectorial force */
2322             fix2             = _mm256_add_pd(fix2,tx);
2323             fiy2             = _mm256_add_pd(fiy2,ty);
2324             fiz2             = _mm256_add_pd(fiz2,tz);
2325
2326             fjx1             = _mm256_add_pd(fjx1,tx);
2327             fjy1             = _mm256_add_pd(fjy1,ty);
2328             fjz1             = _mm256_add_pd(fjz1,tz);
2329
2330             /**************************
2331              * CALCULATE INTERACTIONS *
2332              **************************/
2333
2334             r22              = _mm256_mul_pd(rsq22,rinv22);
2335             r22              = _mm256_andnot_pd(dummy_mask,r22);
2336
2337             /* EWALD ELECTROSTATICS */
2338
2339             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2340             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2341             ewitab           = _mm256_cvttpd_epi32(ewrt);
2342             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2343             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2344                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2345                                             &ewtabF,&ewtabFn);
2346             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2347             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2348
2349             fscal            = felec;
2350
2351             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2352
2353             /* Calculate temporary vectorial force */
2354             tx               = _mm256_mul_pd(fscal,dx22);
2355             ty               = _mm256_mul_pd(fscal,dy22);
2356             tz               = _mm256_mul_pd(fscal,dz22);
2357
2358             /* Update vectorial force */
2359             fix2             = _mm256_add_pd(fix2,tx);
2360             fiy2             = _mm256_add_pd(fiy2,ty);
2361             fiz2             = _mm256_add_pd(fiz2,tz);
2362
2363             fjx2             = _mm256_add_pd(fjx2,tx);
2364             fjy2             = _mm256_add_pd(fjy2,ty);
2365             fjz2             = _mm256_add_pd(fjz2,tz);
2366
2367             /**************************
2368              * CALCULATE INTERACTIONS *
2369              **************************/
2370
2371             r23              = _mm256_mul_pd(rsq23,rinv23);
2372             r23              = _mm256_andnot_pd(dummy_mask,r23);
2373
2374             /* EWALD ELECTROSTATICS */
2375
2376             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2377             ewrt             = _mm256_mul_pd(r23,ewtabscale);
2378             ewitab           = _mm256_cvttpd_epi32(ewrt);
2379             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2380             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2381                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2382                                             &ewtabF,&ewtabFn);
2383             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2384             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2385
2386             fscal            = felec;
2387
2388             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2389
2390             /* Calculate temporary vectorial force */
2391             tx               = _mm256_mul_pd(fscal,dx23);
2392             ty               = _mm256_mul_pd(fscal,dy23);
2393             tz               = _mm256_mul_pd(fscal,dz23);
2394
2395             /* Update vectorial force */
2396             fix2             = _mm256_add_pd(fix2,tx);
2397             fiy2             = _mm256_add_pd(fiy2,ty);
2398             fiz2             = _mm256_add_pd(fiz2,tz);
2399
2400             fjx3             = _mm256_add_pd(fjx3,tx);
2401             fjy3             = _mm256_add_pd(fjy3,ty);
2402             fjz3             = _mm256_add_pd(fjz3,tz);
2403
2404             /**************************
2405              * CALCULATE INTERACTIONS *
2406              **************************/
2407
2408             r31              = _mm256_mul_pd(rsq31,rinv31);
2409             r31              = _mm256_andnot_pd(dummy_mask,r31);
2410
2411             /* EWALD ELECTROSTATICS */
2412
2413             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2414             ewrt             = _mm256_mul_pd(r31,ewtabscale);
2415             ewitab           = _mm256_cvttpd_epi32(ewrt);
2416             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2417             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2418                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2419                                             &ewtabF,&ewtabFn);
2420             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2421             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2422
2423             fscal            = felec;
2424
2425             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2426
2427             /* Calculate temporary vectorial force */
2428             tx               = _mm256_mul_pd(fscal,dx31);
2429             ty               = _mm256_mul_pd(fscal,dy31);
2430             tz               = _mm256_mul_pd(fscal,dz31);
2431
2432             /* Update vectorial force */
2433             fix3             = _mm256_add_pd(fix3,tx);
2434             fiy3             = _mm256_add_pd(fiy3,ty);
2435             fiz3             = _mm256_add_pd(fiz3,tz);
2436
2437             fjx1             = _mm256_add_pd(fjx1,tx);
2438             fjy1             = _mm256_add_pd(fjy1,ty);
2439             fjz1             = _mm256_add_pd(fjz1,tz);
2440
2441             /**************************
2442              * CALCULATE INTERACTIONS *
2443              **************************/
2444
2445             r32              = _mm256_mul_pd(rsq32,rinv32);
2446             r32              = _mm256_andnot_pd(dummy_mask,r32);
2447
2448             /* EWALD ELECTROSTATICS */
2449
2450             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2451             ewrt             = _mm256_mul_pd(r32,ewtabscale);
2452             ewitab           = _mm256_cvttpd_epi32(ewrt);
2453             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2454             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2455                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2456                                             &ewtabF,&ewtabFn);
2457             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2458             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2459
2460             fscal            = felec;
2461
2462             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2463
2464             /* Calculate temporary vectorial force */
2465             tx               = _mm256_mul_pd(fscal,dx32);
2466             ty               = _mm256_mul_pd(fscal,dy32);
2467             tz               = _mm256_mul_pd(fscal,dz32);
2468
2469             /* Update vectorial force */
2470             fix3             = _mm256_add_pd(fix3,tx);
2471             fiy3             = _mm256_add_pd(fiy3,ty);
2472             fiz3             = _mm256_add_pd(fiz3,tz);
2473
2474             fjx2             = _mm256_add_pd(fjx2,tx);
2475             fjy2             = _mm256_add_pd(fjy2,ty);
2476             fjz2             = _mm256_add_pd(fjz2,tz);
2477
2478             /**************************
2479              * CALCULATE INTERACTIONS *
2480              **************************/
2481
2482             r33              = _mm256_mul_pd(rsq33,rinv33);
2483             r33              = _mm256_andnot_pd(dummy_mask,r33);
2484
2485             /* EWALD ELECTROSTATICS */
2486
2487             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2488             ewrt             = _mm256_mul_pd(r33,ewtabscale);
2489             ewitab           = _mm256_cvttpd_epi32(ewrt);
2490             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2491             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2492                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2493                                             &ewtabF,&ewtabFn);
2494             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2495             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2496
2497             fscal            = felec;
2498
2499             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2500
2501             /* Calculate temporary vectorial force */
2502             tx               = _mm256_mul_pd(fscal,dx33);
2503             ty               = _mm256_mul_pd(fscal,dy33);
2504             tz               = _mm256_mul_pd(fscal,dz33);
2505
2506             /* Update vectorial force */
2507             fix3             = _mm256_add_pd(fix3,tx);
2508             fiy3             = _mm256_add_pd(fiy3,ty);
2509             fiz3             = _mm256_add_pd(fiz3,tz);
2510
2511             fjx3             = _mm256_add_pd(fjx3,tx);
2512             fjy3             = _mm256_add_pd(fjy3,ty);
2513             fjz3             = _mm256_add_pd(fjz3,tz);
2514
2515             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2516             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2517             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2518             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2519
2520             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2521                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2522                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2523
2524             /* Inner loop uses 383 flops */
2525         }
2526
2527         /* End of innermost loop */
2528
2529         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2530                                                  f+i_coord_offset,fshift+i_shift_offset);
2531
2532         /* Increment number of inner iterations */
2533         inneriter                  += j_index_end - j_index_start;
2534
2535         /* Outer loop uses 24 flops */
2536     }
2537
2538     /* Increment number of outer iterations */
2539     outeriter        += nri;
2540
2541     /* Update outer/inner flops */
2542
2543     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*383);
2544 }