Valgrind suppression for OS X 10.9
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_double
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            LJEwald
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_double
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
81     real             rcutoff_scalar;
82     real             *shiftvec,*fshift,*x,*f;
83     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
84     real             scratch[4*DIM];
85     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86     real *           vdwioffsetptr0;
87     real *           vdwgridioffsetptr0;
88     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
89     real *           vdwioffsetptr1;
90     real *           vdwgridioffsetptr1;
91     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92     real *           vdwioffsetptr2;
93     real *           vdwgridioffsetptr2;
94     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
95     real *           vdwioffsetptr3;
96     real *           vdwgridioffsetptr3;
97     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
98     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
99     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
100     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
101     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
102     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
103     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
104     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
105     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
106     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
107     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
108     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
109     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
110     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
111     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
112     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
113     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
114     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
115     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
116     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
117     real             *charge;
118     int              nvdwtype;
119     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
120     int              *vdwtype;
121     real             *vdwparam;
122     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
123     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
124     __m256d           c6grid_00;
125     __m256d           c6grid_11;
126     __m256d           c6grid_12;
127     __m256d           c6grid_13;
128     __m256d           c6grid_21;
129     __m256d           c6grid_22;
130     __m256d           c6grid_23;
131     __m256d           c6grid_31;
132     __m256d           c6grid_32;
133     __m256d           c6grid_33;
134     real             *vdwgridparam;
135     __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
136     __m256d           one_half  = _mm256_set1_pd(0.5);
137     __m256d           minus_one = _mm256_set1_pd(-1.0);
138     __m128i          ewitab;
139     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
140     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
141     real             *ewtab;
142     __m256d          dummy_mask,cutoff_mask;
143     __m128           tmpmask0,tmpmask1;
144     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
145     __m256d          one     = _mm256_set1_pd(1.0);
146     __m256d          two     = _mm256_set1_pd(2.0);
147     x                = xx[0];
148     f                = ff[0];
149
150     nri              = nlist->nri;
151     iinr             = nlist->iinr;
152     jindex           = nlist->jindex;
153     jjnr             = nlist->jjnr;
154     shiftidx         = nlist->shift;
155     gid              = nlist->gid;
156     shiftvec         = fr->shift_vec[0];
157     fshift           = fr->fshift[0];
158     facel            = _mm256_set1_pd(fr->epsfac);
159     charge           = mdatoms->chargeA;
160     nvdwtype         = fr->ntype;
161     vdwparam         = fr->nbfp;
162     vdwtype          = mdatoms->typeA;
163     vdwgridparam     = fr->ljpme_c6grid;
164     sh_lj_ewald      = _mm256_set1_pd(fr->ic->sh_lj_ewald);
165     ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
166     ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
167
168     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
169     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
170     beta2            = _mm256_mul_pd(beta,beta);
171     beta3            = _mm256_mul_pd(beta,beta2);
172
173     ewtab            = fr->ic->tabq_coul_FDV0;
174     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
175     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
176
177     /* Setup water-specific parameters */
178     inr              = nlist->iinr[0];
179     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
180     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
181     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
182     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
183     vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
184
185     jq1              = _mm256_set1_pd(charge[inr+1]);
186     jq2              = _mm256_set1_pd(charge[inr+2]);
187     jq3              = _mm256_set1_pd(charge[inr+3]);
188     vdwjidx0A        = 2*vdwtype[inr+0];
189     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
190     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
191     c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
192     qq11             = _mm256_mul_pd(iq1,jq1);
193     qq12             = _mm256_mul_pd(iq1,jq2);
194     qq13             = _mm256_mul_pd(iq1,jq3);
195     qq21             = _mm256_mul_pd(iq2,jq1);
196     qq22             = _mm256_mul_pd(iq2,jq2);
197     qq23             = _mm256_mul_pd(iq2,jq3);
198     qq31             = _mm256_mul_pd(iq3,jq1);
199     qq32             = _mm256_mul_pd(iq3,jq2);
200     qq33             = _mm256_mul_pd(iq3,jq3);
201
202     /* Avoid stupid compiler warnings */
203     jnrA = jnrB = jnrC = jnrD = 0;
204     j_coord_offsetA = 0;
205     j_coord_offsetB = 0;
206     j_coord_offsetC = 0;
207     j_coord_offsetD = 0;
208
209     outeriter        = 0;
210     inneriter        = 0;
211
212     for(iidx=0;iidx<4*DIM;iidx++)
213     {
214         scratch[iidx] = 0.0;
215     }
216
217     /* Start outer loop over neighborlists */
218     for(iidx=0; iidx<nri; iidx++)
219     {
220         /* Load shift vector for this list */
221         i_shift_offset   = DIM*shiftidx[iidx];
222
223         /* Load limits for loop over neighbors */
224         j_index_start    = jindex[iidx];
225         j_index_end      = jindex[iidx+1];
226
227         /* Get outer coordinate index */
228         inr              = iinr[iidx];
229         i_coord_offset   = DIM*inr;
230
231         /* Load i particle coords and add shift vector */
232         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
233                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
234
235         fix0             = _mm256_setzero_pd();
236         fiy0             = _mm256_setzero_pd();
237         fiz0             = _mm256_setzero_pd();
238         fix1             = _mm256_setzero_pd();
239         fiy1             = _mm256_setzero_pd();
240         fiz1             = _mm256_setzero_pd();
241         fix2             = _mm256_setzero_pd();
242         fiy2             = _mm256_setzero_pd();
243         fiz2             = _mm256_setzero_pd();
244         fix3             = _mm256_setzero_pd();
245         fiy3             = _mm256_setzero_pd();
246         fiz3             = _mm256_setzero_pd();
247
248         /* Reset potential sums */
249         velecsum         = _mm256_setzero_pd();
250         vvdwsum          = _mm256_setzero_pd();
251
252         /* Start inner kernel loop */
253         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
254         {
255
256             /* Get j neighbor index, and coordinate index */
257             jnrA             = jjnr[jidx];
258             jnrB             = jjnr[jidx+1];
259             jnrC             = jjnr[jidx+2];
260             jnrD             = jjnr[jidx+3];
261             j_coord_offsetA  = DIM*jnrA;
262             j_coord_offsetB  = DIM*jnrB;
263             j_coord_offsetC  = DIM*jnrC;
264             j_coord_offsetD  = DIM*jnrD;
265
266             /* load j atom coordinates */
267             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
268                                                  x+j_coord_offsetC,x+j_coord_offsetD,
269                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
270                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
271
272             /* Calculate displacement vector */
273             dx00             = _mm256_sub_pd(ix0,jx0);
274             dy00             = _mm256_sub_pd(iy0,jy0);
275             dz00             = _mm256_sub_pd(iz0,jz0);
276             dx11             = _mm256_sub_pd(ix1,jx1);
277             dy11             = _mm256_sub_pd(iy1,jy1);
278             dz11             = _mm256_sub_pd(iz1,jz1);
279             dx12             = _mm256_sub_pd(ix1,jx2);
280             dy12             = _mm256_sub_pd(iy1,jy2);
281             dz12             = _mm256_sub_pd(iz1,jz2);
282             dx13             = _mm256_sub_pd(ix1,jx3);
283             dy13             = _mm256_sub_pd(iy1,jy3);
284             dz13             = _mm256_sub_pd(iz1,jz3);
285             dx21             = _mm256_sub_pd(ix2,jx1);
286             dy21             = _mm256_sub_pd(iy2,jy1);
287             dz21             = _mm256_sub_pd(iz2,jz1);
288             dx22             = _mm256_sub_pd(ix2,jx2);
289             dy22             = _mm256_sub_pd(iy2,jy2);
290             dz22             = _mm256_sub_pd(iz2,jz2);
291             dx23             = _mm256_sub_pd(ix2,jx3);
292             dy23             = _mm256_sub_pd(iy2,jy3);
293             dz23             = _mm256_sub_pd(iz2,jz3);
294             dx31             = _mm256_sub_pd(ix3,jx1);
295             dy31             = _mm256_sub_pd(iy3,jy1);
296             dz31             = _mm256_sub_pd(iz3,jz1);
297             dx32             = _mm256_sub_pd(ix3,jx2);
298             dy32             = _mm256_sub_pd(iy3,jy2);
299             dz32             = _mm256_sub_pd(iz3,jz2);
300             dx33             = _mm256_sub_pd(ix3,jx3);
301             dy33             = _mm256_sub_pd(iy3,jy3);
302             dz33             = _mm256_sub_pd(iz3,jz3);
303
304             /* Calculate squared distance and things based on it */
305             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
306             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
307             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
308             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
309             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
310             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
311             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
312             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
313             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
314             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
315
316             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
317             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
318             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
319             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
320             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
321             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
322             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
323             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
324             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
325             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
326
327             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
328             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
329             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
330             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
331             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
332             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
333             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
334             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
335             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
336             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
337
338             fjx0             = _mm256_setzero_pd();
339             fjy0             = _mm256_setzero_pd();
340             fjz0             = _mm256_setzero_pd();
341             fjx1             = _mm256_setzero_pd();
342             fjy1             = _mm256_setzero_pd();
343             fjz1             = _mm256_setzero_pd();
344             fjx2             = _mm256_setzero_pd();
345             fjy2             = _mm256_setzero_pd();
346             fjz2             = _mm256_setzero_pd();
347             fjx3             = _mm256_setzero_pd();
348             fjy3             = _mm256_setzero_pd();
349             fjz3             = _mm256_setzero_pd();
350
351             /**************************
352              * CALCULATE INTERACTIONS *
353              **************************/
354
355             r00              = _mm256_mul_pd(rsq00,rinv00);
356
357             /* Analytical LJ-PME */
358             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
359             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
360             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
361             exponent         = gmx_simd_exp_d(ewcljrsq);
362             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
363             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
364             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
365             vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
366             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
367             vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
368             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
369             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
370
371             /* Update potential sum for this i atom from the interaction with this j atom. */
372             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
373
374             fscal            = fvdw;
375
376             /* Calculate temporary vectorial force */
377             tx               = _mm256_mul_pd(fscal,dx00);
378             ty               = _mm256_mul_pd(fscal,dy00);
379             tz               = _mm256_mul_pd(fscal,dz00);
380
381             /* Update vectorial force */
382             fix0             = _mm256_add_pd(fix0,tx);
383             fiy0             = _mm256_add_pd(fiy0,ty);
384             fiz0             = _mm256_add_pd(fiz0,tz);
385
386             fjx0             = _mm256_add_pd(fjx0,tx);
387             fjy0             = _mm256_add_pd(fjy0,ty);
388             fjz0             = _mm256_add_pd(fjz0,tz);
389
390             /**************************
391              * CALCULATE INTERACTIONS *
392              **************************/
393
394             r11              = _mm256_mul_pd(rsq11,rinv11);
395
396             /* EWALD ELECTROSTATICS */
397
398             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
399             ewrt             = _mm256_mul_pd(r11,ewtabscale);
400             ewitab           = _mm256_cvttpd_epi32(ewrt);
401             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
402             ewitab           = _mm_slli_epi32(ewitab,2);
403             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
404             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
405             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
406             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
407             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
408             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
409             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
410             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
411             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
412
413             /* Update potential sum for this i atom from the interaction with this j atom. */
414             velecsum         = _mm256_add_pd(velecsum,velec);
415
416             fscal            = felec;
417
418             /* Calculate temporary vectorial force */
419             tx               = _mm256_mul_pd(fscal,dx11);
420             ty               = _mm256_mul_pd(fscal,dy11);
421             tz               = _mm256_mul_pd(fscal,dz11);
422
423             /* Update vectorial force */
424             fix1             = _mm256_add_pd(fix1,tx);
425             fiy1             = _mm256_add_pd(fiy1,ty);
426             fiz1             = _mm256_add_pd(fiz1,tz);
427
428             fjx1             = _mm256_add_pd(fjx1,tx);
429             fjy1             = _mm256_add_pd(fjy1,ty);
430             fjz1             = _mm256_add_pd(fjz1,tz);
431
432             /**************************
433              * CALCULATE INTERACTIONS *
434              **************************/
435
436             r12              = _mm256_mul_pd(rsq12,rinv12);
437
438             /* EWALD ELECTROSTATICS */
439
440             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
441             ewrt             = _mm256_mul_pd(r12,ewtabscale);
442             ewitab           = _mm256_cvttpd_epi32(ewrt);
443             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
444             ewitab           = _mm_slli_epi32(ewitab,2);
445             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
446             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
447             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
448             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
449             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
450             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
451             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
452             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
453             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
454
455             /* Update potential sum for this i atom from the interaction with this j atom. */
456             velecsum         = _mm256_add_pd(velecsum,velec);
457
458             fscal            = felec;
459
460             /* Calculate temporary vectorial force */
461             tx               = _mm256_mul_pd(fscal,dx12);
462             ty               = _mm256_mul_pd(fscal,dy12);
463             tz               = _mm256_mul_pd(fscal,dz12);
464
465             /* Update vectorial force */
466             fix1             = _mm256_add_pd(fix1,tx);
467             fiy1             = _mm256_add_pd(fiy1,ty);
468             fiz1             = _mm256_add_pd(fiz1,tz);
469
470             fjx2             = _mm256_add_pd(fjx2,tx);
471             fjy2             = _mm256_add_pd(fjy2,ty);
472             fjz2             = _mm256_add_pd(fjz2,tz);
473
474             /**************************
475              * CALCULATE INTERACTIONS *
476              **************************/
477
478             r13              = _mm256_mul_pd(rsq13,rinv13);
479
480             /* EWALD ELECTROSTATICS */
481
482             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
483             ewrt             = _mm256_mul_pd(r13,ewtabscale);
484             ewitab           = _mm256_cvttpd_epi32(ewrt);
485             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
486             ewitab           = _mm_slli_epi32(ewitab,2);
487             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
488             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
489             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
490             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
491             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
492             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
493             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
494             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
495             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
496
497             /* Update potential sum for this i atom from the interaction with this j atom. */
498             velecsum         = _mm256_add_pd(velecsum,velec);
499
500             fscal            = felec;
501
502             /* Calculate temporary vectorial force */
503             tx               = _mm256_mul_pd(fscal,dx13);
504             ty               = _mm256_mul_pd(fscal,dy13);
505             tz               = _mm256_mul_pd(fscal,dz13);
506
507             /* Update vectorial force */
508             fix1             = _mm256_add_pd(fix1,tx);
509             fiy1             = _mm256_add_pd(fiy1,ty);
510             fiz1             = _mm256_add_pd(fiz1,tz);
511
512             fjx3             = _mm256_add_pd(fjx3,tx);
513             fjy3             = _mm256_add_pd(fjy3,ty);
514             fjz3             = _mm256_add_pd(fjz3,tz);
515
516             /**************************
517              * CALCULATE INTERACTIONS *
518              **************************/
519
520             r21              = _mm256_mul_pd(rsq21,rinv21);
521
522             /* EWALD ELECTROSTATICS */
523
524             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
525             ewrt             = _mm256_mul_pd(r21,ewtabscale);
526             ewitab           = _mm256_cvttpd_epi32(ewrt);
527             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
528             ewitab           = _mm_slli_epi32(ewitab,2);
529             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
530             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
531             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
532             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
533             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
534             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
535             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
536             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
537             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
538
539             /* Update potential sum for this i atom from the interaction with this j atom. */
540             velecsum         = _mm256_add_pd(velecsum,velec);
541
542             fscal            = felec;
543
544             /* Calculate temporary vectorial force */
545             tx               = _mm256_mul_pd(fscal,dx21);
546             ty               = _mm256_mul_pd(fscal,dy21);
547             tz               = _mm256_mul_pd(fscal,dz21);
548
549             /* Update vectorial force */
550             fix2             = _mm256_add_pd(fix2,tx);
551             fiy2             = _mm256_add_pd(fiy2,ty);
552             fiz2             = _mm256_add_pd(fiz2,tz);
553
554             fjx1             = _mm256_add_pd(fjx1,tx);
555             fjy1             = _mm256_add_pd(fjy1,ty);
556             fjz1             = _mm256_add_pd(fjz1,tz);
557
558             /**************************
559              * CALCULATE INTERACTIONS *
560              **************************/
561
562             r22              = _mm256_mul_pd(rsq22,rinv22);
563
564             /* EWALD ELECTROSTATICS */
565
566             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
567             ewrt             = _mm256_mul_pd(r22,ewtabscale);
568             ewitab           = _mm256_cvttpd_epi32(ewrt);
569             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
570             ewitab           = _mm_slli_epi32(ewitab,2);
571             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
572             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
573             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
574             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
575             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
576             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
577             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
578             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
579             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
580
581             /* Update potential sum for this i atom from the interaction with this j atom. */
582             velecsum         = _mm256_add_pd(velecsum,velec);
583
584             fscal            = felec;
585
586             /* Calculate temporary vectorial force */
587             tx               = _mm256_mul_pd(fscal,dx22);
588             ty               = _mm256_mul_pd(fscal,dy22);
589             tz               = _mm256_mul_pd(fscal,dz22);
590
591             /* Update vectorial force */
592             fix2             = _mm256_add_pd(fix2,tx);
593             fiy2             = _mm256_add_pd(fiy2,ty);
594             fiz2             = _mm256_add_pd(fiz2,tz);
595
596             fjx2             = _mm256_add_pd(fjx2,tx);
597             fjy2             = _mm256_add_pd(fjy2,ty);
598             fjz2             = _mm256_add_pd(fjz2,tz);
599
600             /**************************
601              * CALCULATE INTERACTIONS *
602              **************************/
603
604             r23              = _mm256_mul_pd(rsq23,rinv23);
605
606             /* EWALD ELECTROSTATICS */
607
608             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
609             ewrt             = _mm256_mul_pd(r23,ewtabscale);
610             ewitab           = _mm256_cvttpd_epi32(ewrt);
611             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
612             ewitab           = _mm_slli_epi32(ewitab,2);
613             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
614             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
615             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
616             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
617             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
618             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
619             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
620             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
621             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
622
623             /* Update potential sum for this i atom from the interaction with this j atom. */
624             velecsum         = _mm256_add_pd(velecsum,velec);
625
626             fscal            = felec;
627
628             /* Calculate temporary vectorial force */
629             tx               = _mm256_mul_pd(fscal,dx23);
630             ty               = _mm256_mul_pd(fscal,dy23);
631             tz               = _mm256_mul_pd(fscal,dz23);
632
633             /* Update vectorial force */
634             fix2             = _mm256_add_pd(fix2,tx);
635             fiy2             = _mm256_add_pd(fiy2,ty);
636             fiz2             = _mm256_add_pd(fiz2,tz);
637
638             fjx3             = _mm256_add_pd(fjx3,tx);
639             fjy3             = _mm256_add_pd(fjy3,ty);
640             fjz3             = _mm256_add_pd(fjz3,tz);
641
642             /**************************
643              * CALCULATE INTERACTIONS *
644              **************************/
645
646             r31              = _mm256_mul_pd(rsq31,rinv31);
647
648             /* EWALD ELECTROSTATICS */
649
650             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
651             ewrt             = _mm256_mul_pd(r31,ewtabscale);
652             ewitab           = _mm256_cvttpd_epi32(ewrt);
653             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
654             ewitab           = _mm_slli_epi32(ewitab,2);
655             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
656             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
657             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
658             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
659             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
660             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
661             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
662             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
663             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
664
665             /* Update potential sum for this i atom from the interaction with this j atom. */
666             velecsum         = _mm256_add_pd(velecsum,velec);
667
668             fscal            = felec;
669
670             /* Calculate temporary vectorial force */
671             tx               = _mm256_mul_pd(fscal,dx31);
672             ty               = _mm256_mul_pd(fscal,dy31);
673             tz               = _mm256_mul_pd(fscal,dz31);
674
675             /* Update vectorial force */
676             fix3             = _mm256_add_pd(fix3,tx);
677             fiy3             = _mm256_add_pd(fiy3,ty);
678             fiz3             = _mm256_add_pd(fiz3,tz);
679
680             fjx1             = _mm256_add_pd(fjx1,tx);
681             fjy1             = _mm256_add_pd(fjy1,ty);
682             fjz1             = _mm256_add_pd(fjz1,tz);
683
684             /**************************
685              * CALCULATE INTERACTIONS *
686              **************************/
687
688             r32              = _mm256_mul_pd(rsq32,rinv32);
689
690             /* EWALD ELECTROSTATICS */
691
692             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
693             ewrt             = _mm256_mul_pd(r32,ewtabscale);
694             ewitab           = _mm256_cvttpd_epi32(ewrt);
695             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
696             ewitab           = _mm_slli_epi32(ewitab,2);
697             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
698             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
699             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
700             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
701             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
702             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
703             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
704             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
705             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
706
707             /* Update potential sum for this i atom from the interaction with this j atom. */
708             velecsum         = _mm256_add_pd(velecsum,velec);
709
710             fscal            = felec;
711
712             /* Calculate temporary vectorial force */
713             tx               = _mm256_mul_pd(fscal,dx32);
714             ty               = _mm256_mul_pd(fscal,dy32);
715             tz               = _mm256_mul_pd(fscal,dz32);
716
717             /* Update vectorial force */
718             fix3             = _mm256_add_pd(fix3,tx);
719             fiy3             = _mm256_add_pd(fiy3,ty);
720             fiz3             = _mm256_add_pd(fiz3,tz);
721
722             fjx2             = _mm256_add_pd(fjx2,tx);
723             fjy2             = _mm256_add_pd(fjy2,ty);
724             fjz2             = _mm256_add_pd(fjz2,tz);
725
726             /**************************
727              * CALCULATE INTERACTIONS *
728              **************************/
729
730             r33              = _mm256_mul_pd(rsq33,rinv33);
731
732             /* EWALD ELECTROSTATICS */
733
734             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
735             ewrt             = _mm256_mul_pd(r33,ewtabscale);
736             ewitab           = _mm256_cvttpd_epi32(ewrt);
737             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
738             ewitab           = _mm_slli_epi32(ewitab,2);
739             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
740             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
741             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
742             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
743             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
744             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
745             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
746             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
747             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
748
749             /* Update potential sum for this i atom from the interaction with this j atom. */
750             velecsum         = _mm256_add_pd(velecsum,velec);
751
752             fscal            = felec;
753
754             /* Calculate temporary vectorial force */
755             tx               = _mm256_mul_pd(fscal,dx33);
756             ty               = _mm256_mul_pd(fscal,dy33);
757             tz               = _mm256_mul_pd(fscal,dz33);
758
759             /* Update vectorial force */
760             fix3             = _mm256_add_pd(fix3,tx);
761             fiy3             = _mm256_add_pd(fiy3,ty);
762             fiz3             = _mm256_add_pd(fiz3,tz);
763
764             fjx3             = _mm256_add_pd(fjx3,tx);
765             fjy3             = _mm256_add_pd(fjy3,ty);
766             fjz3             = _mm256_add_pd(fjz3,tz);
767
768             fjptrA             = f+j_coord_offsetA;
769             fjptrB             = f+j_coord_offsetB;
770             fjptrC             = f+j_coord_offsetC;
771             fjptrD             = f+j_coord_offsetD;
772
773             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
774                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
775                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
776
777             /* Inner loop uses 426 flops */
778         }
779
780         if(jidx<j_index_end)
781         {
782
783             /* Get j neighbor index, and coordinate index */
784             jnrlistA         = jjnr[jidx];
785             jnrlistB         = jjnr[jidx+1];
786             jnrlistC         = jjnr[jidx+2];
787             jnrlistD         = jjnr[jidx+3];
788             /* Sign of each element will be negative for non-real atoms.
789              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
790              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
791              */
792             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
793
794             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
795             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
796             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
797
798             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
799             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
800             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
801             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
802             j_coord_offsetA  = DIM*jnrA;
803             j_coord_offsetB  = DIM*jnrB;
804             j_coord_offsetC  = DIM*jnrC;
805             j_coord_offsetD  = DIM*jnrD;
806
807             /* load j atom coordinates */
808             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
809                                                  x+j_coord_offsetC,x+j_coord_offsetD,
810                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
811                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
812
813             /* Calculate displacement vector */
814             dx00             = _mm256_sub_pd(ix0,jx0);
815             dy00             = _mm256_sub_pd(iy0,jy0);
816             dz00             = _mm256_sub_pd(iz0,jz0);
817             dx11             = _mm256_sub_pd(ix1,jx1);
818             dy11             = _mm256_sub_pd(iy1,jy1);
819             dz11             = _mm256_sub_pd(iz1,jz1);
820             dx12             = _mm256_sub_pd(ix1,jx2);
821             dy12             = _mm256_sub_pd(iy1,jy2);
822             dz12             = _mm256_sub_pd(iz1,jz2);
823             dx13             = _mm256_sub_pd(ix1,jx3);
824             dy13             = _mm256_sub_pd(iy1,jy3);
825             dz13             = _mm256_sub_pd(iz1,jz3);
826             dx21             = _mm256_sub_pd(ix2,jx1);
827             dy21             = _mm256_sub_pd(iy2,jy1);
828             dz21             = _mm256_sub_pd(iz2,jz1);
829             dx22             = _mm256_sub_pd(ix2,jx2);
830             dy22             = _mm256_sub_pd(iy2,jy2);
831             dz22             = _mm256_sub_pd(iz2,jz2);
832             dx23             = _mm256_sub_pd(ix2,jx3);
833             dy23             = _mm256_sub_pd(iy2,jy3);
834             dz23             = _mm256_sub_pd(iz2,jz3);
835             dx31             = _mm256_sub_pd(ix3,jx1);
836             dy31             = _mm256_sub_pd(iy3,jy1);
837             dz31             = _mm256_sub_pd(iz3,jz1);
838             dx32             = _mm256_sub_pd(ix3,jx2);
839             dy32             = _mm256_sub_pd(iy3,jy2);
840             dz32             = _mm256_sub_pd(iz3,jz2);
841             dx33             = _mm256_sub_pd(ix3,jx3);
842             dy33             = _mm256_sub_pd(iy3,jy3);
843             dz33             = _mm256_sub_pd(iz3,jz3);
844
845             /* Calculate squared distance and things based on it */
846             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
847             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
848             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
849             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
850             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
851             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
852             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
853             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
854             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
855             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
856
857             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
858             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
859             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
860             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
861             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
862             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
863             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
864             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
865             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
866             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
867
868             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
869             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
870             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
871             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
872             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
873             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
874             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
875             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
876             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
877             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
878
879             fjx0             = _mm256_setzero_pd();
880             fjy0             = _mm256_setzero_pd();
881             fjz0             = _mm256_setzero_pd();
882             fjx1             = _mm256_setzero_pd();
883             fjy1             = _mm256_setzero_pd();
884             fjz1             = _mm256_setzero_pd();
885             fjx2             = _mm256_setzero_pd();
886             fjy2             = _mm256_setzero_pd();
887             fjz2             = _mm256_setzero_pd();
888             fjx3             = _mm256_setzero_pd();
889             fjy3             = _mm256_setzero_pd();
890             fjz3             = _mm256_setzero_pd();
891
892             /**************************
893              * CALCULATE INTERACTIONS *
894              **************************/
895
896             r00              = _mm256_mul_pd(rsq00,rinv00);
897             r00              = _mm256_andnot_pd(dummy_mask,r00);
898
899             /* Analytical LJ-PME */
900             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
901             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
902             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
903             exponent         = gmx_simd_exp_d(ewcljrsq);
904             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
905             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
906             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
907             vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
908             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
909             vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
910             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
911             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
912
913             /* Update potential sum for this i atom from the interaction with this j atom. */
914             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
915             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
916
917             fscal            = fvdw;
918
919             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
920
921             /* Calculate temporary vectorial force */
922             tx               = _mm256_mul_pd(fscal,dx00);
923             ty               = _mm256_mul_pd(fscal,dy00);
924             tz               = _mm256_mul_pd(fscal,dz00);
925
926             /* Update vectorial force */
927             fix0             = _mm256_add_pd(fix0,tx);
928             fiy0             = _mm256_add_pd(fiy0,ty);
929             fiz0             = _mm256_add_pd(fiz0,tz);
930
931             fjx0             = _mm256_add_pd(fjx0,tx);
932             fjy0             = _mm256_add_pd(fjy0,ty);
933             fjz0             = _mm256_add_pd(fjz0,tz);
934
935             /**************************
936              * CALCULATE INTERACTIONS *
937              **************************/
938
939             r11              = _mm256_mul_pd(rsq11,rinv11);
940             r11              = _mm256_andnot_pd(dummy_mask,r11);
941
942             /* EWALD ELECTROSTATICS */
943
944             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
945             ewrt             = _mm256_mul_pd(r11,ewtabscale);
946             ewitab           = _mm256_cvttpd_epi32(ewrt);
947             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
948             ewitab           = _mm_slli_epi32(ewitab,2);
949             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
950             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
951             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
952             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
953             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
954             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
955             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
956             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
957             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
958
959             /* Update potential sum for this i atom from the interaction with this j atom. */
960             velec            = _mm256_andnot_pd(dummy_mask,velec);
961             velecsum         = _mm256_add_pd(velecsum,velec);
962
963             fscal            = felec;
964
965             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
966
967             /* Calculate temporary vectorial force */
968             tx               = _mm256_mul_pd(fscal,dx11);
969             ty               = _mm256_mul_pd(fscal,dy11);
970             tz               = _mm256_mul_pd(fscal,dz11);
971
972             /* Update vectorial force */
973             fix1             = _mm256_add_pd(fix1,tx);
974             fiy1             = _mm256_add_pd(fiy1,ty);
975             fiz1             = _mm256_add_pd(fiz1,tz);
976
977             fjx1             = _mm256_add_pd(fjx1,tx);
978             fjy1             = _mm256_add_pd(fjy1,ty);
979             fjz1             = _mm256_add_pd(fjz1,tz);
980
981             /**************************
982              * CALCULATE INTERACTIONS *
983              **************************/
984
985             r12              = _mm256_mul_pd(rsq12,rinv12);
986             r12              = _mm256_andnot_pd(dummy_mask,r12);
987
988             /* EWALD ELECTROSTATICS */
989
990             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
991             ewrt             = _mm256_mul_pd(r12,ewtabscale);
992             ewitab           = _mm256_cvttpd_epi32(ewrt);
993             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
994             ewitab           = _mm_slli_epi32(ewitab,2);
995             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
996             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
997             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
998             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
999             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1000             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1001             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1002             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1003             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1004
1005             /* Update potential sum for this i atom from the interaction with this j atom. */
1006             velec            = _mm256_andnot_pd(dummy_mask,velec);
1007             velecsum         = _mm256_add_pd(velecsum,velec);
1008
1009             fscal            = felec;
1010
1011             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1012
1013             /* Calculate temporary vectorial force */
1014             tx               = _mm256_mul_pd(fscal,dx12);
1015             ty               = _mm256_mul_pd(fscal,dy12);
1016             tz               = _mm256_mul_pd(fscal,dz12);
1017
1018             /* Update vectorial force */
1019             fix1             = _mm256_add_pd(fix1,tx);
1020             fiy1             = _mm256_add_pd(fiy1,ty);
1021             fiz1             = _mm256_add_pd(fiz1,tz);
1022
1023             fjx2             = _mm256_add_pd(fjx2,tx);
1024             fjy2             = _mm256_add_pd(fjy2,ty);
1025             fjz2             = _mm256_add_pd(fjz2,tz);
1026
1027             /**************************
1028              * CALCULATE INTERACTIONS *
1029              **************************/
1030
1031             r13              = _mm256_mul_pd(rsq13,rinv13);
1032             r13              = _mm256_andnot_pd(dummy_mask,r13);
1033
1034             /* EWALD ELECTROSTATICS */
1035
1036             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1037             ewrt             = _mm256_mul_pd(r13,ewtabscale);
1038             ewitab           = _mm256_cvttpd_epi32(ewrt);
1039             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1040             ewitab           = _mm_slli_epi32(ewitab,2);
1041             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1042             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1043             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1044             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1045             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1046             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1047             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1048             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
1049             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1050
1051             /* Update potential sum for this i atom from the interaction with this j atom. */
1052             velec            = _mm256_andnot_pd(dummy_mask,velec);
1053             velecsum         = _mm256_add_pd(velecsum,velec);
1054
1055             fscal            = felec;
1056
1057             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1058
1059             /* Calculate temporary vectorial force */
1060             tx               = _mm256_mul_pd(fscal,dx13);
1061             ty               = _mm256_mul_pd(fscal,dy13);
1062             tz               = _mm256_mul_pd(fscal,dz13);
1063
1064             /* Update vectorial force */
1065             fix1             = _mm256_add_pd(fix1,tx);
1066             fiy1             = _mm256_add_pd(fiy1,ty);
1067             fiz1             = _mm256_add_pd(fiz1,tz);
1068
1069             fjx3             = _mm256_add_pd(fjx3,tx);
1070             fjy3             = _mm256_add_pd(fjy3,ty);
1071             fjz3             = _mm256_add_pd(fjz3,tz);
1072
1073             /**************************
1074              * CALCULATE INTERACTIONS *
1075              **************************/
1076
1077             r21              = _mm256_mul_pd(rsq21,rinv21);
1078             r21              = _mm256_andnot_pd(dummy_mask,r21);
1079
1080             /* EWALD ELECTROSTATICS */
1081
1082             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1083             ewrt             = _mm256_mul_pd(r21,ewtabscale);
1084             ewitab           = _mm256_cvttpd_epi32(ewrt);
1085             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1086             ewitab           = _mm_slli_epi32(ewitab,2);
1087             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1088             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1089             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1090             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1091             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1092             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1093             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1094             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1095             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1096
1097             /* Update potential sum for this i atom from the interaction with this j atom. */
1098             velec            = _mm256_andnot_pd(dummy_mask,velec);
1099             velecsum         = _mm256_add_pd(velecsum,velec);
1100
1101             fscal            = felec;
1102
1103             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1104
1105             /* Calculate temporary vectorial force */
1106             tx               = _mm256_mul_pd(fscal,dx21);
1107             ty               = _mm256_mul_pd(fscal,dy21);
1108             tz               = _mm256_mul_pd(fscal,dz21);
1109
1110             /* Update vectorial force */
1111             fix2             = _mm256_add_pd(fix2,tx);
1112             fiy2             = _mm256_add_pd(fiy2,ty);
1113             fiz2             = _mm256_add_pd(fiz2,tz);
1114
1115             fjx1             = _mm256_add_pd(fjx1,tx);
1116             fjy1             = _mm256_add_pd(fjy1,ty);
1117             fjz1             = _mm256_add_pd(fjz1,tz);
1118
1119             /**************************
1120              * CALCULATE INTERACTIONS *
1121              **************************/
1122
1123             r22              = _mm256_mul_pd(rsq22,rinv22);
1124             r22              = _mm256_andnot_pd(dummy_mask,r22);
1125
1126             /* EWALD ELECTROSTATICS */
1127
1128             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1129             ewrt             = _mm256_mul_pd(r22,ewtabscale);
1130             ewitab           = _mm256_cvttpd_epi32(ewrt);
1131             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1132             ewitab           = _mm_slli_epi32(ewitab,2);
1133             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1134             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1135             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1136             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1137             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1138             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1139             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1140             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1141             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1142
1143             /* Update potential sum for this i atom from the interaction with this j atom. */
1144             velec            = _mm256_andnot_pd(dummy_mask,velec);
1145             velecsum         = _mm256_add_pd(velecsum,velec);
1146
1147             fscal            = felec;
1148
1149             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1150
1151             /* Calculate temporary vectorial force */
1152             tx               = _mm256_mul_pd(fscal,dx22);
1153             ty               = _mm256_mul_pd(fscal,dy22);
1154             tz               = _mm256_mul_pd(fscal,dz22);
1155
1156             /* Update vectorial force */
1157             fix2             = _mm256_add_pd(fix2,tx);
1158             fiy2             = _mm256_add_pd(fiy2,ty);
1159             fiz2             = _mm256_add_pd(fiz2,tz);
1160
1161             fjx2             = _mm256_add_pd(fjx2,tx);
1162             fjy2             = _mm256_add_pd(fjy2,ty);
1163             fjz2             = _mm256_add_pd(fjz2,tz);
1164
1165             /**************************
1166              * CALCULATE INTERACTIONS *
1167              **************************/
1168
1169             r23              = _mm256_mul_pd(rsq23,rinv23);
1170             r23              = _mm256_andnot_pd(dummy_mask,r23);
1171
1172             /* EWALD ELECTROSTATICS */
1173
1174             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1175             ewrt             = _mm256_mul_pd(r23,ewtabscale);
1176             ewitab           = _mm256_cvttpd_epi32(ewrt);
1177             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1178             ewitab           = _mm_slli_epi32(ewitab,2);
1179             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1180             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1181             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1182             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1183             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1184             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1185             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1186             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1187             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1188
1189             /* Update potential sum for this i atom from the interaction with this j atom. */
1190             velec            = _mm256_andnot_pd(dummy_mask,velec);
1191             velecsum         = _mm256_add_pd(velecsum,velec);
1192
1193             fscal            = felec;
1194
1195             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1196
1197             /* Calculate temporary vectorial force */
1198             tx               = _mm256_mul_pd(fscal,dx23);
1199             ty               = _mm256_mul_pd(fscal,dy23);
1200             tz               = _mm256_mul_pd(fscal,dz23);
1201
1202             /* Update vectorial force */
1203             fix2             = _mm256_add_pd(fix2,tx);
1204             fiy2             = _mm256_add_pd(fiy2,ty);
1205             fiz2             = _mm256_add_pd(fiz2,tz);
1206
1207             fjx3             = _mm256_add_pd(fjx3,tx);
1208             fjy3             = _mm256_add_pd(fjy3,ty);
1209             fjz3             = _mm256_add_pd(fjz3,tz);
1210
1211             /**************************
1212              * CALCULATE INTERACTIONS *
1213              **************************/
1214
1215             r31              = _mm256_mul_pd(rsq31,rinv31);
1216             r31              = _mm256_andnot_pd(dummy_mask,r31);
1217
1218             /* EWALD ELECTROSTATICS */
1219
1220             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1221             ewrt             = _mm256_mul_pd(r31,ewtabscale);
1222             ewitab           = _mm256_cvttpd_epi32(ewrt);
1223             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1224             ewitab           = _mm_slli_epi32(ewitab,2);
1225             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1226             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1227             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1228             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1229             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1230             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1231             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1232             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1233             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1234
1235             /* Update potential sum for this i atom from the interaction with this j atom. */
1236             velec            = _mm256_andnot_pd(dummy_mask,velec);
1237             velecsum         = _mm256_add_pd(velecsum,velec);
1238
1239             fscal            = felec;
1240
1241             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1242
1243             /* Calculate temporary vectorial force */
1244             tx               = _mm256_mul_pd(fscal,dx31);
1245             ty               = _mm256_mul_pd(fscal,dy31);
1246             tz               = _mm256_mul_pd(fscal,dz31);
1247
1248             /* Update vectorial force */
1249             fix3             = _mm256_add_pd(fix3,tx);
1250             fiy3             = _mm256_add_pd(fiy3,ty);
1251             fiz3             = _mm256_add_pd(fiz3,tz);
1252
1253             fjx1             = _mm256_add_pd(fjx1,tx);
1254             fjy1             = _mm256_add_pd(fjy1,ty);
1255             fjz1             = _mm256_add_pd(fjz1,tz);
1256
1257             /**************************
1258              * CALCULATE INTERACTIONS *
1259              **************************/
1260
1261             r32              = _mm256_mul_pd(rsq32,rinv32);
1262             r32              = _mm256_andnot_pd(dummy_mask,r32);
1263
1264             /* EWALD ELECTROSTATICS */
1265
1266             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1267             ewrt             = _mm256_mul_pd(r32,ewtabscale);
1268             ewitab           = _mm256_cvttpd_epi32(ewrt);
1269             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1270             ewitab           = _mm_slli_epi32(ewitab,2);
1271             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1272             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1273             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1274             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1275             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1276             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1277             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1278             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1279             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1280
1281             /* Update potential sum for this i atom from the interaction with this j atom. */
1282             velec            = _mm256_andnot_pd(dummy_mask,velec);
1283             velecsum         = _mm256_add_pd(velecsum,velec);
1284
1285             fscal            = felec;
1286
1287             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1288
1289             /* Calculate temporary vectorial force */
1290             tx               = _mm256_mul_pd(fscal,dx32);
1291             ty               = _mm256_mul_pd(fscal,dy32);
1292             tz               = _mm256_mul_pd(fscal,dz32);
1293
1294             /* Update vectorial force */
1295             fix3             = _mm256_add_pd(fix3,tx);
1296             fiy3             = _mm256_add_pd(fiy3,ty);
1297             fiz3             = _mm256_add_pd(fiz3,tz);
1298
1299             fjx2             = _mm256_add_pd(fjx2,tx);
1300             fjy2             = _mm256_add_pd(fjy2,ty);
1301             fjz2             = _mm256_add_pd(fjz2,tz);
1302
1303             /**************************
1304              * CALCULATE INTERACTIONS *
1305              **************************/
1306
1307             r33              = _mm256_mul_pd(rsq33,rinv33);
1308             r33              = _mm256_andnot_pd(dummy_mask,r33);
1309
1310             /* EWALD ELECTROSTATICS */
1311
1312             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1313             ewrt             = _mm256_mul_pd(r33,ewtabscale);
1314             ewitab           = _mm256_cvttpd_epi32(ewrt);
1315             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1316             ewitab           = _mm_slli_epi32(ewitab,2);
1317             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1318             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1319             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1320             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1321             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1322             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1323             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1324             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1325             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1326
1327             /* Update potential sum for this i atom from the interaction with this j atom. */
1328             velec            = _mm256_andnot_pd(dummy_mask,velec);
1329             velecsum         = _mm256_add_pd(velecsum,velec);
1330
1331             fscal            = felec;
1332
1333             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1334
1335             /* Calculate temporary vectorial force */
1336             tx               = _mm256_mul_pd(fscal,dx33);
1337             ty               = _mm256_mul_pd(fscal,dy33);
1338             tz               = _mm256_mul_pd(fscal,dz33);
1339
1340             /* Update vectorial force */
1341             fix3             = _mm256_add_pd(fix3,tx);
1342             fiy3             = _mm256_add_pd(fiy3,ty);
1343             fiz3             = _mm256_add_pd(fiz3,tz);
1344
1345             fjx3             = _mm256_add_pd(fjx3,tx);
1346             fjy3             = _mm256_add_pd(fjy3,ty);
1347             fjz3             = _mm256_add_pd(fjz3,tz);
1348
1349             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1350             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1351             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1352             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1353
1354             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1355                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1356                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1357
1358             /* Inner loop uses 436 flops */
1359         }
1360
1361         /* End of innermost loop */
1362
1363         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1364                                                  f+i_coord_offset,fshift+i_shift_offset);
1365
1366         ggid                        = gid[iidx];
1367         /* Update potential energies */
1368         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1369         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1370
1371         /* Increment number of inner iterations */
1372         inneriter                  += j_index_end - j_index_start;
1373
1374         /* Outer loop uses 26 flops */
1375     }
1376
1377     /* Increment number of outer iterations */
1378     outeriter        += nri;
1379
1380     /* Update outer/inner flops */
1381
1382     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*436);
1383 }
1384 /*
1385  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_double
1386  * Electrostatics interaction: Ewald
1387  * VdW interaction:            LJEwald
1388  * Geometry:                   Water4-Water4
1389  * Calculate force/pot:        Force
1390  */
1391 void
1392 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_double
1393                     (t_nblist                    * gmx_restrict       nlist,
1394                      rvec                        * gmx_restrict          xx,
1395                      rvec                        * gmx_restrict          ff,
1396                      t_forcerec                  * gmx_restrict          fr,
1397                      t_mdatoms                   * gmx_restrict     mdatoms,
1398                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1399                      t_nrnb                      * gmx_restrict        nrnb)
1400 {
1401     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1402      * just 0 for non-waters.
1403      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1404      * jnr indices corresponding to data put in the four positions in the SIMD register.
1405      */
1406     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1407     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1408     int              jnrA,jnrB,jnrC,jnrD;
1409     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1410     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1411     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1412     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1413     real             rcutoff_scalar;
1414     real             *shiftvec,*fshift,*x,*f;
1415     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1416     real             scratch[4*DIM];
1417     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1418     real *           vdwioffsetptr0;
1419     real *           vdwgridioffsetptr0;
1420     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1421     real *           vdwioffsetptr1;
1422     real *           vdwgridioffsetptr1;
1423     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1424     real *           vdwioffsetptr2;
1425     real *           vdwgridioffsetptr2;
1426     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1427     real *           vdwioffsetptr3;
1428     real *           vdwgridioffsetptr3;
1429     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1430     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1431     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1432     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1433     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1434     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1435     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1436     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1437     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1438     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1439     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1440     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1441     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1442     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1443     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1444     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1445     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1446     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1447     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1448     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1449     real             *charge;
1450     int              nvdwtype;
1451     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1452     int              *vdwtype;
1453     real             *vdwparam;
1454     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1455     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1456     __m256d           c6grid_00;
1457     __m256d           c6grid_11;
1458     __m256d           c6grid_12;
1459     __m256d           c6grid_13;
1460     __m256d           c6grid_21;
1461     __m256d           c6grid_22;
1462     __m256d           c6grid_23;
1463     __m256d           c6grid_31;
1464     __m256d           c6grid_32;
1465     __m256d           c6grid_33;
1466     real             *vdwgridparam;
1467     __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1468     __m256d           one_half  = _mm256_set1_pd(0.5);
1469     __m256d           minus_one = _mm256_set1_pd(-1.0);
1470     __m128i          ewitab;
1471     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1472     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1473     real             *ewtab;
1474     __m256d          dummy_mask,cutoff_mask;
1475     __m128           tmpmask0,tmpmask1;
1476     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1477     __m256d          one     = _mm256_set1_pd(1.0);
1478     __m256d          two     = _mm256_set1_pd(2.0);
1479     x                = xx[0];
1480     f                = ff[0];
1481
1482     nri              = nlist->nri;
1483     iinr             = nlist->iinr;
1484     jindex           = nlist->jindex;
1485     jjnr             = nlist->jjnr;
1486     shiftidx         = nlist->shift;
1487     gid              = nlist->gid;
1488     shiftvec         = fr->shift_vec[0];
1489     fshift           = fr->fshift[0];
1490     facel            = _mm256_set1_pd(fr->epsfac);
1491     charge           = mdatoms->chargeA;
1492     nvdwtype         = fr->ntype;
1493     vdwparam         = fr->nbfp;
1494     vdwtype          = mdatoms->typeA;
1495     vdwgridparam     = fr->ljpme_c6grid;
1496     sh_lj_ewald      = _mm256_set1_pd(fr->ic->sh_lj_ewald);
1497     ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
1498     ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
1499
1500     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
1501     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1502     beta2            = _mm256_mul_pd(beta,beta);
1503     beta3            = _mm256_mul_pd(beta,beta2);
1504
1505     ewtab            = fr->ic->tabq_coul_F;
1506     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
1507     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1508
1509     /* Setup water-specific parameters */
1510     inr              = nlist->iinr[0];
1511     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1512     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1513     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1514     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1515     vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
1516
1517     jq1              = _mm256_set1_pd(charge[inr+1]);
1518     jq2              = _mm256_set1_pd(charge[inr+2]);
1519     jq3              = _mm256_set1_pd(charge[inr+3]);
1520     vdwjidx0A        = 2*vdwtype[inr+0];
1521     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1522     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1523     c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
1524     qq11             = _mm256_mul_pd(iq1,jq1);
1525     qq12             = _mm256_mul_pd(iq1,jq2);
1526     qq13             = _mm256_mul_pd(iq1,jq3);
1527     qq21             = _mm256_mul_pd(iq2,jq1);
1528     qq22             = _mm256_mul_pd(iq2,jq2);
1529     qq23             = _mm256_mul_pd(iq2,jq3);
1530     qq31             = _mm256_mul_pd(iq3,jq1);
1531     qq32             = _mm256_mul_pd(iq3,jq2);
1532     qq33             = _mm256_mul_pd(iq3,jq3);
1533
1534     /* Avoid stupid compiler warnings */
1535     jnrA = jnrB = jnrC = jnrD = 0;
1536     j_coord_offsetA = 0;
1537     j_coord_offsetB = 0;
1538     j_coord_offsetC = 0;
1539     j_coord_offsetD = 0;
1540
1541     outeriter        = 0;
1542     inneriter        = 0;
1543
1544     for(iidx=0;iidx<4*DIM;iidx++)
1545     {
1546         scratch[iidx] = 0.0;
1547     }
1548
1549     /* Start outer loop over neighborlists */
1550     for(iidx=0; iidx<nri; iidx++)
1551     {
1552         /* Load shift vector for this list */
1553         i_shift_offset   = DIM*shiftidx[iidx];
1554
1555         /* Load limits for loop over neighbors */
1556         j_index_start    = jindex[iidx];
1557         j_index_end      = jindex[iidx+1];
1558
1559         /* Get outer coordinate index */
1560         inr              = iinr[iidx];
1561         i_coord_offset   = DIM*inr;
1562
1563         /* Load i particle coords and add shift vector */
1564         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1565                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1566
1567         fix0             = _mm256_setzero_pd();
1568         fiy0             = _mm256_setzero_pd();
1569         fiz0             = _mm256_setzero_pd();
1570         fix1             = _mm256_setzero_pd();
1571         fiy1             = _mm256_setzero_pd();
1572         fiz1             = _mm256_setzero_pd();
1573         fix2             = _mm256_setzero_pd();
1574         fiy2             = _mm256_setzero_pd();
1575         fiz2             = _mm256_setzero_pd();
1576         fix3             = _mm256_setzero_pd();
1577         fiy3             = _mm256_setzero_pd();
1578         fiz3             = _mm256_setzero_pd();
1579
1580         /* Start inner kernel loop */
1581         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1582         {
1583
1584             /* Get j neighbor index, and coordinate index */
1585             jnrA             = jjnr[jidx];
1586             jnrB             = jjnr[jidx+1];
1587             jnrC             = jjnr[jidx+2];
1588             jnrD             = jjnr[jidx+3];
1589             j_coord_offsetA  = DIM*jnrA;
1590             j_coord_offsetB  = DIM*jnrB;
1591             j_coord_offsetC  = DIM*jnrC;
1592             j_coord_offsetD  = DIM*jnrD;
1593
1594             /* load j atom coordinates */
1595             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1596                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1597                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1598                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1599
1600             /* Calculate displacement vector */
1601             dx00             = _mm256_sub_pd(ix0,jx0);
1602             dy00             = _mm256_sub_pd(iy0,jy0);
1603             dz00             = _mm256_sub_pd(iz0,jz0);
1604             dx11             = _mm256_sub_pd(ix1,jx1);
1605             dy11             = _mm256_sub_pd(iy1,jy1);
1606             dz11             = _mm256_sub_pd(iz1,jz1);
1607             dx12             = _mm256_sub_pd(ix1,jx2);
1608             dy12             = _mm256_sub_pd(iy1,jy2);
1609             dz12             = _mm256_sub_pd(iz1,jz2);
1610             dx13             = _mm256_sub_pd(ix1,jx3);
1611             dy13             = _mm256_sub_pd(iy1,jy3);
1612             dz13             = _mm256_sub_pd(iz1,jz3);
1613             dx21             = _mm256_sub_pd(ix2,jx1);
1614             dy21             = _mm256_sub_pd(iy2,jy1);
1615             dz21             = _mm256_sub_pd(iz2,jz1);
1616             dx22             = _mm256_sub_pd(ix2,jx2);
1617             dy22             = _mm256_sub_pd(iy2,jy2);
1618             dz22             = _mm256_sub_pd(iz2,jz2);
1619             dx23             = _mm256_sub_pd(ix2,jx3);
1620             dy23             = _mm256_sub_pd(iy2,jy3);
1621             dz23             = _mm256_sub_pd(iz2,jz3);
1622             dx31             = _mm256_sub_pd(ix3,jx1);
1623             dy31             = _mm256_sub_pd(iy3,jy1);
1624             dz31             = _mm256_sub_pd(iz3,jz1);
1625             dx32             = _mm256_sub_pd(ix3,jx2);
1626             dy32             = _mm256_sub_pd(iy3,jy2);
1627             dz32             = _mm256_sub_pd(iz3,jz2);
1628             dx33             = _mm256_sub_pd(ix3,jx3);
1629             dy33             = _mm256_sub_pd(iy3,jy3);
1630             dz33             = _mm256_sub_pd(iz3,jz3);
1631
1632             /* Calculate squared distance and things based on it */
1633             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1634             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1635             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1636             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1637             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1638             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1639             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1640             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1641             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1642             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1643
1644             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1645             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1646             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1647             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
1648             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1649             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1650             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
1651             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
1652             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
1653             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
1654
1655             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1656             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1657             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1658             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1659             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1660             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1661             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1662             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1663             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1664             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
1665
1666             fjx0             = _mm256_setzero_pd();
1667             fjy0             = _mm256_setzero_pd();
1668             fjz0             = _mm256_setzero_pd();
1669             fjx1             = _mm256_setzero_pd();
1670             fjy1             = _mm256_setzero_pd();
1671             fjz1             = _mm256_setzero_pd();
1672             fjx2             = _mm256_setzero_pd();
1673             fjy2             = _mm256_setzero_pd();
1674             fjz2             = _mm256_setzero_pd();
1675             fjx3             = _mm256_setzero_pd();
1676             fjy3             = _mm256_setzero_pd();
1677             fjz3             = _mm256_setzero_pd();
1678
1679             /**************************
1680              * CALCULATE INTERACTIONS *
1681              **************************/
1682
1683             r00              = _mm256_mul_pd(rsq00,rinv00);
1684
1685             /* Analytical LJ-PME */
1686             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1687             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
1688             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
1689             exponent         = gmx_simd_exp_d(ewcljrsq);
1690             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1691             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1692             /* f6A = 6 * C6grid * (1 - poly) */
1693             f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
1694             /* f6B = C6grid * exponent * beta^6 */
1695             f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
1696             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1697             fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1698
1699             fscal            = fvdw;
1700
1701             /* Calculate temporary vectorial force */
1702             tx               = _mm256_mul_pd(fscal,dx00);
1703             ty               = _mm256_mul_pd(fscal,dy00);
1704             tz               = _mm256_mul_pd(fscal,dz00);
1705
1706             /* Update vectorial force */
1707             fix0             = _mm256_add_pd(fix0,tx);
1708             fiy0             = _mm256_add_pd(fiy0,ty);
1709             fiz0             = _mm256_add_pd(fiz0,tz);
1710
1711             fjx0             = _mm256_add_pd(fjx0,tx);
1712             fjy0             = _mm256_add_pd(fjy0,ty);
1713             fjz0             = _mm256_add_pd(fjz0,tz);
1714
1715             /**************************
1716              * CALCULATE INTERACTIONS *
1717              **************************/
1718
1719             r11              = _mm256_mul_pd(rsq11,rinv11);
1720
1721             /* EWALD ELECTROSTATICS */
1722
1723             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1724             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1725             ewitab           = _mm256_cvttpd_epi32(ewrt);
1726             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1727             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1728                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1729                                             &ewtabF,&ewtabFn);
1730             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1731             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1732
1733             fscal            = felec;
1734
1735             /* Calculate temporary vectorial force */
1736             tx               = _mm256_mul_pd(fscal,dx11);
1737             ty               = _mm256_mul_pd(fscal,dy11);
1738             tz               = _mm256_mul_pd(fscal,dz11);
1739
1740             /* Update vectorial force */
1741             fix1             = _mm256_add_pd(fix1,tx);
1742             fiy1             = _mm256_add_pd(fiy1,ty);
1743             fiz1             = _mm256_add_pd(fiz1,tz);
1744
1745             fjx1             = _mm256_add_pd(fjx1,tx);
1746             fjy1             = _mm256_add_pd(fjy1,ty);
1747             fjz1             = _mm256_add_pd(fjz1,tz);
1748
1749             /**************************
1750              * CALCULATE INTERACTIONS *
1751              **************************/
1752
1753             r12              = _mm256_mul_pd(rsq12,rinv12);
1754
1755             /* EWALD ELECTROSTATICS */
1756
1757             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1758             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1759             ewitab           = _mm256_cvttpd_epi32(ewrt);
1760             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1761             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1762                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1763                                             &ewtabF,&ewtabFn);
1764             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1765             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1766
1767             fscal            = felec;
1768
1769             /* Calculate temporary vectorial force */
1770             tx               = _mm256_mul_pd(fscal,dx12);
1771             ty               = _mm256_mul_pd(fscal,dy12);
1772             tz               = _mm256_mul_pd(fscal,dz12);
1773
1774             /* Update vectorial force */
1775             fix1             = _mm256_add_pd(fix1,tx);
1776             fiy1             = _mm256_add_pd(fiy1,ty);
1777             fiz1             = _mm256_add_pd(fiz1,tz);
1778
1779             fjx2             = _mm256_add_pd(fjx2,tx);
1780             fjy2             = _mm256_add_pd(fjy2,ty);
1781             fjz2             = _mm256_add_pd(fjz2,tz);
1782
1783             /**************************
1784              * CALCULATE INTERACTIONS *
1785              **************************/
1786
1787             r13              = _mm256_mul_pd(rsq13,rinv13);
1788
1789             /* EWALD ELECTROSTATICS */
1790
1791             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1792             ewrt             = _mm256_mul_pd(r13,ewtabscale);
1793             ewitab           = _mm256_cvttpd_epi32(ewrt);
1794             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1795             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1796                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1797                                             &ewtabF,&ewtabFn);
1798             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1799             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1800
1801             fscal            = felec;
1802
1803             /* Calculate temporary vectorial force */
1804             tx               = _mm256_mul_pd(fscal,dx13);
1805             ty               = _mm256_mul_pd(fscal,dy13);
1806             tz               = _mm256_mul_pd(fscal,dz13);
1807
1808             /* Update vectorial force */
1809             fix1             = _mm256_add_pd(fix1,tx);
1810             fiy1             = _mm256_add_pd(fiy1,ty);
1811             fiz1             = _mm256_add_pd(fiz1,tz);
1812
1813             fjx3             = _mm256_add_pd(fjx3,tx);
1814             fjy3             = _mm256_add_pd(fjy3,ty);
1815             fjz3             = _mm256_add_pd(fjz3,tz);
1816
1817             /**************************
1818              * CALCULATE INTERACTIONS *
1819              **************************/
1820
1821             r21              = _mm256_mul_pd(rsq21,rinv21);
1822
1823             /* EWALD ELECTROSTATICS */
1824
1825             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1826             ewrt             = _mm256_mul_pd(r21,ewtabscale);
1827             ewitab           = _mm256_cvttpd_epi32(ewrt);
1828             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1829             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1830                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1831                                             &ewtabF,&ewtabFn);
1832             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1833             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1834
1835             fscal            = felec;
1836
1837             /* Calculate temporary vectorial force */
1838             tx               = _mm256_mul_pd(fscal,dx21);
1839             ty               = _mm256_mul_pd(fscal,dy21);
1840             tz               = _mm256_mul_pd(fscal,dz21);
1841
1842             /* Update vectorial force */
1843             fix2             = _mm256_add_pd(fix2,tx);
1844             fiy2             = _mm256_add_pd(fiy2,ty);
1845             fiz2             = _mm256_add_pd(fiz2,tz);
1846
1847             fjx1             = _mm256_add_pd(fjx1,tx);
1848             fjy1             = _mm256_add_pd(fjy1,ty);
1849             fjz1             = _mm256_add_pd(fjz1,tz);
1850
1851             /**************************
1852              * CALCULATE INTERACTIONS *
1853              **************************/
1854
1855             r22              = _mm256_mul_pd(rsq22,rinv22);
1856
1857             /* EWALD ELECTROSTATICS */
1858
1859             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1860             ewrt             = _mm256_mul_pd(r22,ewtabscale);
1861             ewitab           = _mm256_cvttpd_epi32(ewrt);
1862             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1863             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1864                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1865                                             &ewtabF,&ewtabFn);
1866             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1867             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1868
1869             fscal            = felec;
1870
1871             /* Calculate temporary vectorial force */
1872             tx               = _mm256_mul_pd(fscal,dx22);
1873             ty               = _mm256_mul_pd(fscal,dy22);
1874             tz               = _mm256_mul_pd(fscal,dz22);
1875
1876             /* Update vectorial force */
1877             fix2             = _mm256_add_pd(fix2,tx);
1878             fiy2             = _mm256_add_pd(fiy2,ty);
1879             fiz2             = _mm256_add_pd(fiz2,tz);
1880
1881             fjx2             = _mm256_add_pd(fjx2,tx);
1882             fjy2             = _mm256_add_pd(fjy2,ty);
1883             fjz2             = _mm256_add_pd(fjz2,tz);
1884
1885             /**************************
1886              * CALCULATE INTERACTIONS *
1887              **************************/
1888
1889             r23              = _mm256_mul_pd(rsq23,rinv23);
1890
1891             /* EWALD ELECTROSTATICS */
1892
1893             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1894             ewrt             = _mm256_mul_pd(r23,ewtabscale);
1895             ewitab           = _mm256_cvttpd_epi32(ewrt);
1896             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1897             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1898                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1899                                             &ewtabF,&ewtabFn);
1900             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1901             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1902
1903             fscal            = felec;
1904
1905             /* Calculate temporary vectorial force */
1906             tx               = _mm256_mul_pd(fscal,dx23);
1907             ty               = _mm256_mul_pd(fscal,dy23);
1908             tz               = _mm256_mul_pd(fscal,dz23);
1909
1910             /* Update vectorial force */
1911             fix2             = _mm256_add_pd(fix2,tx);
1912             fiy2             = _mm256_add_pd(fiy2,ty);
1913             fiz2             = _mm256_add_pd(fiz2,tz);
1914
1915             fjx3             = _mm256_add_pd(fjx3,tx);
1916             fjy3             = _mm256_add_pd(fjy3,ty);
1917             fjz3             = _mm256_add_pd(fjz3,tz);
1918
1919             /**************************
1920              * CALCULATE INTERACTIONS *
1921              **************************/
1922
1923             r31              = _mm256_mul_pd(rsq31,rinv31);
1924
1925             /* EWALD ELECTROSTATICS */
1926
1927             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1928             ewrt             = _mm256_mul_pd(r31,ewtabscale);
1929             ewitab           = _mm256_cvttpd_epi32(ewrt);
1930             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1931             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1932                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1933                                             &ewtabF,&ewtabFn);
1934             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1935             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1936
1937             fscal            = felec;
1938
1939             /* Calculate temporary vectorial force */
1940             tx               = _mm256_mul_pd(fscal,dx31);
1941             ty               = _mm256_mul_pd(fscal,dy31);
1942             tz               = _mm256_mul_pd(fscal,dz31);
1943
1944             /* Update vectorial force */
1945             fix3             = _mm256_add_pd(fix3,tx);
1946             fiy3             = _mm256_add_pd(fiy3,ty);
1947             fiz3             = _mm256_add_pd(fiz3,tz);
1948
1949             fjx1             = _mm256_add_pd(fjx1,tx);
1950             fjy1             = _mm256_add_pd(fjy1,ty);
1951             fjz1             = _mm256_add_pd(fjz1,tz);
1952
1953             /**************************
1954              * CALCULATE INTERACTIONS *
1955              **************************/
1956
1957             r32              = _mm256_mul_pd(rsq32,rinv32);
1958
1959             /* EWALD ELECTROSTATICS */
1960
1961             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1962             ewrt             = _mm256_mul_pd(r32,ewtabscale);
1963             ewitab           = _mm256_cvttpd_epi32(ewrt);
1964             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1965             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1966                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1967                                             &ewtabF,&ewtabFn);
1968             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1969             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1970
1971             fscal            = felec;
1972
1973             /* Calculate temporary vectorial force */
1974             tx               = _mm256_mul_pd(fscal,dx32);
1975             ty               = _mm256_mul_pd(fscal,dy32);
1976             tz               = _mm256_mul_pd(fscal,dz32);
1977
1978             /* Update vectorial force */
1979             fix3             = _mm256_add_pd(fix3,tx);
1980             fiy3             = _mm256_add_pd(fiy3,ty);
1981             fiz3             = _mm256_add_pd(fiz3,tz);
1982
1983             fjx2             = _mm256_add_pd(fjx2,tx);
1984             fjy2             = _mm256_add_pd(fjy2,ty);
1985             fjz2             = _mm256_add_pd(fjz2,tz);
1986
1987             /**************************
1988              * CALCULATE INTERACTIONS *
1989              **************************/
1990
1991             r33              = _mm256_mul_pd(rsq33,rinv33);
1992
1993             /* EWALD ELECTROSTATICS */
1994
1995             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1996             ewrt             = _mm256_mul_pd(r33,ewtabscale);
1997             ewitab           = _mm256_cvttpd_epi32(ewrt);
1998             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1999             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2000                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2001                                             &ewtabF,&ewtabFn);
2002             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2003             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2004
2005             fscal            = felec;
2006
2007             /* Calculate temporary vectorial force */
2008             tx               = _mm256_mul_pd(fscal,dx33);
2009             ty               = _mm256_mul_pd(fscal,dy33);
2010             tz               = _mm256_mul_pd(fscal,dz33);
2011
2012             /* Update vectorial force */
2013             fix3             = _mm256_add_pd(fix3,tx);
2014             fiy3             = _mm256_add_pd(fiy3,ty);
2015             fiz3             = _mm256_add_pd(fiz3,tz);
2016
2017             fjx3             = _mm256_add_pd(fjx3,tx);
2018             fjy3             = _mm256_add_pd(fjy3,ty);
2019             fjz3             = _mm256_add_pd(fjz3,tz);
2020
2021             fjptrA             = f+j_coord_offsetA;
2022             fjptrB             = f+j_coord_offsetB;
2023             fjptrC             = f+j_coord_offsetC;
2024             fjptrD             = f+j_coord_offsetD;
2025
2026             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2027                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2028                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2029
2030             /* Inner loop uses 373 flops */
2031         }
2032
2033         if(jidx<j_index_end)
2034         {
2035
2036             /* Get j neighbor index, and coordinate index */
2037             jnrlistA         = jjnr[jidx];
2038             jnrlistB         = jjnr[jidx+1];
2039             jnrlistC         = jjnr[jidx+2];
2040             jnrlistD         = jjnr[jidx+3];
2041             /* Sign of each element will be negative for non-real atoms.
2042              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2043              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2044              */
2045             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2046
2047             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2048             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2049             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2050
2051             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2052             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2053             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2054             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2055             j_coord_offsetA  = DIM*jnrA;
2056             j_coord_offsetB  = DIM*jnrB;
2057             j_coord_offsetC  = DIM*jnrC;
2058             j_coord_offsetD  = DIM*jnrD;
2059
2060             /* load j atom coordinates */
2061             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2062                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2063                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2064                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
2065
2066             /* Calculate displacement vector */
2067             dx00             = _mm256_sub_pd(ix0,jx0);
2068             dy00             = _mm256_sub_pd(iy0,jy0);
2069             dz00             = _mm256_sub_pd(iz0,jz0);
2070             dx11             = _mm256_sub_pd(ix1,jx1);
2071             dy11             = _mm256_sub_pd(iy1,jy1);
2072             dz11             = _mm256_sub_pd(iz1,jz1);
2073             dx12             = _mm256_sub_pd(ix1,jx2);
2074             dy12             = _mm256_sub_pd(iy1,jy2);
2075             dz12             = _mm256_sub_pd(iz1,jz2);
2076             dx13             = _mm256_sub_pd(ix1,jx3);
2077             dy13             = _mm256_sub_pd(iy1,jy3);
2078             dz13             = _mm256_sub_pd(iz1,jz3);
2079             dx21             = _mm256_sub_pd(ix2,jx1);
2080             dy21             = _mm256_sub_pd(iy2,jy1);
2081             dz21             = _mm256_sub_pd(iz2,jz1);
2082             dx22             = _mm256_sub_pd(ix2,jx2);
2083             dy22             = _mm256_sub_pd(iy2,jy2);
2084             dz22             = _mm256_sub_pd(iz2,jz2);
2085             dx23             = _mm256_sub_pd(ix2,jx3);
2086             dy23             = _mm256_sub_pd(iy2,jy3);
2087             dz23             = _mm256_sub_pd(iz2,jz3);
2088             dx31             = _mm256_sub_pd(ix3,jx1);
2089             dy31             = _mm256_sub_pd(iy3,jy1);
2090             dz31             = _mm256_sub_pd(iz3,jz1);
2091             dx32             = _mm256_sub_pd(ix3,jx2);
2092             dy32             = _mm256_sub_pd(iy3,jy2);
2093             dz32             = _mm256_sub_pd(iz3,jz2);
2094             dx33             = _mm256_sub_pd(ix3,jx3);
2095             dy33             = _mm256_sub_pd(iy3,jy3);
2096             dz33             = _mm256_sub_pd(iz3,jz3);
2097
2098             /* Calculate squared distance and things based on it */
2099             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2100             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2101             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2102             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2103             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2104             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2105             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2106             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2107             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2108             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2109
2110             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
2111             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
2112             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
2113             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
2114             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
2115             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
2116             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
2117             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
2118             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
2119             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
2120
2121             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
2122             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
2123             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
2124             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
2125             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
2126             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
2127             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
2128             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
2129             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
2130             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
2131
2132             fjx0             = _mm256_setzero_pd();
2133             fjy0             = _mm256_setzero_pd();
2134             fjz0             = _mm256_setzero_pd();
2135             fjx1             = _mm256_setzero_pd();
2136             fjy1             = _mm256_setzero_pd();
2137             fjz1             = _mm256_setzero_pd();
2138             fjx2             = _mm256_setzero_pd();
2139             fjy2             = _mm256_setzero_pd();
2140             fjz2             = _mm256_setzero_pd();
2141             fjx3             = _mm256_setzero_pd();
2142             fjy3             = _mm256_setzero_pd();
2143             fjz3             = _mm256_setzero_pd();
2144
2145             /**************************
2146              * CALCULATE INTERACTIONS *
2147              **************************/
2148
2149             r00              = _mm256_mul_pd(rsq00,rinv00);
2150             r00              = _mm256_andnot_pd(dummy_mask,r00);
2151
2152             /* Analytical LJ-PME */
2153             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2154             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
2155             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
2156             exponent         = gmx_simd_exp_d(ewcljrsq);
2157             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2158             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
2159             /* f6A = 6 * C6grid * (1 - poly) */
2160             f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
2161             /* f6B = C6grid * exponent * beta^6 */
2162             f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
2163             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2164             fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2165
2166             fscal            = fvdw;
2167
2168             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2169
2170             /* Calculate temporary vectorial force */
2171             tx               = _mm256_mul_pd(fscal,dx00);
2172             ty               = _mm256_mul_pd(fscal,dy00);
2173             tz               = _mm256_mul_pd(fscal,dz00);
2174
2175             /* Update vectorial force */
2176             fix0             = _mm256_add_pd(fix0,tx);
2177             fiy0             = _mm256_add_pd(fiy0,ty);
2178             fiz0             = _mm256_add_pd(fiz0,tz);
2179
2180             fjx0             = _mm256_add_pd(fjx0,tx);
2181             fjy0             = _mm256_add_pd(fjy0,ty);
2182             fjz0             = _mm256_add_pd(fjz0,tz);
2183
2184             /**************************
2185              * CALCULATE INTERACTIONS *
2186              **************************/
2187
2188             r11              = _mm256_mul_pd(rsq11,rinv11);
2189             r11              = _mm256_andnot_pd(dummy_mask,r11);
2190
2191             /* EWALD ELECTROSTATICS */
2192
2193             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2194             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2195             ewitab           = _mm256_cvttpd_epi32(ewrt);
2196             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2197             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2198                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2199                                             &ewtabF,&ewtabFn);
2200             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2201             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2202
2203             fscal            = felec;
2204
2205             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2206
2207             /* Calculate temporary vectorial force */
2208             tx               = _mm256_mul_pd(fscal,dx11);
2209             ty               = _mm256_mul_pd(fscal,dy11);
2210             tz               = _mm256_mul_pd(fscal,dz11);
2211
2212             /* Update vectorial force */
2213             fix1             = _mm256_add_pd(fix1,tx);
2214             fiy1             = _mm256_add_pd(fiy1,ty);
2215             fiz1             = _mm256_add_pd(fiz1,tz);
2216
2217             fjx1             = _mm256_add_pd(fjx1,tx);
2218             fjy1             = _mm256_add_pd(fjy1,ty);
2219             fjz1             = _mm256_add_pd(fjz1,tz);
2220
2221             /**************************
2222              * CALCULATE INTERACTIONS *
2223              **************************/
2224
2225             r12              = _mm256_mul_pd(rsq12,rinv12);
2226             r12              = _mm256_andnot_pd(dummy_mask,r12);
2227
2228             /* EWALD ELECTROSTATICS */
2229
2230             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2231             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2232             ewitab           = _mm256_cvttpd_epi32(ewrt);
2233             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2234             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2235                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2236                                             &ewtabF,&ewtabFn);
2237             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2238             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2239
2240             fscal            = felec;
2241
2242             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2243
2244             /* Calculate temporary vectorial force */
2245             tx               = _mm256_mul_pd(fscal,dx12);
2246             ty               = _mm256_mul_pd(fscal,dy12);
2247             tz               = _mm256_mul_pd(fscal,dz12);
2248
2249             /* Update vectorial force */
2250             fix1             = _mm256_add_pd(fix1,tx);
2251             fiy1             = _mm256_add_pd(fiy1,ty);
2252             fiz1             = _mm256_add_pd(fiz1,tz);
2253
2254             fjx2             = _mm256_add_pd(fjx2,tx);
2255             fjy2             = _mm256_add_pd(fjy2,ty);
2256             fjz2             = _mm256_add_pd(fjz2,tz);
2257
2258             /**************************
2259              * CALCULATE INTERACTIONS *
2260              **************************/
2261
2262             r13              = _mm256_mul_pd(rsq13,rinv13);
2263             r13              = _mm256_andnot_pd(dummy_mask,r13);
2264
2265             /* EWALD ELECTROSTATICS */
2266
2267             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2268             ewrt             = _mm256_mul_pd(r13,ewtabscale);
2269             ewitab           = _mm256_cvttpd_epi32(ewrt);
2270             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2271             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2272                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2273                                             &ewtabF,&ewtabFn);
2274             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2275             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2276
2277             fscal            = felec;
2278
2279             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2280
2281             /* Calculate temporary vectorial force */
2282             tx               = _mm256_mul_pd(fscal,dx13);
2283             ty               = _mm256_mul_pd(fscal,dy13);
2284             tz               = _mm256_mul_pd(fscal,dz13);
2285
2286             /* Update vectorial force */
2287             fix1             = _mm256_add_pd(fix1,tx);
2288             fiy1             = _mm256_add_pd(fiy1,ty);
2289             fiz1             = _mm256_add_pd(fiz1,tz);
2290
2291             fjx3             = _mm256_add_pd(fjx3,tx);
2292             fjy3             = _mm256_add_pd(fjy3,ty);
2293             fjz3             = _mm256_add_pd(fjz3,tz);
2294
2295             /**************************
2296              * CALCULATE INTERACTIONS *
2297              **************************/
2298
2299             r21              = _mm256_mul_pd(rsq21,rinv21);
2300             r21              = _mm256_andnot_pd(dummy_mask,r21);
2301
2302             /* EWALD ELECTROSTATICS */
2303
2304             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2305             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2306             ewitab           = _mm256_cvttpd_epi32(ewrt);
2307             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2308             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2309                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2310                                             &ewtabF,&ewtabFn);
2311             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2312             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2313
2314             fscal            = felec;
2315
2316             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2317
2318             /* Calculate temporary vectorial force */
2319             tx               = _mm256_mul_pd(fscal,dx21);
2320             ty               = _mm256_mul_pd(fscal,dy21);
2321             tz               = _mm256_mul_pd(fscal,dz21);
2322
2323             /* Update vectorial force */
2324             fix2             = _mm256_add_pd(fix2,tx);
2325             fiy2             = _mm256_add_pd(fiy2,ty);
2326             fiz2             = _mm256_add_pd(fiz2,tz);
2327
2328             fjx1             = _mm256_add_pd(fjx1,tx);
2329             fjy1             = _mm256_add_pd(fjy1,ty);
2330             fjz1             = _mm256_add_pd(fjz1,tz);
2331
2332             /**************************
2333              * CALCULATE INTERACTIONS *
2334              **************************/
2335
2336             r22              = _mm256_mul_pd(rsq22,rinv22);
2337             r22              = _mm256_andnot_pd(dummy_mask,r22);
2338
2339             /* EWALD ELECTROSTATICS */
2340
2341             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2342             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2343             ewitab           = _mm256_cvttpd_epi32(ewrt);
2344             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2345             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2346                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2347                                             &ewtabF,&ewtabFn);
2348             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2349             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2350
2351             fscal            = felec;
2352
2353             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2354
2355             /* Calculate temporary vectorial force */
2356             tx               = _mm256_mul_pd(fscal,dx22);
2357             ty               = _mm256_mul_pd(fscal,dy22);
2358             tz               = _mm256_mul_pd(fscal,dz22);
2359
2360             /* Update vectorial force */
2361             fix2             = _mm256_add_pd(fix2,tx);
2362             fiy2             = _mm256_add_pd(fiy2,ty);
2363             fiz2             = _mm256_add_pd(fiz2,tz);
2364
2365             fjx2             = _mm256_add_pd(fjx2,tx);
2366             fjy2             = _mm256_add_pd(fjy2,ty);
2367             fjz2             = _mm256_add_pd(fjz2,tz);
2368
2369             /**************************
2370              * CALCULATE INTERACTIONS *
2371              **************************/
2372
2373             r23              = _mm256_mul_pd(rsq23,rinv23);
2374             r23              = _mm256_andnot_pd(dummy_mask,r23);
2375
2376             /* EWALD ELECTROSTATICS */
2377
2378             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2379             ewrt             = _mm256_mul_pd(r23,ewtabscale);
2380             ewitab           = _mm256_cvttpd_epi32(ewrt);
2381             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2382             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2383                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2384                                             &ewtabF,&ewtabFn);
2385             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2386             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2387
2388             fscal            = felec;
2389
2390             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2391
2392             /* Calculate temporary vectorial force */
2393             tx               = _mm256_mul_pd(fscal,dx23);
2394             ty               = _mm256_mul_pd(fscal,dy23);
2395             tz               = _mm256_mul_pd(fscal,dz23);
2396
2397             /* Update vectorial force */
2398             fix2             = _mm256_add_pd(fix2,tx);
2399             fiy2             = _mm256_add_pd(fiy2,ty);
2400             fiz2             = _mm256_add_pd(fiz2,tz);
2401
2402             fjx3             = _mm256_add_pd(fjx3,tx);
2403             fjy3             = _mm256_add_pd(fjy3,ty);
2404             fjz3             = _mm256_add_pd(fjz3,tz);
2405
2406             /**************************
2407              * CALCULATE INTERACTIONS *
2408              **************************/
2409
2410             r31              = _mm256_mul_pd(rsq31,rinv31);
2411             r31              = _mm256_andnot_pd(dummy_mask,r31);
2412
2413             /* EWALD ELECTROSTATICS */
2414
2415             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2416             ewrt             = _mm256_mul_pd(r31,ewtabscale);
2417             ewitab           = _mm256_cvttpd_epi32(ewrt);
2418             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2419             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2420                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2421                                             &ewtabF,&ewtabFn);
2422             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2423             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2424
2425             fscal            = felec;
2426
2427             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2428
2429             /* Calculate temporary vectorial force */
2430             tx               = _mm256_mul_pd(fscal,dx31);
2431             ty               = _mm256_mul_pd(fscal,dy31);
2432             tz               = _mm256_mul_pd(fscal,dz31);
2433
2434             /* Update vectorial force */
2435             fix3             = _mm256_add_pd(fix3,tx);
2436             fiy3             = _mm256_add_pd(fiy3,ty);
2437             fiz3             = _mm256_add_pd(fiz3,tz);
2438
2439             fjx1             = _mm256_add_pd(fjx1,tx);
2440             fjy1             = _mm256_add_pd(fjy1,ty);
2441             fjz1             = _mm256_add_pd(fjz1,tz);
2442
2443             /**************************
2444              * CALCULATE INTERACTIONS *
2445              **************************/
2446
2447             r32              = _mm256_mul_pd(rsq32,rinv32);
2448             r32              = _mm256_andnot_pd(dummy_mask,r32);
2449
2450             /* EWALD ELECTROSTATICS */
2451
2452             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2453             ewrt             = _mm256_mul_pd(r32,ewtabscale);
2454             ewitab           = _mm256_cvttpd_epi32(ewrt);
2455             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2456             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2457                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2458                                             &ewtabF,&ewtabFn);
2459             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2460             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2461
2462             fscal            = felec;
2463
2464             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2465
2466             /* Calculate temporary vectorial force */
2467             tx               = _mm256_mul_pd(fscal,dx32);
2468             ty               = _mm256_mul_pd(fscal,dy32);
2469             tz               = _mm256_mul_pd(fscal,dz32);
2470
2471             /* Update vectorial force */
2472             fix3             = _mm256_add_pd(fix3,tx);
2473             fiy3             = _mm256_add_pd(fiy3,ty);
2474             fiz3             = _mm256_add_pd(fiz3,tz);
2475
2476             fjx2             = _mm256_add_pd(fjx2,tx);
2477             fjy2             = _mm256_add_pd(fjy2,ty);
2478             fjz2             = _mm256_add_pd(fjz2,tz);
2479
2480             /**************************
2481              * CALCULATE INTERACTIONS *
2482              **************************/
2483
2484             r33              = _mm256_mul_pd(rsq33,rinv33);
2485             r33              = _mm256_andnot_pd(dummy_mask,r33);
2486
2487             /* EWALD ELECTROSTATICS */
2488
2489             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2490             ewrt             = _mm256_mul_pd(r33,ewtabscale);
2491             ewitab           = _mm256_cvttpd_epi32(ewrt);
2492             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2493             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2494                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2495                                             &ewtabF,&ewtabFn);
2496             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2497             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2498
2499             fscal            = felec;
2500
2501             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2502
2503             /* Calculate temporary vectorial force */
2504             tx               = _mm256_mul_pd(fscal,dx33);
2505             ty               = _mm256_mul_pd(fscal,dy33);
2506             tz               = _mm256_mul_pd(fscal,dz33);
2507
2508             /* Update vectorial force */
2509             fix3             = _mm256_add_pd(fix3,tx);
2510             fiy3             = _mm256_add_pd(fiy3,ty);
2511             fiz3             = _mm256_add_pd(fiz3,tz);
2512
2513             fjx3             = _mm256_add_pd(fjx3,tx);
2514             fjy3             = _mm256_add_pd(fjy3,ty);
2515             fjz3             = _mm256_add_pd(fjz3,tz);
2516
2517             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2518             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2519             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2520             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2521
2522             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2523                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2524                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2525
2526             /* Inner loop uses 383 flops */
2527         }
2528
2529         /* End of innermost loop */
2530
2531         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2532                                                  f+i_coord_offset,fshift+i_shift_offset);
2533
2534         /* Increment number of inner iterations */
2535         inneriter                  += j_index_end - j_index_start;
2536
2537         /* Outer loop uses 24 flops */
2538     }
2539
2540     /* Increment number of outer iterations */
2541     outeriter        += nri;
2542
2543     /* Update outer/inner flops */
2544
2545     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*383);
2546 }