b11abb8fa40aeebaf4ce0107d6d26d49edb1eaff
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_256_double
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LJEwald
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_256_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82     real             scratch[4*DIM];
83     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84     real *           vdwioffsetptr0;
85     real *           vdwgridioffsetptr0;
86     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     real *           vdwioffsetptr1;
88     real *           vdwgridioffsetptr1;
89     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     real *           vdwgridioffsetptr2;
92     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
93     real *           vdwioffsetptr3;
94     real *           vdwgridioffsetptr3;
95     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
96     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
97     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
98     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
99     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
100     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
101     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
102     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
103     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
104     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
105     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
106     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
107     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
108     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
109     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
110     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
111     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
112     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
113     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
114     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
115     real             *charge;
116     int              nvdwtype;
117     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
118     int              *vdwtype;
119     real             *vdwparam;
120     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
121     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
122     __m256d           c6grid_00;
123     __m256d           c6grid_11;
124     __m256d           c6grid_12;
125     __m256d           c6grid_13;
126     __m256d           c6grid_21;
127     __m256d           c6grid_22;
128     __m256d           c6grid_23;
129     __m256d           c6grid_31;
130     __m256d           c6grid_32;
131     __m256d           c6grid_33;
132     real             *vdwgridparam;
133     __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
134     __m256d           one_half  = _mm256_set1_pd(0.5);
135     __m256d           minus_one = _mm256_set1_pd(-1.0);
136     __m128i          ewitab;
137     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
138     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
139     real             *ewtab;
140     __m256d          dummy_mask,cutoff_mask;
141     __m128           tmpmask0,tmpmask1;
142     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
143     __m256d          one     = _mm256_set1_pd(1.0);
144     __m256d          two     = _mm256_set1_pd(2.0);
145     x                = xx[0];
146     f                = ff[0];
147
148     nri              = nlist->nri;
149     iinr             = nlist->iinr;
150     jindex           = nlist->jindex;
151     jjnr             = nlist->jjnr;
152     shiftidx         = nlist->shift;
153     gid              = nlist->gid;
154     shiftvec         = fr->shift_vec[0];
155     fshift           = fr->fshift[0];
156     facel            = _mm256_set1_pd(fr->epsfac);
157     charge           = mdatoms->chargeA;
158     nvdwtype         = fr->ntype;
159     vdwparam         = fr->nbfp;
160     vdwtype          = mdatoms->typeA;
161     vdwgridparam     = fr->ljpme_c6grid;
162     sh_lj_ewald      = _mm256_set1_pd(fr->ic->sh_lj_ewald);
163     ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
164     ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
165
166     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
167     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
168     beta2            = _mm256_mul_pd(beta,beta);
169     beta3            = _mm256_mul_pd(beta,beta2);
170
171     ewtab            = fr->ic->tabq_coul_FDV0;
172     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
173     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
174
175     /* Setup water-specific parameters */
176     inr              = nlist->iinr[0];
177     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
178     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
179     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
180     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
181     vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
182
183     jq1              = _mm256_set1_pd(charge[inr+1]);
184     jq2              = _mm256_set1_pd(charge[inr+2]);
185     jq3              = _mm256_set1_pd(charge[inr+3]);
186     vdwjidx0A        = 2*vdwtype[inr+0];
187     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
188     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
189     c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
190     qq11             = _mm256_mul_pd(iq1,jq1);
191     qq12             = _mm256_mul_pd(iq1,jq2);
192     qq13             = _mm256_mul_pd(iq1,jq3);
193     qq21             = _mm256_mul_pd(iq2,jq1);
194     qq22             = _mm256_mul_pd(iq2,jq2);
195     qq23             = _mm256_mul_pd(iq2,jq3);
196     qq31             = _mm256_mul_pd(iq3,jq1);
197     qq32             = _mm256_mul_pd(iq3,jq2);
198     qq33             = _mm256_mul_pd(iq3,jq3);
199
200     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
201     rcutoff_scalar   = fr->rcoulomb;
202     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
203     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
204
205     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
206     rvdw             = _mm256_set1_pd(fr->rvdw);
207
208     /* Avoid stupid compiler warnings */
209     jnrA = jnrB = jnrC = jnrD = 0;
210     j_coord_offsetA = 0;
211     j_coord_offsetB = 0;
212     j_coord_offsetC = 0;
213     j_coord_offsetD = 0;
214
215     outeriter        = 0;
216     inneriter        = 0;
217
218     for(iidx=0;iidx<4*DIM;iidx++)
219     {
220         scratch[iidx] = 0.0;
221     }
222
223     /* Start outer loop over neighborlists */
224     for(iidx=0; iidx<nri; iidx++)
225     {
226         /* Load shift vector for this list */
227         i_shift_offset   = DIM*shiftidx[iidx];
228
229         /* Load limits for loop over neighbors */
230         j_index_start    = jindex[iidx];
231         j_index_end      = jindex[iidx+1];
232
233         /* Get outer coordinate index */
234         inr              = iinr[iidx];
235         i_coord_offset   = DIM*inr;
236
237         /* Load i particle coords and add shift vector */
238         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
239                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
240
241         fix0             = _mm256_setzero_pd();
242         fiy0             = _mm256_setzero_pd();
243         fiz0             = _mm256_setzero_pd();
244         fix1             = _mm256_setzero_pd();
245         fiy1             = _mm256_setzero_pd();
246         fiz1             = _mm256_setzero_pd();
247         fix2             = _mm256_setzero_pd();
248         fiy2             = _mm256_setzero_pd();
249         fiz2             = _mm256_setzero_pd();
250         fix3             = _mm256_setzero_pd();
251         fiy3             = _mm256_setzero_pd();
252         fiz3             = _mm256_setzero_pd();
253
254         /* Reset potential sums */
255         velecsum         = _mm256_setzero_pd();
256         vvdwsum          = _mm256_setzero_pd();
257
258         /* Start inner kernel loop */
259         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
260         {
261
262             /* Get j neighbor index, and coordinate index */
263             jnrA             = jjnr[jidx];
264             jnrB             = jjnr[jidx+1];
265             jnrC             = jjnr[jidx+2];
266             jnrD             = jjnr[jidx+3];
267             j_coord_offsetA  = DIM*jnrA;
268             j_coord_offsetB  = DIM*jnrB;
269             j_coord_offsetC  = DIM*jnrC;
270             j_coord_offsetD  = DIM*jnrD;
271
272             /* load j atom coordinates */
273             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
274                                                  x+j_coord_offsetC,x+j_coord_offsetD,
275                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
276                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
277
278             /* Calculate displacement vector */
279             dx00             = _mm256_sub_pd(ix0,jx0);
280             dy00             = _mm256_sub_pd(iy0,jy0);
281             dz00             = _mm256_sub_pd(iz0,jz0);
282             dx11             = _mm256_sub_pd(ix1,jx1);
283             dy11             = _mm256_sub_pd(iy1,jy1);
284             dz11             = _mm256_sub_pd(iz1,jz1);
285             dx12             = _mm256_sub_pd(ix1,jx2);
286             dy12             = _mm256_sub_pd(iy1,jy2);
287             dz12             = _mm256_sub_pd(iz1,jz2);
288             dx13             = _mm256_sub_pd(ix1,jx3);
289             dy13             = _mm256_sub_pd(iy1,jy3);
290             dz13             = _mm256_sub_pd(iz1,jz3);
291             dx21             = _mm256_sub_pd(ix2,jx1);
292             dy21             = _mm256_sub_pd(iy2,jy1);
293             dz21             = _mm256_sub_pd(iz2,jz1);
294             dx22             = _mm256_sub_pd(ix2,jx2);
295             dy22             = _mm256_sub_pd(iy2,jy2);
296             dz22             = _mm256_sub_pd(iz2,jz2);
297             dx23             = _mm256_sub_pd(ix2,jx3);
298             dy23             = _mm256_sub_pd(iy2,jy3);
299             dz23             = _mm256_sub_pd(iz2,jz3);
300             dx31             = _mm256_sub_pd(ix3,jx1);
301             dy31             = _mm256_sub_pd(iy3,jy1);
302             dz31             = _mm256_sub_pd(iz3,jz1);
303             dx32             = _mm256_sub_pd(ix3,jx2);
304             dy32             = _mm256_sub_pd(iy3,jy2);
305             dz32             = _mm256_sub_pd(iz3,jz2);
306             dx33             = _mm256_sub_pd(ix3,jx3);
307             dy33             = _mm256_sub_pd(iy3,jy3);
308             dz33             = _mm256_sub_pd(iz3,jz3);
309
310             /* Calculate squared distance and things based on it */
311             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
312             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
313             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
314             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
315             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
316             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
317             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
318             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
319             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
320             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
321
322             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
323             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
324             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
325             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
326             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
327             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
328             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
329             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
330             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
331             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
332
333             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
334             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
335             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
336             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
337             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
338             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
339             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
340             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
341             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
342             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
343
344             fjx0             = _mm256_setzero_pd();
345             fjy0             = _mm256_setzero_pd();
346             fjz0             = _mm256_setzero_pd();
347             fjx1             = _mm256_setzero_pd();
348             fjy1             = _mm256_setzero_pd();
349             fjz1             = _mm256_setzero_pd();
350             fjx2             = _mm256_setzero_pd();
351             fjy2             = _mm256_setzero_pd();
352             fjz2             = _mm256_setzero_pd();
353             fjx3             = _mm256_setzero_pd();
354             fjy3             = _mm256_setzero_pd();
355             fjz3             = _mm256_setzero_pd();
356
357             /**************************
358              * CALCULATE INTERACTIONS *
359              **************************/
360
361             if (gmx_mm256_any_lt(rsq00,rcutoff2))
362             {
363
364             r00              = _mm256_mul_pd(rsq00,rinv00);
365
366             /* Analytical LJ-PME */
367             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
368             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
369             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
370             exponent         = gmx_simd_exp_d(ewcljrsq);
371             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
372             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
373             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
374             vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
375             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
376             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
377                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
378             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
379             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
380
381             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
382
383             /* Update potential sum for this i atom from the interaction with this j atom. */
384             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
385             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
386
387             fscal            = fvdw;
388
389             fscal            = _mm256_and_pd(fscal,cutoff_mask);
390
391             /* Calculate temporary vectorial force */
392             tx               = _mm256_mul_pd(fscal,dx00);
393             ty               = _mm256_mul_pd(fscal,dy00);
394             tz               = _mm256_mul_pd(fscal,dz00);
395
396             /* Update vectorial force */
397             fix0             = _mm256_add_pd(fix0,tx);
398             fiy0             = _mm256_add_pd(fiy0,ty);
399             fiz0             = _mm256_add_pd(fiz0,tz);
400
401             fjx0             = _mm256_add_pd(fjx0,tx);
402             fjy0             = _mm256_add_pd(fjy0,ty);
403             fjz0             = _mm256_add_pd(fjz0,tz);
404
405             }
406
407             /**************************
408              * CALCULATE INTERACTIONS *
409              **************************/
410
411             if (gmx_mm256_any_lt(rsq11,rcutoff2))
412             {
413
414             r11              = _mm256_mul_pd(rsq11,rinv11);
415
416             /* EWALD ELECTROSTATICS */
417
418             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
419             ewrt             = _mm256_mul_pd(r11,ewtabscale);
420             ewitab           = _mm256_cvttpd_epi32(ewrt);
421             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
422             ewitab           = _mm_slli_epi32(ewitab,2);
423             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
424             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
425             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
426             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
427             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
428             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
429             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
430             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
431             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
432
433             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
434
435             /* Update potential sum for this i atom from the interaction with this j atom. */
436             velec            = _mm256_and_pd(velec,cutoff_mask);
437             velecsum         = _mm256_add_pd(velecsum,velec);
438
439             fscal            = felec;
440
441             fscal            = _mm256_and_pd(fscal,cutoff_mask);
442
443             /* Calculate temporary vectorial force */
444             tx               = _mm256_mul_pd(fscal,dx11);
445             ty               = _mm256_mul_pd(fscal,dy11);
446             tz               = _mm256_mul_pd(fscal,dz11);
447
448             /* Update vectorial force */
449             fix1             = _mm256_add_pd(fix1,tx);
450             fiy1             = _mm256_add_pd(fiy1,ty);
451             fiz1             = _mm256_add_pd(fiz1,tz);
452
453             fjx1             = _mm256_add_pd(fjx1,tx);
454             fjy1             = _mm256_add_pd(fjy1,ty);
455             fjz1             = _mm256_add_pd(fjz1,tz);
456
457             }
458
459             /**************************
460              * CALCULATE INTERACTIONS *
461              **************************/
462
463             if (gmx_mm256_any_lt(rsq12,rcutoff2))
464             {
465
466             r12              = _mm256_mul_pd(rsq12,rinv12);
467
468             /* EWALD ELECTROSTATICS */
469
470             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
471             ewrt             = _mm256_mul_pd(r12,ewtabscale);
472             ewitab           = _mm256_cvttpd_epi32(ewrt);
473             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
474             ewitab           = _mm_slli_epi32(ewitab,2);
475             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
476             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
477             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
478             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
479             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
480             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
481             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
482             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
483             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
484
485             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
486
487             /* Update potential sum for this i atom from the interaction with this j atom. */
488             velec            = _mm256_and_pd(velec,cutoff_mask);
489             velecsum         = _mm256_add_pd(velecsum,velec);
490
491             fscal            = felec;
492
493             fscal            = _mm256_and_pd(fscal,cutoff_mask);
494
495             /* Calculate temporary vectorial force */
496             tx               = _mm256_mul_pd(fscal,dx12);
497             ty               = _mm256_mul_pd(fscal,dy12);
498             tz               = _mm256_mul_pd(fscal,dz12);
499
500             /* Update vectorial force */
501             fix1             = _mm256_add_pd(fix1,tx);
502             fiy1             = _mm256_add_pd(fiy1,ty);
503             fiz1             = _mm256_add_pd(fiz1,tz);
504
505             fjx2             = _mm256_add_pd(fjx2,tx);
506             fjy2             = _mm256_add_pd(fjy2,ty);
507             fjz2             = _mm256_add_pd(fjz2,tz);
508
509             }
510
511             /**************************
512              * CALCULATE INTERACTIONS *
513              **************************/
514
515             if (gmx_mm256_any_lt(rsq13,rcutoff2))
516             {
517
518             r13              = _mm256_mul_pd(rsq13,rinv13);
519
520             /* EWALD ELECTROSTATICS */
521
522             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
523             ewrt             = _mm256_mul_pd(r13,ewtabscale);
524             ewitab           = _mm256_cvttpd_epi32(ewrt);
525             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
526             ewitab           = _mm_slli_epi32(ewitab,2);
527             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
528             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
529             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
530             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
531             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
532             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
533             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
534             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_sub_pd(rinv13,sh_ewald),velec));
535             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
536
537             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
538
539             /* Update potential sum for this i atom from the interaction with this j atom. */
540             velec            = _mm256_and_pd(velec,cutoff_mask);
541             velecsum         = _mm256_add_pd(velecsum,velec);
542
543             fscal            = felec;
544
545             fscal            = _mm256_and_pd(fscal,cutoff_mask);
546
547             /* Calculate temporary vectorial force */
548             tx               = _mm256_mul_pd(fscal,dx13);
549             ty               = _mm256_mul_pd(fscal,dy13);
550             tz               = _mm256_mul_pd(fscal,dz13);
551
552             /* Update vectorial force */
553             fix1             = _mm256_add_pd(fix1,tx);
554             fiy1             = _mm256_add_pd(fiy1,ty);
555             fiz1             = _mm256_add_pd(fiz1,tz);
556
557             fjx3             = _mm256_add_pd(fjx3,tx);
558             fjy3             = _mm256_add_pd(fjy3,ty);
559             fjz3             = _mm256_add_pd(fjz3,tz);
560
561             }
562
563             /**************************
564              * CALCULATE INTERACTIONS *
565              **************************/
566
567             if (gmx_mm256_any_lt(rsq21,rcutoff2))
568             {
569
570             r21              = _mm256_mul_pd(rsq21,rinv21);
571
572             /* EWALD ELECTROSTATICS */
573
574             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
575             ewrt             = _mm256_mul_pd(r21,ewtabscale);
576             ewitab           = _mm256_cvttpd_epi32(ewrt);
577             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
578             ewitab           = _mm_slli_epi32(ewitab,2);
579             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
580             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
581             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
582             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
583             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
584             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
585             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
586             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
587             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
588
589             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
590
591             /* Update potential sum for this i atom from the interaction with this j atom. */
592             velec            = _mm256_and_pd(velec,cutoff_mask);
593             velecsum         = _mm256_add_pd(velecsum,velec);
594
595             fscal            = felec;
596
597             fscal            = _mm256_and_pd(fscal,cutoff_mask);
598
599             /* Calculate temporary vectorial force */
600             tx               = _mm256_mul_pd(fscal,dx21);
601             ty               = _mm256_mul_pd(fscal,dy21);
602             tz               = _mm256_mul_pd(fscal,dz21);
603
604             /* Update vectorial force */
605             fix2             = _mm256_add_pd(fix2,tx);
606             fiy2             = _mm256_add_pd(fiy2,ty);
607             fiz2             = _mm256_add_pd(fiz2,tz);
608
609             fjx1             = _mm256_add_pd(fjx1,tx);
610             fjy1             = _mm256_add_pd(fjy1,ty);
611             fjz1             = _mm256_add_pd(fjz1,tz);
612
613             }
614
615             /**************************
616              * CALCULATE INTERACTIONS *
617              **************************/
618
619             if (gmx_mm256_any_lt(rsq22,rcutoff2))
620             {
621
622             r22              = _mm256_mul_pd(rsq22,rinv22);
623
624             /* EWALD ELECTROSTATICS */
625
626             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
627             ewrt             = _mm256_mul_pd(r22,ewtabscale);
628             ewitab           = _mm256_cvttpd_epi32(ewrt);
629             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
630             ewitab           = _mm_slli_epi32(ewitab,2);
631             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
632             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
633             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
634             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
635             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
636             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
637             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
638             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
639             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
640
641             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
642
643             /* Update potential sum for this i atom from the interaction with this j atom. */
644             velec            = _mm256_and_pd(velec,cutoff_mask);
645             velecsum         = _mm256_add_pd(velecsum,velec);
646
647             fscal            = felec;
648
649             fscal            = _mm256_and_pd(fscal,cutoff_mask);
650
651             /* Calculate temporary vectorial force */
652             tx               = _mm256_mul_pd(fscal,dx22);
653             ty               = _mm256_mul_pd(fscal,dy22);
654             tz               = _mm256_mul_pd(fscal,dz22);
655
656             /* Update vectorial force */
657             fix2             = _mm256_add_pd(fix2,tx);
658             fiy2             = _mm256_add_pd(fiy2,ty);
659             fiz2             = _mm256_add_pd(fiz2,tz);
660
661             fjx2             = _mm256_add_pd(fjx2,tx);
662             fjy2             = _mm256_add_pd(fjy2,ty);
663             fjz2             = _mm256_add_pd(fjz2,tz);
664
665             }
666
667             /**************************
668              * CALCULATE INTERACTIONS *
669              **************************/
670
671             if (gmx_mm256_any_lt(rsq23,rcutoff2))
672             {
673
674             r23              = _mm256_mul_pd(rsq23,rinv23);
675
676             /* EWALD ELECTROSTATICS */
677
678             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
679             ewrt             = _mm256_mul_pd(r23,ewtabscale);
680             ewitab           = _mm256_cvttpd_epi32(ewrt);
681             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
682             ewitab           = _mm_slli_epi32(ewitab,2);
683             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
684             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
685             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
686             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
687             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
688             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
689             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
690             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_sub_pd(rinv23,sh_ewald),velec));
691             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
692
693             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
694
695             /* Update potential sum for this i atom from the interaction with this j atom. */
696             velec            = _mm256_and_pd(velec,cutoff_mask);
697             velecsum         = _mm256_add_pd(velecsum,velec);
698
699             fscal            = felec;
700
701             fscal            = _mm256_and_pd(fscal,cutoff_mask);
702
703             /* Calculate temporary vectorial force */
704             tx               = _mm256_mul_pd(fscal,dx23);
705             ty               = _mm256_mul_pd(fscal,dy23);
706             tz               = _mm256_mul_pd(fscal,dz23);
707
708             /* Update vectorial force */
709             fix2             = _mm256_add_pd(fix2,tx);
710             fiy2             = _mm256_add_pd(fiy2,ty);
711             fiz2             = _mm256_add_pd(fiz2,tz);
712
713             fjx3             = _mm256_add_pd(fjx3,tx);
714             fjy3             = _mm256_add_pd(fjy3,ty);
715             fjz3             = _mm256_add_pd(fjz3,tz);
716
717             }
718
719             /**************************
720              * CALCULATE INTERACTIONS *
721              **************************/
722
723             if (gmx_mm256_any_lt(rsq31,rcutoff2))
724             {
725
726             r31              = _mm256_mul_pd(rsq31,rinv31);
727
728             /* EWALD ELECTROSTATICS */
729
730             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
731             ewrt             = _mm256_mul_pd(r31,ewtabscale);
732             ewitab           = _mm256_cvttpd_epi32(ewrt);
733             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
734             ewitab           = _mm_slli_epi32(ewitab,2);
735             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
736             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
737             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
738             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
739             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
740             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
741             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
742             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_sub_pd(rinv31,sh_ewald),velec));
743             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
744
745             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
746
747             /* Update potential sum for this i atom from the interaction with this j atom. */
748             velec            = _mm256_and_pd(velec,cutoff_mask);
749             velecsum         = _mm256_add_pd(velecsum,velec);
750
751             fscal            = felec;
752
753             fscal            = _mm256_and_pd(fscal,cutoff_mask);
754
755             /* Calculate temporary vectorial force */
756             tx               = _mm256_mul_pd(fscal,dx31);
757             ty               = _mm256_mul_pd(fscal,dy31);
758             tz               = _mm256_mul_pd(fscal,dz31);
759
760             /* Update vectorial force */
761             fix3             = _mm256_add_pd(fix3,tx);
762             fiy3             = _mm256_add_pd(fiy3,ty);
763             fiz3             = _mm256_add_pd(fiz3,tz);
764
765             fjx1             = _mm256_add_pd(fjx1,tx);
766             fjy1             = _mm256_add_pd(fjy1,ty);
767             fjz1             = _mm256_add_pd(fjz1,tz);
768
769             }
770
771             /**************************
772              * CALCULATE INTERACTIONS *
773              **************************/
774
775             if (gmx_mm256_any_lt(rsq32,rcutoff2))
776             {
777
778             r32              = _mm256_mul_pd(rsq32,rinv32);
779
780             /* EWALD ELECTROSTATICS */
781
782             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
783             ewrt             = _mm256_mul_pd(r32,ewtabscale);
784             ewitab           = _mm256_cvttpd_epi32(ewrt);
785             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
786             ewitab           = _mm_slli_epi32(ewitab,2);
787             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
788             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
789             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
790             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
791             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
792             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
793             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
794             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_sub_pd(rinv32,sh_ewald),velec));
795             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
796
797             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
798
799             /* Update potential sum for this i atom from the interaction with this j atom. */
800             velec            = _mm256_and_pd(velec,cutoff_mask);
801             velecsum         = _mm256_add_pd(velecsum,velec);
802
803             fscal            = felec;
804
805             fscal            = _mm256_and_pd(fscal,cutoff_mask);
806
807             /* Calculate temporary vectorial force */
808             tx               = _mm256_mul_pd(fscal,dx32);
809             ty               = _mm256_mul_pd(fscal,dy32);
810             tz               = _mm256_mul_pd(fscal,dz32);
811
812             /* Update vectorial force */
813             fix3             = _mm256_add_pd(fix3,tx);
814             fiy3             = _mm256_add_pd(fiy3,ty);
815             fiz3             = _mm256_add_pd(fiz3,tz);
816
817             fjx2             = _mm256_add_pd(fjx2,tx);
818             fjy2             = _mm256_add_pd(fjy2,ty);
819             fjz2             = _mm256_add_pd(fjz2,tz);
820
821             }
822
823             /**************************
824              * CALCULATE INTERACTIONS *
825              **************************/
826
827             if (gmx_mm256_any_lt(rsq33,rcutoff2))
828             {
829
830             r33              = _mm256_mul_pd(rsq33,rinv33);
831
832             /* EWALD ELECTROSTATICS */
833
834             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
835             ewrt             = _mm256_mul_pd(r33,ewtabscale);
836             ewitab           = _mm256_cvttpd_epi32(ewrt);
837             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
838             ewitab           = _mm_slli_epi32(ewitab,2);
839             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
840             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
841             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
842             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
843             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
844             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
845             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
846             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_sub_pd(rinv33,sh_ewald),velec));
847             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
848
849             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
850
851             /* Update potential sum for this i atom from the interaction with this j atom. */
852             velec            = _mm256_and_pd(velec,cutoff_mask);
853             velecsum         = _mm256_add_pd(velecsum,velec);
854
855             fscal            = felec;
856
857             fscal            = _mm256_and_pd(fscal,cutoff_mask);
858
859             /* Calculate temporary vectorial force */
860             tx               = _mm256_mul_pd(fscal,dx33);
861             ty               = _mm256_mul_pd(fscal,dy33);
862             tz               = _mm256_mul_pd(fscal,dz33);
863
864             /* Update vectorial force */
865             fix3             = _mm256_add_pd(fix3,tx);
866             fiy3             = _mm256_add_pd(fiy3,ty);
867             fiz3             = _mm256_add_pd(fiz3,tz);
868
869             fjx3             = _mm256_add_pd(fjx3,tx);
870             fjy3             = _mm256_add_pd(fjy3,ty);
871             fjz3             = _mm256_add_pd(fjz3,tz);
872
873             }
874
875             fjptrA             = f+j_coord_offsetA;
876             fjptrB             = f+j_coord_offsetB;
877             fjptrC             = f+j_coord_offsetC;
878             fjptrD             = f+j_coord_offsetD;
879
880             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
881                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
882                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
883
884             /* Inner loop uses 479 flops */
885         }
886
887         if(jidx<j_index_end)
888         {
889
890             /* Get j neighbor index, and coordinate index */
891             jnrlistA         = jjnr[jidx];
892             jnrlistB         = jjnr[jidx+1];
893             jnrlistC         = jjnr[jidx+2];
894             jnrlistD         = jjnr[jidx+3];
895             /* Sign of each element will be negative for non-real atoms.
896              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
897              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
898              */
899             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
900
901             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
902             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
903             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
904
905             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
906             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
907             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
908             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
909             j_coord_offsetA  = DIM*jnrA;
910             j_coord_offsetB  = DIM*jnrB;
911             j_coord_offsetC  = DIM*jnrC;
912             j_coord_offsetD  = DIM*jnrD;
913
914             /* load j atom coordinates */
915             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
916                                                  x+j_coord_offsetC,x+j_coord_offsetD,
917                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
918                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
919
920             /* Calculate displacement vector */
921             dx00             = _mm256_sub_pd(ix0,jx0);
922             dy00             = _mm256_sub_pd(iy0,jy0);
923             dz00             = _mm256_sub_pd(iz0,jz0);
924             dx11             = _mm256_sub_pd(ix1,jx1);
925             dy11             = _mm256_sub_pd(iy1,jy1);
926             dz11             = _mm256_sub_pd(iz1,jz1);
927             dx12             = _mm256_sub_pd(ix1,jx2);
928             dy12             = _mm256_sub_pd(iy1,jy2);
929             dz12             = _mm256_sub_pd(iz1,jz2);
930             dx13             = _mm256_sub_pd(ix1,jx3);
931             dy13             = _mm256_sub_pd(iy1,jy3);
932             dz13             = _mm256_sub_pd(iz1,jz3);
933             dx21             = _mm256_sub_pd(ix2,jx1);
934             dy21             = _mm256_sub_pd(iy2,jy1);
935             dz21             = _mm256_sub_pd(iz2,jz1);
936             dx22             = _mm256_sub_pd(ix2,jx2);
937             dy22             = _mm256_sub_pd(iy2,jy2);
938             dz22             = _mm256_sub_pd(iz2,jz2);
939             dx23             = _mm256_sub_pd(ix2,jx3);
940             dy23             = _mm256_sub_pd(iy2,jy3);
941             dz23             = _mm256_sub_pd(iz2,jz3);
942             dx31             = _mm256_sub_pd(ix3,jx1);
943             dy31             = _mm256_sub_pd(iy3,jy1);
944             dz31             = _mm256_sub_pd(iz3,jz1);
945             dx32             = _mm256_sub_pd(ix3,jx2);
946             dy32             = _mm256_sub_pd(iy3,jy2);
947             dz32             = _mm256_sub_pd(iz3,jz2);
948             dx33             = _mm256_sub_pd(ix3,jx3);
949             dy33             = _mm256_sub_pd(iy3,jy3);
950             dz33             = _mm256_sub_pd(iz3,jz3);
951
952             /* Calculate squared distance and things based on it */
953             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
954             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
955             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
956             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
957             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
958             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
959             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
960             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
961             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
962             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
963
964             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
965             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
966             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
967             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
968             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
969             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
970             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
971             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
972             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
973             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
974
975             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
976             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
977             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
978             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
979             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
980             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
981             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
982             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
983             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
984             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
985
986             fjx0             = _mm256_setzero_pd();
987             fjy0             = _mm256_setzero_pd();
988             fjz0             = _mm256_setzero_pd();
989             fjx1             = _mm256_setzero_pd();
990             fjy1             = _mm256_setzero_pd();
991             fjz1             = _mm256_setzero_pd();
992             fjx2             = _mm256_setzero_pd();
993             fjy2             = _mm256_setzero_pd();
994             fjz2             = _mm256_setzero_pd();
995             fjx3             = _mm256_setzero_pd();
996             fjy3             = _mm256_setzero_pd();
997             fjz3             = _mm256_setzero_pd();
998
999             /**************************
1000              * CALCULATE INTERACTIONS *
1001              **************************/
1002
1003             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1004             {
1005
1006             r00              = _mm256_mul_pd(rsq00,rinv00);
1007             r00              = _mm256_andnot_pd(dummy_mask,r00);
1008
1009             /* Analytical LJ-PME */
1010             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1011             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
1012             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
1013             exponent         = gmx_simd_exp_d(ewcljrsq);
1014             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1015             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1016             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
1017             vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
1018             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1019             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
1020                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
1021             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
1022             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
1023
1024             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1025
1026             /* Update potential sum for this i atom from the interaction with this j atom. */
1027             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
1028             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
1029             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
1030
1031             fscal            = fvdw;
1032
1033             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1034
1035             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1036
1037             /* Calculate temporary vectorial force */
1038             tx               = _mm256_mul_pd(fscal,dx00);
1039             ty               = _mm256_mul_pd(fscal,dy00);
1040             tz               = _mm256_mul_pd(fscal,dz00);
1041
1042             /* Update vectorial force */
1043             fix0             = _mm256_add_pd(fix0,tx);
1044             fiy0             = _mm256_add_pd(fiy0,ty);
1045             fiz0             = _mm256_add_pd(fiz0,tz);
1046
1047             fjx0             = _mm256_add_pd(fjx0,tx);
1048             fjy0             = _mm256_add_pd(fjy0,ty);
1049             fjz0             = _mm256_add_pd(fjz0,tz);
1050
1051             }
1052
1053             /**************************
1054              * CALCULATE INTERACTIONS *
1055              **************************/
1056
1057             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1058             {
1059
1060             r11              = _mm256_mul_pd(rsq11,rinv11);
1061             r11              = _mm256_andnot_pd(dummy_mask,r11);
1062
1063             /* EWALD ELECTROSTATICS */
1064
1065             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1066             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1067             ewitab           = _mm256_cvttpd_epi32(ewrt);
1068             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1069             ewitab           = _mm_slli_epi32(ewitab,2);
1070             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1071             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1072             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1073             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1074             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1075             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1076             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1077             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
1078             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1079
1080             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1081
1082             /* Update potential sum for this i atom from the interaction with this j atom. */
1083             velec            = _mm256_and_pd(velec,cutoff_mask);
1084             velec            = _mm256_andnot_pd(dummy_mask,velec);
1085             velecsum         = _mm256_add_pd(velecsum,velec);
1086
1087             fscal            = felec;
1088
1089             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1090
1091             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1092
1093             /* Calculate temporary vectorial force */
1094             tx               = _mm256_mul_pd(fscal,dx11);
1095             ty               = _mm256_mul_pd(fscal,dy11);
1096             tz               = _mm256_mul_pd(fscal,dz11);
1097
1098             /* Update vectorial force */
1099             fix1             = _mm256_add_pd(fix1,tx);
1100             fiy1             = _mm256_add_pd(fiy1,ty);
1101             fiz1             = _mm256_add_pd(fiz1,tz);
1102
1103             fjx1             = _mm256_add_pd(fjx1,tx);
1104             fjy1             = _mm256_add_pd(fjy1,ty);
1105             fjz1             = _mm256_add_pd(fjz1,tz);
1106
1107             }
1108
1109             /**************************
1110              * CALCULATE INTERACTIONS *
1111              **************************/
1112
1113             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1114             {
1115
1116             r12              = _mm256_mul_pd(rsq12,rinv12);
1117             r12              = _mm256_andnot_pd(dummy_mask,r12);
1118
1119             /* EWALD ELECTROSTATICS */
1120
1121             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1122             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1123             ewitab           = _mm256_cvttpd_epi32(ewrt);
1124             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1125             ewitab           = _mm_slli_epi32(ewitab,2);
1126             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1127             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1128             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1129             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1130             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1131             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1132             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1133             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
1134             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1135
1136             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1137
1138             /* Update potential sum for this i atom from the interaction with this j atom. */
1139             velec            = _mm256_and_pd(velec,cutoff_mask);
1140             velec            = _mm256_andnot_pd(dummy_mask,velec);
1141             velecsum         = _mm256_add_pd(velecsum,velec);
1142
1143             fscal            = felec;
1144
1145             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1146
1147             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1148
1149             /* Calculate temporary vectorial force */
1150             tx               = _mm256_mul_pd(fscal,dx12);
1151             ty               = _mm256_mul_pd(fscal,dy12);
1152             tz               = _mm256_mul_pd(fscal,dz12);
1153
1154             /* Update vectorial force */
1155             fix1             = _mm256_add_pd(fix1,tx);
1156             fiy1             = _mm256_add_pd(fiy1,ty);
1157             fiz1             = _mm256_add_pd(fiz1,tz);
1158
1159             fjx2             = _mm256_add_pd(fjx2,tx);
1160             fjy2             = _mm256_add_pd(fjy2,ty);
1161             fjz2             = _mm256_add_pd(fjz2,tz);
1162
1163             }
1164
1165             /**************************
1166              * CALCULATE INTERACTIONS *
1167              **************************/
1168
1169             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1170             {
1171
1172             r13              = _mm256_mul_pd(rsq13,rinv13);
1173             r13              = _mm256_andnot_pd(dummy_mask,r13);
1174
1175             /* EWALD ELECTROSTATICS */
1176
1177             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1178             ewrt             = _mm256_mul_pd(r13,ewtabscale);
1179             ewitab           = _mm256_cvttpd_epi32(ewrt);
1180             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1181             ewitab           = _mm_slli_epi32(ewitab,2);
1182             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1183             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1184             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1185             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1186             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1187             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1188             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1189             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_sub_pd(rinv13,sh_ewald),velec));
1190             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1191
1192             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1193
1194             /* Update potential sum for this i atom from the interaction with this j atom. */
1195             velec            = _mm256_and_pd(velec,cutoff_mask);
1196             velec            = _mm256_andnot_pd(dummy_mask,velec);
1197             velecsum         = _mm256_add_pd(velecsum,velec);
1198
1199             fscal            = felec;
1200
1201             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1202
1203             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1204
1205             /* Calculate temporary vectorial force */
1206             tx               = _mm256_mul_pd(fscal,dx13);
1207             ty               = _mm256_mul_pd(fscal,dy13);
1208             tz               = _mm256_mul_pd(fscal,dz13);
1209
1210             /* Update vectorial force */
1211             fix1             = _mm256_add_pd(fix1,tx);
1212             fiy1             = _mm256_add_pd(fiy1,ty);
1213             fiz1             = _mm256_add_pd(fiz1,tz);
1214
1215             fjx3             = _mm256_add_pd(fjx3,tx);
1216             fjy3             = _mm256_add_pd(fjy3,ty);
1217             fjz3             = _mm256_add_pd(fjz3,tz);
1218
1219             }
1220
1221             /**************************
1222              * CALCULATE INTERACTIONS *
1223              **************************/
1224
1225             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1226             {
1227
1228             r21              = _mm256_mul_pd(rsq21,rinv21);
1229             r21              = _mm256_andnot_pd(dummy_mask,r21);
1230
1231             /* EWALD ELECTROSTATICS */
1232
1233             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1234             ewrt             = _mm256_mul_pd(r21,ewtabscale);
1235             ewitab           = _mm256_cvttpd_epi32(ewrt);
1236             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1237             ewitab           = _mm_slli_epi32(ewitab,2);
1238             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1239             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1240             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1241             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1242             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1243             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1244             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1245             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
1246             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1247
1248             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1249
1250             /* Update potential sum for this i atom from the interaction with this j atom. */
1251             velec            = _mm256_and_pd(velec,cutoff_mask);
1252             velec            = _mm256_andnot_pd(dummy_mask,velec);
1253             velecsum         = _mm256_add_pd(velecsum,velec);
1254
1255             fscal            = felec;
1256
1257             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1258
1259             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1260
1261             /* Calculate temporary vectorial force */
1262             tx               = _mm256_mul_pd(fscal,dx21);
1263             ty               = _mm256_mul_pd(fscal,dy21);
1264             tz               = _mm256_mul_pd(fscal,dz21);
1265
1266             /* Update vectorial force */
1267             fix2             = _mm256_add_pd(fix2,tx);
1268             fiy2             = _mm256_add_pd(fiy2,ty);
1269             fiz2             = _mm256_add_pd(fiz2,tz);
1270
1271             fjx1             = _mm256_add_pd(fjx1,tx);
1272             fjy1             = _mm256_add_pd(fjy1,ty);
1273             fjz1             = _mm256_add_pd(fjz1,tz);
1274
1275             }
1276
1277             /**************************
1278              * CALCULATE INTERACTIONS *
1279              **************************/
1280
1281             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1282             {
1283
1284             r22              = _mm256_mul_pd(rsq22,rinv22);
1285             r22              = _mm256_andnot_pd(dummy_mask,r22);
1286
1287             /* EWALD ELECTROSTATICS */
1288
1289             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1290             ewrt             = _mm256_mul_pd(r22,ewtabscale);
1291             ewitab           = _mm256_cvttpd_epi32(ewrt);
1292             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1293             ewitab           = _mm_slli_epi32(ewitab,2);
1294             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1295             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1296             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1297             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1298             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1299             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1300             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1301             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
1302             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1303
1304             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1305
1306             /* Update potential sum for this i atom from the interaction with this j atom. */
1307             velec            = _mm256_and_pd(velec,cutoff_mask);
1308             velec            = _mm256_andnot_pd(dummy_mask,velec);
1309             velecsum         = _mm256_add_pd(velecsum,velec);
1310
1311             fscal            = felec;
1312
1313             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1314
1315             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1316
1317             /* Calculate temporary vectorial force */
1318             tx               = _mm256_mul_pd(fscal,dx22);
1319             ty               = _mm256_mul_pd(fscal,dy22);
1320             tz               = _mm256_mul_pd(fscal,dz22);
1321
1322             /* Update vectorial force */
1323             fix2             = _mm256_add_pd(fix2,tx);
1324             fiy2             = _mm256_add_pd(fiy2,ty);
1325             fiz2             = _mm256_add_pd(fiz2,tz);
1326
1327             fjx2             = _mm256_add_pd(fjx2,tx);
1328             fjy2             = _mm256_add_pd(fjy2,ty);
1329             fjz2             = _mm256_add_pd(fjz2,tz);
1330
1331             }
1332
1333             /**************************
1334              * CALCULATE INTERACTIONS *
1335              **************************/
1336
1337             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1338             {
1339
1340             r23              = _mm256_mul_pd(rsq23,rinv23);
1341             r23              = _mm256_andnot_pd(dummy_mask,r23);
1342
1343             /* EWALD ELECTROSTATICS */
1344
1345             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1346             ewrt             = _mm256_mul_pd(r23,ewtabscale);
1347             ewitab           = _mm256_cvttpd_epi32(ewrt);
1348             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1349             ewitab           = _mm_slli_epi32(ewitab,2);
1350             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1351             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1352             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1353             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1354             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1355             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1356             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1357             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_sub_pd(rinv23,sh_ewald),velec));
1358             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1359
1360             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1361
1362             /* Update potential sum for this i atom from the interaction with this j atom. */
1363             velec            = _mm256_and_pd(velec,cutoff_mask);
1364             velec            = _mm256_andnot_pd(dummy_mask,velec);
1365             velecsum         = _mm256_add_pd(velecsum,velec);
1366
1367             fscal            = felec;
1368
1369             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1370
1371             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1372
1373             /* Calculate temporary vectorial force */
1374             tx               = _mm256_mul_pd(fscal,dx23);
1375             ty               = _mm256_mul_pd(fscal,dy23);
1376             tz               = _mm256_mul_pd(fscal,dz23);
1377
1378             /* Update vectorial force */
1379             fix2             = _mm256_add_pd(fix2,tx);
1380             fiy2             = _mm256_add_pd(fiy2,ty);
1381             fiz2             = _mm256_add_pd(fiz2,tz);
1382
1383             fjx3             = _mm256_add_pd(fjx3,tx);
1384             fjy3             = _mm256_add_pd(fjy3,ty);
1385             fjz3             = _mm256_add_pd(fjz3,tz);
1386
1387             }
1388
1389             /**************************
1390              * CALCULATE INTERACTIONS *
1391              **************************/
1392
1393             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1394             {
1395
1396             r31              = _mm256_mul_pd(rsq31,rinv31);
1397             r31              = _mm256_andnot_pd(dummy_mask,r31);
1398
1399             /* EWALD ELECTROSTATICS */
1400
1401             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1402             ewrt             = _mm256_mul_pd(r31,ewtabscale);
1403             ewitab           = _mm256_cvttpd_epi32(ewrt);
1404             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1405             ewitab           = _mm_slli_epi32(ewitab,2);
1406             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1407             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1408             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1409             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1410             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1411             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1412             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1413             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_sub_pd(rinv31,sh_ewald),velec));
1414             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1415
1416             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1417
1418             /* Update potential sum for this i atom from the interaction with this j atom. */
1419             velec            = _mm256_and_pd(velec,cutoff_mask);
1420             velec            = _mm256_andnot_pd(dummy_mask,velec);
1421             velecsum         = _mm256_add_pd(velecsum,velec);
1422
1423             fscal            = felec;
1424
1425             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1426
1427             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1428
1429             /* Calculate temporary vectorial force */
1430             tx               = _mm256_mul_pd(fscal,dx31);
1431             ty               = _mm256_mul_pd(fscal,dy31);
1432             tz               = _mm256_mul_pd(fscal,dz31);
1433
1434             /* Update vectorial force */
1435             fix3             = _mm256_add_pd(fix3,tx);
1436             fiy3             = _mm256_add_pd(fiy3,ty);
1437             fiz3             = _mm256_add_pd(fiz3,tz);
1438
1439             fjx1             = _mm256_add_pd(fjx1,tx);
1440             fjy1             = _mm256_add_pd(fjy1,ty);
1441             fjz1             = _mm256_add_pd(fjz1,tz);
1442
1443             }
1444
1445             /**************************
1446              * CALCULATE INTERACTIONS *
1447              **************************/
1448
1449             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1450             {
1451
1452             r32              = _mm256_mul_pd(rsq32,rinv32);
1453             r32              = _mm256_andnot_pd(dummy_mask,r32);
1454
1455             /* EWALD ELECTROSTATICS */
1456
1457             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1458             ewrt             = _mm256_mul_pd(r32,ewtabscale);
1459             ewitab           = _mm256_cvttpd_epi32(ewrt);
1460             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1461             ewitab           = _mm_slli_epi32(ewitab,2);
1462             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1463             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1464             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1465             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1466             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1467             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1468             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1469             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_sub_pd(rinv32,sh_ewald),velec));
1470             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1471
1472             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1473
1474             /* Update potential sum for this i atom from the interaction with this j atom. */
1475             velec            = _mm256_and_pd(velec,cutoff_mask);
1476             velec            = _mm256_andnot_pd(dummy_mask,velec);
1477             velecsum         = _mm256_add_pd(velecsum,velec);
1478
1479             fscal            = felec;
1480
1481             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1482
1483             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1484
1485             /* Calculate temporary vectorial force */
1486             tx               = _mm256_mul_pd(fscal,dx32);
1487             ty               = _mm256_mul_pd(fscal,dy32);
1488             tz               = _mm256_mul_pd(fscal,dz32);
1489
1490             /* Update vectorial force */
1491             fix3             = _mm256_add_pd(fix3,tx);
1492             fiy3             = _mm256_add_pd(fiy3,ty);
1493             fiz3             = _mm256_add_pd(fiz3,tz);
1494
1495             fjx2             = _mm256_add_pd(fjx2,tx);
1496             fjy2             = _mm256_add_pd(fjy2,ty);
1497             fjz2             = _mm256_add_pd(fjz2,tz);
1498
1499             }
1500
1501             /**************************
1502              * CALCULATE INTERACTIONS *
1503              **************************/
1504
1505             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1506             {
1507
1508             r33              = _mm256_mul_pd(rsq33,rinv33);
1509             r33              = _mm256_andnot_pd(dummy_mask,r33);
1510
1511             /* EWALD ELECTROSTATICS */
1512
1513             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1514             ewrt             = _mm256_mul_pd(r33,ewtabscale);
1515             ewitab           = _mm256_cvttpd_epi32(ewrt);
1516             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1517             ewitab           = _mm_slli_epi32(ewitab,2);
1518             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1519             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1520             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1521             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1522             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1523             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1524             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1525             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_sub_pd(rinv33,sh_ewald),velec));
1526             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1527
1528             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1529
1530             /* Update potential sum for this i atom from the interaction with this j atom. */
1531             velec            = _mm256_and_pd(velec,cutoff_mask);
1532             velec            = _mm256_andnot_pd(dummy_mask,velec);
1533             velecsum         = _mm256_add_pd(velecsum,velec);
1534
1535             fscal            = felec;
1536
1537             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1538
1539             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1540
1541             /* Calculate temporary vectorial force */
1542             tx               = _mm256_mul_pd(fscal,dx33);
1543             ty               = _mm256_mul_pd(fscal,dy33);
1544             tz               = _mm256_mul_pd(fscal,dz33);
1545
1546             /* Update vectorial force */
1547             fix3             = _mm256_add_pd(fix3,tx);
1548             fiy3             = _mm256_add_pd(fiy3,ty);
1549             fiz3             = _mm256_add_pd(fiz3,tz);
1550
1551             fjx3             = _mm256_add_pd(fjx3,tx);
1552             fjy3             = _mm256_add_pd(fjy3,ty);
1553             fjz3             = _mm256_add_pd(fjz3,tz);
1554
1555             }
1556
1557             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1558             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1559             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1560             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1561
1562             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1563                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1564                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1565
1566             /* Inner loop uses 489 flops */
1567         }
1568
1569         /* End of innermost loop */
1570
1571         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1572                                                  f+i_coord_offset,fshift+i_shift_offset);
1573
1574         ggid                        = gid[iidx];
1575         /* Update potential energies */
1576         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1577         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1578
1579         /* Increment number of inner iterations */
1580         inneriter                  += j_index_end - j_index_start;
1581
1582         /* Outer loop uses 26 flops */
1583     }
1584
1585     /* Increment number of outer iterations */
1586     outeriter        += nri;
1587
1588     /* Update outer/inner flops */
1589
1590     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*489);
1591 }
1592 /*
1593  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_256_double
1594  * Electrostatics interaction: Ewald
1595  * VdW interaction:            LJEwald
1596  * Geometry:                   Water4-Water4
1597  * Calculate force/pot:        Force
1598  */
1599 void
1600 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_256_double
1601                     (t_nblist                    * gmx_restrict       nlist,
1602                      rvec                        * gmx_restrict          xx,
1603                      rvec                        * gmx_restrict          ff,
1604                      t_forcerec                  * gmx_restrict          fr,
1605                      t_mdatoms                   * gmx_restrict     mdatoms,
1606                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1607                      t_nrnb                      * gmx_restrict        nrnb)
1608 {
1609     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1610      * just 0 for non-waters.
1611      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1612      * jnr indices corresponding to data put in the four positions in the SIMD register.
1613      */
1614     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1615     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1616     int              jnrA,jnrB,jnrC,jnrD;
1617     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1618     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1619     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1620     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1621     real             rcutoff_scalar;
1622     real             *shiftvec,*fshift,*x,*f;
1623     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1624     real             scratch[4*DIM];
1625     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1626     real *           vdwioffsetptr0;
1627     real *           vdwgridioffsetptr0;
1628     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1629     real *           vdwioffsetptr1;
1630     real *           vdwgridioffsetptr1;
1631     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1632     real *           vdwioffsetptr2;
1633     real *           vdwgridioffsetptr2;
1634     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1635     real *           vdwioffsetptr3;
1636     real *           vdwgridioffsetptr3;
1637     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1638     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1639     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1640     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1641     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1642     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1643     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1644     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1645     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1646     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1647     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1648     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1649     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1650     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1651     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1652     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1653     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1654     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1655     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1656     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1657     real             *charge;
1658     int              nvdwtype;
1659     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1660     int              *vdwtype;
1661     real             *vdwparam;
1662     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1663     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1664     __m256d           c6grid_00;
1665     __m256d           c6grid_11;
1666     __m256d           c6grid_12;
1667     __m256d           c6grid_13;
1668     __m256d           c6grid_21;
1669     __m256d           c6grid_22;
1670     __m256d           c6grid_23;
1671     __m256d           c6grid_31;
1672     __m256d           c6grid_32;
1673     __m256d           c6grid_33;
1674     real             *vdwgridparam;
1675     __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1676     __m256d           one_half  = _mm256_set1_pd(0.5);
1677     __m256d           minus_one = _mm256_set1_pd(-1.0);
1678     __m128i          ewitab;
1679     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1680     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1681     real             *ewtab;
1682     __m256d          dummy_mask,cutoff_mask;
1683     __m128           tmpmask0,tmpmask1;
1684     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1685     __m256d          one     = _mm256_set1_pd(1.0);
1686     __m256d          two     = _mm256_set1_pd(2.0);
1687     x                = xx[0];
1688     f                = ff[0];
1689
1690     nri              = nlist->nri;
1691     iinr             = nlist->iinr;
1692     jindex           = nlist->jindex;
1693     jjnr             = nlist->jjnr;
1694     shiftidx         = nlist->shift;
1695     gid              = nlist->gid;
1696     shiftvec         = fr->shift_vec[0];
1697     fshift           = fr->fshift[0];
1698     facel            = _mm256_set1_pd(fr->epsfac);
1699     charge           = mdatoms->chargeA;
1700     nvdwtype         = fr->ntype;
1701     vdwparam         = fr->nbfp;
1702     vdwtype          = mdatoms->typeA;
1703     vdwgridparam     = fr->ljpme_c6grid;
1704     sh_lj_ewald      = _mm256_set1_pd(fr->ic->sh_lj_ewald);
1705     ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
1706     ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
1707
1708     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
1709     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1710     beta2            = _mm256_mul_pd(beta,beta);
1711     beta3            = _mm256_mul_pd(beta,beta2);
1712
1713     ewtab            = fr->ic->tabq_coul_F;
1714     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
1715     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1716
1717     /* Setup water-specific parameters */
1718     inr              = nlist->iinr[0];
1719     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1720     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1721     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1722     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1723     vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
1724
1725     jq1              = _mm256_set1_pd(charge[inr+1]);
1726     jq2              = _mm256_set1_pd(charge[inr+2]);
1727     jq3              = _mm256_set1_pd(charge[inr+3]);
1728     vdwjidx0A        = 2*vdwtype[inr+0];
1729     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1730     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1731     c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
1732     qq11             = _mm256_mul_pd(iq1,jq1);
1733     qq12             = _mm256_mul_pd(iq1,jq2);
1734     qq13             = _mm256_mul_pd(iq1,jq3);
1735     qq21             = _mm256_mul_pd(iq2,jq1);
1736     qq22             = _mm256_mul_pd(iq2,jq2);
1737     qq23             = _mm256_mul_pd(iq2,jq3);
1738     qq31             = _mm256_mul_pd(iq3,jq1);
1739     qq32             = _mm256_mul_pd(iq3,jq2);
1740     qq33             = _mm256_mul_pd(iq3,jq3);
1741
1742     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1743     rcutoff_scalar   = fr->rcoulomb;
1744     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1745     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1746
1747     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
1748     rvdw             = _mm256_set1_pd(fr->rvdw);
1749
1750     /* Avoid stupid compiler warnings */
1751     jnrA = jnrB = jnrC = jnrD = 0;
1752     j_coord_offsetA = 0;
1753     j_coord_offsetB = 0;
1754     j_coord_offsetC = 0;
1755     j_coord_offsetD = 0;
1756
1757     outeriter        = 0;
1758     inneriter        = 0;
1759
1760     for(iidx=0;iidx<4*DIM;iidx++)
1761     {
1762         scratch[iidx] = 0.0;
1763     }
1764
1765     /* Start outer loop over neighborlists */
1766     for(iidx=0; iidx<nri; iidx++)
1767     {
1768         /* Load shift vector for this list */
1769         i_shift_offset   = DIM*shiftidx[iidx];
1770
1771         /* Load limits for loop over neighbors */
1772         j_index_start    = jindex[iidx];
1773         j_index_end      = jindex[iidx+1];
1774
1775         /* Get outer coordinate index */
1776         inr              = iinr[iidx];
1777         i_coord_offset   = DIM*inr;
1778
1779         /* Load i particle coords and add shift vector */
1780         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1781                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1782
1783         fix0             = _mm256_setzero_pd();
1784         fiy0             = _mm256_setzero_pd();
1785         fiz0             = _mm256_setzero_pd();
1786         fix1             = _mm256_setzero_pd();
1787         fiy1             = _mm256_setzero_pd();
1788         fiz1             = _mm256_setzero_pd();
1789         fix2             = _mm256_setzero_pd();
1790         fiy2             = _mm256_setzero_pd();
1791         fiz2             = _mm256_setzero_pd();
1792         fix3             = _mm256_setzero_pd();
1793         fiy3             = _mm256_setzero_pd();
1794         fiz3             = _mm256_setzero_pd();
1795
1796         /* Start inner kernel loop */
1797         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1798         {
1799
1800             /* Get j neighbor index, and coordinate index */
1801             jnrA             = jjnr[jidx];
1802             jnrB             = jjnr[jidx+1];
1803             jnrC             = jjnr[jidx+2];
1804             jnrD             = jjnr[jidx+3];
1805             j_coord_offsetA  = DIM*jnrA;
1806             j_coord_offsetB  = DIM*jnrB;
1807             j_coord_offsetC  = DIM*jnrC;
1808             j_coord_offsetD  = DIM*jnrD;
1809
1810             /* load j atom coordinates */
1811             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1812                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1813                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1814                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1815
1816             /* Calculate displacement vector */
1817             dx00             = _mm256_sub_pd(ix0,jx0);
1818             dy00             = _mm256_sub_pd(iy0,jy0);
1819             dz00             = _mm256_sub_pd(iz0,jz0);
1820             dx11             = _mm256_sub_pd(ix1,jx1);
1821             dy11             = _mm256_sub_pd(iy1,jy1);
1822             dz11             = _mm256_sub_pd(iz1,jz1);
1823             dx12             = _mm256_sub_pd(ix1,jx2);
1824             dy12             = _mm256_sub_pd(iy1,jy2);
1825             dz12             = _mm256_sub_pd(iz1,jz2);
1826             dx13             = _mm256_sub_pd(ix1,jx3);
1827             dy13             = _mm256_sub_pd(iy1,jy3);
1828             dz13             = _mm256_sub_pd(iz1,jz3);
1829             dx21             = _mm256_sub_pd(ix2,jx1);
1830             dy21             = _mm256_sub_pd(iy2,jy1);
1831             dz21             = _mm256_sub_pd(iz2,jz1);
1832             dx22             = _mm256_sub_pd(ix2,jx2);
1833             dy22             = _mm256_sub_pd(iy2,jy2);
1834             dz22             = _mm256_sub_pd(iz2,jz2);
1835             dx23             = _mm256_sub_pd(ix2,jx3);
1836             dy23             = _mm256_sub_pd(iy2,jy3);
1837             dz23             = _mm256_sub_pd(iz2,jz3);
1838             dx31             = _mm256_sub_pd(ix3,jx1);
1839             dy31             = _mm256_sub_pd(iy3,jy1);
1840             dz31             = _mm256_sub_pd(iz3,jz1);
1841             dx32             = _mm256_sub_pd(ix3,jx2);
1842             dy32             = _mm256_sub_pd(iy3,jy2);
1843             dz32             = _mm256_sub_pd(iz3,jz2);
1844             dx33             = _mm256_sub_pd(ix3,jx3);
1845             dy33             = _mm256_sub_pd(iy3,jy3);
1846             dz33             = _mm256_sub_pd(iz3,jz3);
1847
1848             /* Calculate squared distance and things based on it */
1849             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1850             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1851             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1852             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1853             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1854             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1855             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1856             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1857             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1858             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1859
1860             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1861             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1862             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1863             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
1864             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1865             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1866             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
1867             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
1868             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
1869             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
1870
1871             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1872             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1873             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1874             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1875             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1876             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1877             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1878             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1879             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1880             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
1881
1882             fjx0             = _mm256_setzero_pd();
1883             fjy0             = _mm256_setzero_pd();
1884             fjz0             = _mm256_setzero_pd();
1885             fjx1             = _mm256_setzero_pd();
1886             fjy1             = _mm256_setzero_pd();
1887             fjz1             = _mm256_setzero_pd();
1888             fjx2             = _mm256_setzero_pd();
1889             fjy2             = _mm256_setzero_pd();
1890             fjz2             = _mm256_setzero_pd();
1891             fjx3             = _mm256_setzero_pd();
1892             fjy3             = _mm256_setzero_pd();
1893             fjz3             = _mm256_setzero_pd();
1894
1895             /**************************
1896              * CALCULATE INTERACTIONS *
1897              **************************/
1898
1899             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1900             {
1901
1902             r00              = _mm256_mul_pd(rsq00,rinv00);
1903
1904             /* Analytical LJ-PME */
1905             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1906             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
1907             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
1908             exponent         = gmx_simd_exp_d(ewcljrsq);
1909             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1910             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1911             /* f6A = 6 * C6grid * (1 - poly) */
1912             f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
1913             /* f6B = C6grid * exponent * beta^6 */
1914             f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
1915             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1916             fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1917
1918             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1919
1920             fscal            = fvdw;
1921
1922             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1923
1924             /* Calculate temporary vectorial force */
1925             tx               = _mm256_mul_pd(fscal,dx00);
1926             ty               = _mm256_mul_pd(fscal,dy00);
1927             tz               = _mm256_mul_pd(fscal,dz00);
1928
1929             /* Update vectorial force */
1930             fix0             = _mm256_add_pd(fix0,tx);
1931             fiy0             = _mm256_add_pd(fiy0,ty);
1932             fiz0             = _mm256_add_pd(fiz0,tz);
1933
1934             fjx0             = _mm256_add_pd(fjx0,tx);
1935             fjy0             = _mm256_add_pd(fjy0,ty);
1936             fjz0             = _mm256_add_pd(fjz0,tz);
1937
1938             }
1939
1940             /**************************
1941              * CALCULATE INTERACTIONS *
1942              **************************/
1943
1944             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1945             {
1946
1947             r11              = _mm256_mul_pd(rsq11,rinv11);
1948
1949             /* EWALD ELECTROSTATICS */
1950
1951             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1952             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1953             ewitab           = _mm256_cvttpd_epi32(ewrt);
1954             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1955             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1956                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1957                                             &ewtabF,&ewtabFn);
1958             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1959             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1960
1961             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1962
1963             fscal            = felec;
1964
1965             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1966
1967             /* Calculate temporary vectorial force */
1968             tx               = _mm256_mul_pd(fscal,dx11);
1969             ty               = _mm256_mul_pd(fscal,dy11);
1970             tz               = _mm256_mul_pd(fscal,dz11);
1971
1972             /* Update vectorial force */
1973             fix1             = _mm256_add_pd(fix1,tx);
1974             fiy1             = _mm256_add_pd(fiy1,ty);
1975             fiz1             = _mm256_add_pd(fiz1,tz);
1976
1977             fjx1             = _mm256_add_pd(fjx1,tx);
1978             fjy1             = _mm256_add_pd(fjy1,ty);
1979             fjz1             = _mm256_add_pd(fjz1,tz);
1980
1981             }
1982
1983             /**************************
1984              * CALCULATE INTERACTIONS *
1985              **************************/
1986
1987             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1988             {
1989
1990             r12              = _mm256_mul_pd(rsq12,rinv12);
1991
1992             /* EWALD ELECTROSTATICS */
1993
1994             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1995             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1996             ewitab           = _mm256_cvttpd_epi32(ewrt);
1997             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1998             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1999                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2000                                             &ewtabF,&ewtabFn);
2001             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2002             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2003
2004             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2005
2006             fscal            = felec;
2007
2008             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2009
2010             /* Calculate temporary vectorial force */
2011             tx               = _mm256_mul_pd(fscal,dx12);
2012             ty               = _mm256_mul_pd(fscal,dy12);
2013             tz               = _mm256_mul_pd(fscal,dz12);
2014
2015             /* Update vectorial force */
2016             fix1             = _mm256_add_pd(fix1,tx);
2017             fiy1             = _mm256_add_pd(fiy1,ty);
2018             fiz1             = _mm256_add_pd(fiz1,tz);
2019
2020             fjx2             = _mm256_add_pd(fjx2,tx);
2021             fjy2             = _mm256_add_pd(fjy2,ty);
2022             fjz2             = _mm256_add_pd(fjz2,tz);
2023
2024             }
2025
2026             /**************************
2027              * CALCULATE INTERACTIONS *
2028              **************************/
2029
2030             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2031             {
2032
2033             r13              = _mm256_mul_pd(rsq13,rinv13);
2034
2035             /* EWALD ELECTROSTATICS */
2036
2037             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2038             ewrt             = _mm256_mul_pd(r13,ewtabscale);
2039             ewitab           = _mm256_cvttpd_epi32(ewrt);
2040             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2041             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2042                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2043                                             &ewtabF,&ewtabFn);
2044             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2045             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2046
2047             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2048
2049             fscal            = felec;
2050
2051             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2052
2053             /* Calculate temporary vectorial force */
2054             tx               = _mm256_mul_pd(fscal,dx13);
2055             ty               = _mm256_mul_pd(fscal,dy13);
2056             tz               = _mm256_mul_pd(fscal,dz13);
2057
2058             /* Update vectorial force */
2059             fix1             = _mm256_add_pd(fix1,tx);
2060             fiy1             = _mm256_add_pd(fiy1,ty);
2061             fiz1             = _mm256_add_pd(fiz1,tz);
2062
2063             fjx3             = _mm256_add_pd(fjx3,tx);
2064             fjy3             = _mm256_add_pd(fjy3,ty);
2065             fjz3             = _mm256_add_pd(fjz3,tz);
2066
2067             }
2068
2069             /**************************
2070              * CALCULATE INTERACTIONS *
2071              **************************/
2072
2073             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2074             {
2075
2076             r21              = _mm256_mul_pd(rsq21,rinv21);
2077
2078             /* EWALD ELECTROSTATICS */
2079
2080             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2081             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2082             ewitab           = _mm256_cvttpd_epi32(ewrt);
2083             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2084             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2085                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2086                                             &ewtabF,&ewtabFn);
2087             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2088             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2089
2090             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2091
2092             fscal            = felec;
2093
2094             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2095
2096             /* Calculate temporary vectorial force */
2097             tx               = _mm256_mul_pd(fscal,dx21);
2098             ty               = _mm256_mul_pd(fscal,dy21);
2099             tz               = _mm256_mul_pd(fscal,dz21);
2100
2101             /* Update vectorial force */
2102             fix2             = _mm256_add_pd(fix2,tx);
2103             fiy2             = _mm256_add_pd(fiy2,ty);
2104             fiz2             = _mm256_add_pd(fiz2,tz);
2105
2106             fjx1             = _mm256_add_pd(fjx1,tx);
2107             fjy1             = _mm256_add_pd(fjy1,ty);
2108             fjz1             = _mm256_add_pd(fjz1,tz);
2109
2110             }
2111
2112             /**************************
2113              * CALCULATE INTERACTIONS *
2114              **************************/
2115
2116             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2117             {
2118
2119             r22              = _mm256_mul_pd(rsq22,rinv22);
2120
2121             /* EWALD ELECTROSTATICS */
2122
2123             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2124             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2125             ewitab           = _mm256_cvttpd_epi32(ewrt);
2126             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2127             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2128                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2129                                             &ewtabF,&ewtabFn);
2130             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2131             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2132
2133             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2134
2135             fscal            = felec;
2136
2137             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2138
2139             /* Calculate temporary vectorial force */
2140             tx               = _mm256_mul_pd(fscal,dx22);
2141             ty               = _mm256_mul_pd(fscal,dy22);
2142             tz               = _mm256_mul_pd(fscal,dz22);
2143
2144             /* Update vectorial force */
2145             fix2             = _mm256_add_pd(fix2,tx);
2146             fiy2             = _mm256_add_pd(fiy2,ty);
2147             fiz2             = _mm256_add_pd(fiz2,tz);
2148
2149             fjx2             = _mm256_add_pd(fjx2,tx);
2150             fjy2             = _mm256_add_pd(fjy2,ty);
2151             fjz2             = _mm256_add_pd(fjz2,tz);
2152
2153             }
2154
2155             /**************************
2156              * CALCULATE INTERACTIONS *
2157              **************************/
2158
2159             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2160             {
2161
2162             r23              = _mm256_mul_pd(rsq23,rinv23);
2163
2164             /* EWALD ELECTROSTATICS */
2165
2166             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2167             ewrt             = _mm256_mul_pd(r23,ewtabscale);
2168             ewitab           = _mm256_cvttpd_epi32(ewrt);
2169             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2170             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2171                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2172                                             &ewtabF,&ewtabFn);
2173             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2174             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2175
2176             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2177
2178             fscal            = felec;
2179
2180             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2181
2182             /* Calculate temporary vectorial force */
2183             tx               = _mm256_mul_pd(fscal,dx23);
2184             ty               = _mm256_mul_pd(fscal,dy23);
2185             tz               = _mm256_mul_pd(fscal,dz23);
2186
2187             /* Update vectorial force */
2188             fix2             = _mm256_add_pd(fix2,tx);
2189             fiy2             = _mm256_add_pd(fiy2,ty);
2190             fiz2             = _mm256_add_pd(fiz2,tz);
2191
2192             fjx3             = _mm256_add_pd(fjx3,tx);
2193             fjy3             = _mm256_add_pd(fjy3,ty);
2194             fjz3             = _mm256_add_pd(fjz3,tz);
2195
2196             }
2197
2198             /**************************
2199              * CALCULATE INTERACTIONS *
2200              **************************/
2201
2202             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2203             {
2204
2205             r31              = _mm256_mul_pd(rsq31,rinv31);
2206
2207             /* EWALD ELECTROSTATICS */
2208
2209             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2210             ewrt             = _mm256_mul_pd(r31,ewtabscale);
2211             ewitab           = _mm256_cvttpd_epi32(ewrt);
2212             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2213             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2214                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2215                                             &ewtabF,&ewtabFn);
2216             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2217             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2218
2219             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2220
2221             fscal            = felec;
2222
2223             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2224
2225             /* Calculate temporary vectorial force */
2226             tx               = _mm256_mul_pd(fscal,dx31);
2227             ty               = _mm256_mul_pd(fscal,dy31);
2228             tz               = _mm256_mul_pd(fscal,dz31);
2229
2230             /* Update vectorial force */
2231             fix3             = _mm256_add_pd(fix3,tx);
2232             fiy3             = _mm256_add_pd(fiy3,ty);
2233             fiz3             = _mm256_add_pd(fiz3,tz);
2234
2235             fjx1             = _mm256_add_pd(fjx1,tx);
2236             fjy1             = _mm256_add_pd(fjy1,ty);
2237             fjz1             = _mm256_add_pd(fjz1,tz);
2238
2239             }
2240
2241             /**************************
2242              * CALCULATE INTERACTIONS *
2243              **************************/
2244
2245             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2246             {
2247
2248             r32              = _mm256_mul_pd(rsq32,rinv32);
2249
2250             /* EWALD ELECTROSTATICS */
2251
2252             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2253             ewrt             = _mm256_mul_pd(r32,ewtabscale);
2254             ewitab           = _mm256_cvttpd_epi32(ewrt);
2255             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2256             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2257                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2258                                             &ewtabF,&ewtabFn);
2259             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2260             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2261
2262             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2263
2264             fscal            = felec;
2265
2266             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2267
2268             /* Calculate temporary vectorial force */
2269             tx               = _mm256_mul_pd(fscal,dx32);
2270             ty               = _mm256_mul_pd(fscal,dy32);
2271             tz               = _mm256_mul_pd(fscal,dz32);
2272
2273             /* Update vectorial force */
2274             fix3             = _mm256_add_pd(fix3,tx);
2275             fiy3             = _mm256_add_pd(fiy3,ty);
2276             fiz3             = _mm256_add_pd(fiz3,tz);
2277
2278             fjx2             = _mm256_add_pd(fjx2,tx);
2279             fjy2             = _mm256_add_pd(fjy2,ty);
2280             fjz2             = _mm256_add_pd(fjz2,tz);
2281
2282             }
2283
2284             /**************************
2285              * CALCULATE INTERACTIONS *
2286              **************************/
2287
2288             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2289             {
2290
2291             r33              = _mm256_mul_pd(rsq33,rinv33);
2292
2293             /* EWALD ELECTROSTATICS */
2294
2295             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2296             ewrt             = _mm256_mul_pd(r33,ewtabscale);
2297             ewitab           = _mm256_cvttpd_epi32(ewrt);
2298             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2299             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2300                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2301                                             &ewtabF,&ewtabFn);
2302             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2303             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2304
2305             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2306
2307             fscal            = felec;
2308
2309             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2310
2311             /* Calculate temporary vectorial force */
2312             tx               = _mm256_mul_pd(fscal,dx33);
2313             ty               = _mm256_mul_pd(fscal,dy33);
2314             tz               = _mm256_mul_pd(fscal,dz33);
2315
2316             /* Update vectorial force */
2317             fix3             = _mm256_add_pd(fix3,tx);
2318             fiy3             = _mm256_add_pd(fiy3,ty);
2319             fiz3             = _mm256_add_pd(fiz3,tz);
2320
2321             fjx3             = _mm256_add_pd(fjx3,tx);
2322             fjy3             = _mm256_add_pd(fjy3,ty);
2323             fjz3             = _mm256_add_pd(fjz3,tz);
2324
2325             }
2326
2327             fjptrA             = f+j_coord_offsetA;
2328             fjptrB             = f+j_coord_offsetB;
2329             fjptrC             = f+j_coord_offsetC;
2330             fjptrD             = f+j_coord_offsetD;
2331
2332             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2333                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2334                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2335
2336             /* Inner loop uses 403 flops */
2337         }
2338
2339         if(jidx<j_index_end)
2340         {
2341
2342             /* Get j neighbor index, and coordinate index */
2343             jnrlistA         = jjnr[jidx];
2344             jnrlistB         = jjnr[jidx+1];
2345             jnrlistC         = jjnr[jidx+2];
2346             jnrlistD         = jjnr[jidx+3];
2347             /* Sign of each element will be negative for non-real atoms.
2348              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2349              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2350              */
2351             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2352
2353             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2354             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2355             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2356
2357             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2358             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2359             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2360             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2361             j_coord_offsetA  = DIM*jnrA;
2362             j_coord_offsetB  = DIM*jnrB;
2363             j_coord_offsetC  = DIM*jnrC;
2364             j_coord_offsetD  = DIM*jnrD;
2365
2366             /* load j atom coordinates */
2367             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2368                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2369                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2370                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
2371
2372             /* Calculate displacement vector */
2373             dx00             = _mm256_sub_pd(ix0,jx0);
2374             dy00             = _mm256_sub_pd(iy0,jy0);
2375             dz00             = _mm256_sub_pd(iz0,jz0);
2376             dx11             = _mm256_sub_pd(ix1,jx1);
2377             dy11             = _mm256_sub_pd(iy1,jy1);
2378             dz11             = _mm256_sub_pd(iz1,jz1);
2379             dx12             = _mm256_sub_pd(ix1,jx2);
2380             dy12             = _mm256_sub_pd(iy1,jy2);
2381             dz12             = _mm256_sub_pd(iz1,jz2);
2382             dx13             = _mm256_sub_pd(ix1,jx3);
2383             dy13             = _mm256_sub_pd(iy1,jy3);
2384             dz13             = _mm256_sub_pd(iz1,jz3);
2385             dx21             = _mm256_sub_pd(ix2,jx1);
2386             dy21             = _mm256_sub_pd(iy2,jy1);
2387             dz21             = _mm256_sub_pd(iz2,jz1);
2388             dx22             = _mm256_sub_pd(ix2,jx2);
2389             dy22             = _mm256_sub_pd(iy2,jy2);
2390             dz22             = _mm256_sub_pd(iz2,jz2);
2391             dx23             = _mm256_sub_pd(ix2,jx3);
2392             dy23             = _mm256_sub_pd(iy2,jy3);
2393             dz23             = _mm256_sub_pd(iz2,jz3);
2394             dx31             = _mm256_sub_pd(ix3,jx1);
2395             dy31             = _mm256_sub_pd(iy3,jy1);
2396             dz31             = _mm256_sub_pd(iz3,jz1);
2397             dx32             = _mm256_sub_pd(ix3,jx2);
2398             dy32             = _mm256_sub_pd(iy3,jy2);
2399             dz32             = _mm256_sub_pd(iz3,jz2);
2400             dx33             = _mm256_sub_pd(ix3,jx3);
2401             dy33             = _mm256_sub_pd(iy3,jy3);
2402             dz33             = _mm256_sub_pd(iz3,jz3);
2403
2404             /* Calculate squared distance and things based on it */
2405             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2406             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2407             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2408             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2409             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2410             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2411             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2412             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2413             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2414             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2415
2416             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
2417             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
2418             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
2419             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
2420             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
2421             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
2422             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
2423             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
2424             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
2425             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
2426
2427             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
2428             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
2429             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
2430             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
2431             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
2432             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
2433             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
2434             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
2435             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
2436             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
2437
2438             fjx0             = _mm256_setzero_pd();
2439             fjy0             = _mm256_setzero_pd();
2440             fjz0             = _mm256_setzero_pd();
2441             fjx1             = _mm256_setzero_pd();
2442             fjy1             = _mm256_setzero_pd();
2443             fjz1             = _mm256_setzero_pd();
2444             fjx2             = _mm256_setzero_pd();
2445             fjy2             = _mm256_setzero_pd();
2446             fjz2             = _mm256_setzero_pd();
2447             fjx3             = _mm256_setzero_pd();
2448             fjy3             = _mm256_setzero_pd();
2449             fjz3             = _mm256_setzero_pd();
2450
2451             /**************************
2452              * CALCULATE INTERACTIONS *
2453              **************************/
2454
2455             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2456             {
2457
2458             r00              = _mm256_mul_pd(rsq00,rinv00);
2459             r00              = _mm256_andnot_pd(dummy_mask,r00);
2460
2461             /* Analytical LJ-PME */
2462             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2463             ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
2464             ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
2465             exponent         = gmx_simd_exp_d(ewcljrsq);
2466             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2467             poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
2468             /* f6A = 6 * C6grid * (1 - poly) */
2469             f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
2470             /* f6B = C6grid * exponent * beta^6 */
2471             f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
2472             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2473             fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2474
2475             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2476
2477             fscal            = fvdw;
2478
2479             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2480
2481             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2482
2483             /* Calculate temporary vectorial force */
2484             tx               = _mm256_mul_pd(fscal,dx00);
2485             ty               = _mm256_mul_pd(fscal,dy00);
2486             tz               = _mm256_mul_pd(fscal,dz00);
2487
2488             /* Update vectorial force */
2489             fix0             = _mm256_add_pd(fix0,tx);
2490             fiy0             = _mm256_add_pd(fiy0,ty);
2491             fiz0             = _mm256_add_pd(fiz0,tz);
2492
2493             fjx0             = _mm256_add_pd(fjx0,tx);
2494             fjy0             = _mm256_add_pd(fjy0,ty);
2495             fjz0             = _mm256_add_pd(fjz0,tz);
2496
2497             }
2498
2499             /**************************
2500              * CALCULATE INTERACTIONS *
2501              **************************/
2502
2503             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2504             {
2505
2506             r11              = _mm256_mul_pd(rsq11,rinv11);
2507             r11              = _mm256_andnot_pd(dummy_mask,r11);
2508
2509             /* EWALD ELECTROSTATICS */
2510
2511             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2512             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2513             ewitab           = _mm256_cvttpd_epi32(ewrt);
2514             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2515             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2516                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2517                                             &ewtabF,&ewtabFn);
2518             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2519             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2520
2521             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2522
2523             fscal            = felec;
2524
2525             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2526
2527             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2528
2529             /* Calculate temporary vectorial force */
2530             tx               = _mm256_mul_pd(fscal,dx11);
2531             ty               = _mm256_mul_pd(fscal,dy11);
2532             tz               = _mm256_mul_pd(fscal,dz11);
2533
2534             /* Update vectorial force */
2535             fix1             = _mm256_add_pd(fix1,tx);
2536             fiy1             = _mm256_add_pd(fiy1,ty);
2537             fiz1             = _mm256_add_pd(fiz1,tz);
2538
2539             fjx1             = _mm256_add_pd(fjx1,tx);
2540             fjy1             = _mm256_add_pd(fjy1,ty);
2541             fjz1             = _mm256_add_pd(fjz1,tz);
2542
2543             }
2544
2545             /**************************
2546              * CALCULATE INTERACTIONS *
2547              **************************/
2548
2549             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2550             {
2551
2552             r12              = _mm256_mul_pd(rsq12,rinv12);
2553             r12              = _mm256_andnot_pd(dummy_mask,r12);
2554
2555             /* EWALD ELECTROSTATICS */
2556
2557             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2558             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2559             ewitab           = _mm256_cvttpd_epi32(ewrt);
2560             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2561             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2562                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2563                                             &ewtabF,&ewtabFn);
2564             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2565             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2566
2567             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2568
2569             fscal            = felec;
2570
2571             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2572
2573             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2574
2575             /* Calculate temporary vectorial force */
2576             tx               = _mm256_mul_pd(fscal,dx12);
2577             ty               = _mm256_mul_pd(fscal,dy12);
2578             tz               = _mm256_mul_pd(fscal,dz12);
2579
2580             /* Update vectorial force */
2581             fix1             = _mm256_add_pd(fix1,tx);
2582             fiy1             = _mm256_add_pd(fiy1,ty);
2583             fiz1             = _mm256_add_pd(fiz1,tz);
2584
2585             fjx2             = _mm256_add_pd(fjx2,tx);
2586             fjy2             = _mm256_add_pd(fjy2,ty);
2587             fjz2             = _mm256_add_pd(fjz2,tz);
2588
2589             }
2590
2591             /**************************
2592              * CALCULATE INTERACTIONS *
2593              **************************/
2594
2595             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2596             {
2597
2598             r13              = _mm256_mul_pd(rsq13,rinv13);
2599             r13              = _mm256_andnot_pd(dummy_mask,r13);
2600
2601             /* EWALD ELECTROSTATICS */
2602
2603             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2604             ewrt             = _mm256_mul_pd(r13,ewtabscale);
2605             ewitab           = _mm256_cvttpd_epi32(ewrt);
2606             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2607             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2608                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2609                                             &ewtabF,&ewtabFn);
2610             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2611             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2612
2613             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2614
2615             fscal            = felec;
2616
2617             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2618
2619             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2620
2621             /* Calculate temporary vectorial force */
2622             tx               = _mm256_mul_pd(fscal,dx13);
2623             ty               = _mm256_mul_pd(fscal,dy13);
2624             tz               = _mm256_mul_pd(fscal,dz13);
2625
2626             /* Update vectorial force */
2627             fix1             = _mm256_add_pd(fix1,tx);
2628             fiy1             = _mm256_add_pd(fiy1,ty);
2629             fiz1             = _mm256_add_pd(fiz1,tz);
2630
2631             fjx3             = _mm256_add_pd(fjx3,tx);
2632             fjy3             = _mm256_add_pd(fjy3,ty);
2633             fjz3             = _mm256_add_pd(fjz3,tz);
2634
2635             }
2636
2637             /**************************
2638              * CALCULATE INTERACTIONS *
2639              **************************/
2640
2641             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2642             {
2643
2644             r21              = _mm256_mul_pd(rsq21,rinv21);
2645             r21              = _mm256_andnot_pd(dummy_mask,r21);
2646
2647             /* EWALD ELECTROSTATICS */
2648
2649             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2650             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2651             ewitab           = _mm256_cvttpd_epi32(ewrt);
2652             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2653             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2654                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2655                                             &ewtabF,&ewtabFn);
2656             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2657             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2658
2659             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2660
2661             fscal            = felec;
2662
2663             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2664
2665             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2666
2667             /* Calculate temporary vectorial force */
2668             tx               = _mm256_mul_pd(fscal,dx21);
2669             ty               = _mm256_mul_pd(fscal,dy21);
2670             tz               = _mm256_mul_pd(fscal,dz21);
2671
2672             /* Update vectorial force */
2673             fix2             = _mm256_add_pd(fix2,tx);
2674             fiy2             = _mm256_add_pd(fiy2,ty);
2675             fiz2             = _mm256_add_pd(fiz2,tz);
2676
2677             fjx1             = _mm256_add_pd(fjx1,tx);
2678             fjy1             = _mm256_add_pd(fjy1,ty);
2679             fjz1             = _mm256_add_pd(fjz1,tz);
2680
2681             }
2682
2683             /**************************
2684              * CALCULATE INTERACTIONS *
2685              **************************/
2686
2687             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2688             {
2689
2690             r22              = _mm256_mul_pd(rsq22,rinv22);
2691             r22              = _mm256_andnot_pd(dummy_mask,r22);
2692
2693             /* EWALD ELECTROSTATICS */
2694
2695             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2696             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2697             ewitab           = _mm256_cvttpd_epi32(ewrt);
2698             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2699             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2700                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2701                                             &ewtabF,&ewtabFn);
2702             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2703             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2704
2705             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2706
2707             fscal            = felec;
2708
2709             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2710
2711             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2712
2713             /* Calculate temporary vectorial force */
2714             tx               = _mm256_mul_pd(fscal,dx22);
2715             ty               = _mm256_mul_pd(fscal,dy22);
2716             tz               = _mm256_mul_pd(fscal,dz22);
2717
2718             /* Update vectorial force */
2719             fix2             = _mm256_add_pd(fix2,tx);
2720             fiy2             = _mm256_add_pd(fiy2,ty);
2721             fiz2             = _mm256_add_pd(fiz2,tz);
2722
2723             fjx2             = _mm256_add_pd(fjx2,tx);
2724             fjy2             = _mm256_add_pd(fjy2,ty);
2725             fjz2             = _mm256_add_pd(fjz2,tz);
2726
2727             }
2728
2729             /**************************
2730              * CALCULATE INTERACTIONS *
2731              **************************/
2732
2733             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2734             {
2735
2736             r23              = _mm256_mul_pd(rsq23,rinv23);
2737             r23              = _mm256_andnot_pd(dummy_mask,r23);
2738
2739             /* EWALD ELECTROSTATICS */
2740
2741             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2742             ewrt             = _mm256_mul_pd(r23,ewtabscale);
2743             ewitab           = _mm256_cvttpd_epi32(ewrt);
2744             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2745             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2746                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2747                                             &ewtabF,&ewtabFn);
2748             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2749             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2750
2751             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2752
2753             fscal            = felec;
2754
2755             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2756
2757             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2758
2759             /* Calculate temporary vectorial force */
2760             tx               = _mm256_mul_pd(fscal,dx23);
2761             ty               = _mm256_mul_pd(fscal,dy23);
2762             tz               = _mm256_mul_pd(fscal,dz23);
2763
2764             /* Update vectorial force */
2765             fix2             = _mm256_add_pd(fix2,tx);
2766             fiy2             = _mm256_add_pd(fiy2,ty);
2767             fiz2             = _mm256_add_pd(fiz2,tz);
2768
2769             fjx3             = _mm256_add_pd(fjx3,tx);
2770             fjy3             = _mm256_add_pd(fjy3,ty);
2771             fjz3             = _mm256_add_pd(fjz3,tz);
2772
2773             }
2774
2775             /**************************
2776              * CALCULATE INTERACTIONS *
2777              **************************/
2778
2779             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2780             {
2781
2782             r31              = _mm256_mul_pd(rsq31,rinv31);
2783             r31              = _mm256_andnot_pd(dummy_mask,r31);
2784
2785             /* EWALD ELECTROSTATICS */
2786
2787             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2788             ewrt             = _mm256_mul_pd(r31,ewtabscale);
2789             ewitab           = _mm256_cvttpd_epi32(ewrt);
2790             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2791             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2792                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2793                                             &ewtabF,&ewtabFn);
2794             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2795             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2796
2797             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2798
2799             fscal            = felec;
2800
2801             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2802
2803             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2804
2805             /* Calculate temporary vectorial force */
2806             tx               = _mm256_mul_pd(fscal,dx31);
2807             ty               = _mm256_mul_pd(fscal,dy31);
2808             tz               = _mm256_mul_pd(fscal,dz31);
2809
2810             /* Update vectorial force */
2811             fix3             = _mm256_add_pd(fix3,tx);
2812             fiy3             = _mm256_add_pd(fiy3,ty);
2813             fiz3             = _mm256_add_pd(fiz3,tz);
2814
2815             fjx1             = _mm256_add_pd(fjx1,tx);
2816             fjy1             = _mm256_add_pd(fjy1,ty);
2817             fjz1             = _mm256_add_pd(fjz1,tz);
2818
2819             }
2820
2821             /**************************
2822              * CALCULATE INTERACTIONS *
2823              **************************/
2824
2825             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2826             {
2827
2828             r32              = _mm256_mul_pd(rsq32,rinv32);
2829             r32              = _mm256_andnot_pd(dummy_mask,r32);
2830
2831             /* EWALD ELECTROSTATICS */
2832
2833             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2834             ewrt             = _mm256_mul_pd(r32,ewtabscale);
2835             ewitab           = _mm256_cvttpd_epi32(ewrt);
2836             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2837             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2838                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2839                                             &ewtabF,&ewtabFn);
2840             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2841             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2842
2843             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2844
2845             fscal            = felec;
2846
2847             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2848
2849             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2850
2851             /* Calculate temporary vectorial force */
2852             tx               = _mm256_mul_pd(fscal,dx32);
2853             ty               = _mm256_mul_pd(fscal,dy32);
2854             tz               = _mm256_mul_pd(fscal,dz32);
2855
2856             /* Update vectorial force */
2857             fix3             = _mm256_add_pd(fix3,tx);
2858             fiy3             = _mm256_add_pd(fiy3,ty);
2859             fiz3             = _mm256_add_pd(fiz3,tz);
2860
2861             fjx2             = _mm256_add_pd(fjx2,tx);
2862             fjy2             = _mm256_add_pd(fjy2,ty);
2863             fjz2             = _mm256_add_pd(fjz2,tz);
2864
2865             }
2866
2867             /**************************
2868              * CALCULATE INTERACTIONS *
2869              **************************/
2870
2871             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2872             {
2873
2874             r33              = _mm256_mul_pd(rsq33,rinv33);
2875             r33              = _mm256_andnot_pd(dummy_mask,r33);
2876
2877             /* EWALD ELECTROSTATICS */
2878
2879             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2880             ewrt             = _mm256_mul_pd(r33,ewtabscale);
2881             ewitab           = _mm256_cvttpd_epi32(ewrt);
2882             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2883             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2884                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2885                                             &ewtabF,&ewtabFn);
2886             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2887             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2888
2889             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2890
2891             fscal            = felec;
2892
2893             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2894
2895             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2896
2897             /* Calculate temporary vectorial force */
2898             tx               = _mm256_mul_pd(fscal,dx33);
2899             ty               = _mm256_mul_pd(fscal,dy33);
2900             tz               = _mm256_mul_pd(fscal,dz33);
2901
2902             /* Update vectorial force */
2903             fix3             = _mm256_add_pd(fix3,tx);
2904             fiy3             = _mm256_add_pd(fiy3,ty);
2905             fiz3             = _mm256_add_pd(fiz3,tz);
2906
2907             fjx3             = _mm256_add_pd(fjx3,tx);
2908             fjy3             = _mm256_add_pd(fjy3,ty);
2909             fjz3             = _mm256_add_pd(fjz3,tz);
2910
2911             }
2912
2913             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2914             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2915             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2916             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2917
2918             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2919                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2920                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2921
2922             /* Inner loop uses 413 flops */
2923         }
2924
2925         /* End of innermost loop */
2926
2927         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2928                                                  f+i_coord_offset,fshift+i_shift_offset);
2929
2930         /* Increment number of inner iterations */
2931         inneriter                  += j_index_end - j_index_start;
2932
2933         /* Outer loop uses 24 flops */
2934     }
2935
2936     /* Increment number of outer iterations */
2937     outeriter        += nri;
2938
2939     /* Update outer/inner flops */
2940
2941     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*413);
2942 }