src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_double
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
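    /* For example, dx12 below is ix1-jx2: the x-component of the distance between
     * site 1 of the i water and site 2 of the j water, evaluated for the four
     * j waters A,B,C,D in the four SIMD lanes.
     */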
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     real *           vdwioffsetptr0;
84     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     real *           vdwioffsetptr1;
86     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     real *           vdwioffsetptr2;
88     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     real *           vdwioffsetptr3;
90     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
110     real             *charge;
111     int              nvdwtype;
112     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113     int              *vdwtype;
114     real             *vdwparam;
115     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
116     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
117     __m128i          ewitab;
118     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
119     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
120     real             *ewtab;
121     __m256d          dummy_mask,cutoff_mask;
122     __m128           tmpmask0,tmpmask1;
123     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
124     __m256d          one     = _mm256_set1_pd(1.0);
125     __m256d          two     = _mm256_set1_pd(2.0);
126     x                = xx[0];
127     f                = ff[0];
128
129     nri              = nlist->nri;
130     iinr             = nlist->iinr;
131     jindex           = nlist->jindex;
132     jjnr             = nlist->jjnr;
133     shiftidx         = nlist->shift;
134     gid              = nlist->gid;
135     shiftvec         = fr->shift_vec[0];
136     fshift           = fr->fshift[0];
137     facel            = _mm256_set1_pd(fr->ic->epsfac);
138     charge           = mdatoms->chargeA;
139     nvdwtype         = fr->ntype;
140     vdwparam         = fr->nbfp;
141     vdwtype          = mdatoms->typeA;
142
143     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
144     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
145     beta2            = _mm256_mul_pd(beta,beta);
146     beta3            = _mm256_mul_pd(beta,beta2);
147
148     ewtab            = fr->ic->tabq_coul_FDV0;
149     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
150     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
151
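    /* The Ewald blocks in the loops below interpolate the tabulated force and potential
     * from the packed tabq_coul_FDV0 table set up above. A scalar sketch of what each
     * SIMD block computes per interaction (same quantities, one lane at a time):
     *
     *     rt    = r*tabq_scale;
     *     n     = (int)rt;                 // table cell index
     *     eps   = rt - floor(rt);          // fractional position within the cell
     *     F     = ewtab[4*n]   + eps*ewtab[4*n+1];                       // interpolated F
     *     V     = ewtab[4*n+2] - 0.5/tabq_scale*eps*(ewtab[4*n] + F);    // interpolated V
     *     velec = qq*((rinv - sh_ewald) - V);   // potential, shifted to zero at the cutoff
     *     felec = qq*rinv*(rinvsq - F);         // scalar force divided by r
     *
     * The SIMD code evaluates this for four j atoms at once and uses
     * GMX_MM256_FULLTRANSPOSE4_PD to turn the four loaded table quadruplets into the
     * F, D and V vectors.
     */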
152     /* Setup water-specific parameters */
153     inr              = nlist->iinr[0];
154     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
155     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
156     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
157     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
158
159     jq1              = _mm256_set1_pd(charge[inr+1]);
160     jq2              = _mm256_set1_pd(charge[inr+2]);
161     jq3              = _mm256_set1_pd(charge[inr+3]);
162     vdwjidx0A        = 2*vdwtype[inr+0];
163     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
164     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
165     qq11             = _mm256_mul_pd(iq1,jq1);
166     qq12             = _mm256_mul_pd(iq1,jq2);
167     qq13             = _mm256_mul_pd(iq1,jq3);
168     qq21             = _mm256_mul_pd(iq2,jq1);
169     qq22             = _mm256_mul_pd(iq2,jq2);
170     qq23             = _mm256_mul_pd(iq2,jq3);
171     qq31             = _mm256_mul_pd(iq3,jq1);
172     qq32             = _mm256_mul_pd(iq3,jq2);
173     qq33             = _mm256_mul_pd(iq3,jq3);
174
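    /* In this Water4-Water4 kernel the i and j molecules share the same four-site
     * topology: site 0 is treated as Lennard-Jones-only (no charge products are formed
     * for it, hence the single c6_00/c12_00 pair), while sites 1-3 carry the charges,
     * so only the nine products qq11..qq33 are needed. The charges and LJ type are read
     * once from the first i entry (iinr[0]) and reused for every water pair, which is
     * what this water-water specialization assumes.
     */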
175     /* When we use explicit cutoffs, the value must be identical for elec and VdW, so we use elec as an arbitrary choice */
176     rcutoff_scalar   = fr->ic->rcoulomb;
177     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
178     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
179
180     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
181     rvdw             = _mm256_set1_pd(fr->ic->rvdw);
182
183     /* Avoid stupid compiler warnings */
184     jnrA = jnrB = jnrC = jnrD = 0;
185     j_coord_offsetA = 0;
186     j_coord_offsetB = 0;
187     j_coord_offsetC = 0;
188     j_coord_offsetD = 0;
189
190     outeriter        = 0;
191     inneriter        = 0;
192
193     for(iidx=0;iidx<4*DIM;iidx++)
194     {
195         scratch[iidx] = 0.0;
196     }
197
198     /* Start outer loop over neighborlists */
199     for(iidx=0; iidx<nri; iidx++)
200     {
201         /* Load shift vector for this list */
202         i_shift_offset   = DIM*shiftidx[iidx];
203
204         /* Load limits for loop over neighbors */
205         j_index_start    = jindex[iidx];
206         j_index_end      = jindex[iidx+1];
207
208         /* Get outer coordinate index */
209         inr              = iinr[iidx];
210         i_coord_offset   = DIM*inr;
211
212         /* Load i particle coords and add shift vector */
213         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
214                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
215
216         fix0             = _mm256_setzero_pd();
217         fiy0             = _mm256_setzero_pd();
218         fiz0             = _mm256_setzero_pd();
219         fix1             = _mm256_setzero_pd();
220         fiy1             = _mm256_setzero_pd();
221         fiz1             = _mm256_setzero_pd();
222         fix2             = _mm256_setzero_pd();
223         fiy2             = _mm256_setzero_pd();
224         fiz2             = _mm256_setzero_pd();
225         fix3             = _mm256_setzero_pd();
226         fiy3             = _mm256_setzero_pd();
227         fiz3             = _mm256_setzero_pd();
228
229         /* Reset potential sums */
230         velecsum         = _mm256_setzero_pd();
231         vvdwsum          = _mm256_setzero_pd();
232
233         /* Start inner kernel loop */
234         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
235         {
236
237             /* Get j neighbor index, and coordinate index */
238             jnrA             = jjnr[jidx];
239             jnrB             = jjnr[jidx+1];
240             jnrC             = jjnr[jidx+2];
241             jnrD             = jjnr[jidx+3];
242             j_coord_offsetA  = DIM*jnrA;
243             j_coord_offsetB  = DIM*jnrB;
244             j_coord_offsetC  = DIM*jnrC;
245             j_coord_offsetD  = DIM*jnrD;
246
247             /* load j atom coordinates */
248             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
249                                                  x+j_coord_offsetC,x+j_coord_offsetD,
250                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
251                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
252
253             /* Calculate displacement vector */
254             dx00             = _mm256_sub_pd(ix0,jx0);
255             dy00             = _mm256_sub_pd(iy0,jy0);
256             dz00             = _mm256_sub_pd(iz0,jz0);
257             dx11             = _mm256_sub_pd(ix1,jx1);
258             dy11             = _mm256_sub_pd(iy1,jy1);
259             dz11             = _mm256_sub_pd(iz1,jz1);
260             dx12             = _mm256_sub_pd(ix1,jx2);
261             dy12             = _mm256_sub_pd(iy1,jy2);
262             dz12             = _mm256_sub_pd(iz1,jz2);
263             dx13             = _mm256_sub_pd(ix1,jx3);
264             dy13             = _mm256_sub_pd(iy1,jy3);
265             dz13             = _mm256_sub_pd(iz1,jz3);
266             dx21             = _mm256_sub_pd(ix2,jx1);
267             dy21             = _mm256_sub_pd(iy2,jy1);
268             dz21             = _mm256_sub_pd(iz2,jz1);
269             dx22             = _mm256_sub_pd(ix2,jx2);
270             dy22             = _mm256_sub_pd(iy2,jy2);
271             dz22             = _mm256_sub_pd(iz2,jz2);
272             dx23             = _mm256_sub_pd(ix2,jx3);
273             dy23             = _mm256_sub_pd(iy2,jy3);
274             dz23             = _mm256_sub_pd(iz2,jz3);
275             dx31             = _mm256_sub_pd(ix3,jx1);
276             dy31             = _mm256_sub_pd(iy3,jy1);
277             dz31             = _mm256_sub_pd(iz3,jz1);
278             dx32             = _mm256_sub_pd(ix3,jx2);
279             dy32             = _mm256_sub_pd(iy3,jy2);
280             dz32             = _mm256_sub_pd(iz3,jz2);
281             dx33             = _mm256_sub_pd(ix3,jx3);
282             dy33             = _mm256_sub_pd(iy3,jy3);
283             dz33             = _mm256_sub_pd(iz3,jz3);
284
285             /* Calculate squared distance and things based on it */
286             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
287             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
288             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
289             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
290             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
291             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
292             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
293             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
294             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
295             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
296
297             rinv11           = avx256_invsqrt_d(rsq11);
298             rinv12           = avx256_invsqrt_d(rsq12);
299             rinv13           = avx256_invsqrt_d(rsq13);
300             rinv21           = avx256_invsqrt_d(rsq21);
301             rinv22           = avx256_invsqrt_d(rsq22);
302             rinv23           = avx256_invsqrt_d(rsq23);
303             rinv31           = avx256_invsqrt_d(rsq31);
304             rinv32           = avx256_invsqrt_d(rsq32);
305             rinv33           = avx256_invsqrt_d(rsq33);
306
307             rinvsq00         = avx256_inv_d(rsq00);
308             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
309             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
310             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
311             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
312             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
313             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
314             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
315             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
316             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
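            /* Note: the 0-0 pair is Lennard-Jones-only, so only 1/r^2 is needed for it
             * and it is computed directly with avx256_inv_d; the charged site pairs also
             * need 1/r for the Ewald terms, hence the invsqrt calls above.
             */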
317
318             fjx0             = _mm256_setzero_pd();
319             fjy0             = _mm256_setzero_pd();
320             fjz0             = _mm256_setzero_pd();
321             fjx1             = _mm256_setzero_pd();
322             fjy1             = _mm256_setzero_pd();
323             fjz1             = _mm256_setzero_pd();
324             fjx2             = _mm256_setzero_pd();
325             fjy2             = _mm256_setzero_pd();
326             fjz2             = _mm256_setzero_pd();
327             fjx3             = _mm256_setzero_pd();
328             fjy3             = _mm256_setzero_pd();
329             fjz3             = _mm256_setzero_pd();
330
331             /**************************
332              * CALCULATE INTERACTIONS *
333              **************************/
334
335             if (gmx_mm256_any_lt(rsq00,rcutoff2))
336             {
337
338             /* LENNARD-JONES DISPERSION/REPULSION */
339
340             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
341             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
342             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
343             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
344                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
345             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
346
347             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
348
349             /* Update potential sum for this i atom from the interaction with this j atom. */
350             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
351             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
352
353             fscal            = fvdw;
354
355             fscal            = _mm256_and_pd(fscal,cutoff_mask);
356
357             /* Calculate temporary vectorial force */
358             tx               = _mm256_mul_pd(fscal,dx00);
359             ty               = _mm256_mul_pd(fscal,dy00);
360             tz               = _mm256_mul_pd(fscal,dz00);
361
362             /* Update vectorial force */
363             fix0             = _mm256_add_pd(fix0,tx);
364             fiy0             = _mm256_add_pd(fiy0,ty);
365             fiz0             = _mm256_add_pd(fiz0,tz);
366
367             fjx0             = _mm256_add_pd(fjx0,tx);
368             fjy0             = _mm256_add_pd(fjy0,ty);
369             fjz0             = _mm256_add_pd(fjz0,tz);
370
371             }
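            /* Scalar sketch of the shifted Lennard-Jones block above (the c6/c12 values
             * taken from fr->nbfp are assumed to carry the usual factors 6 and 12, which
             * is why one_sixth/one_twelfth appear in the potential):
             *
             *     rinv6 = rinv2*rinv2*rinv2;
             *     vvdw  = (c12*rinv6*rinv6 - c12*sh6*sh6)/12.0
             *             - (c6*rinv6 - c6*sh6)/6.0;            // potential shifted to 0 at rvdw
             *     fvdw  = (c12*rinv6*rinv6 - c6*rinv6)*rinv2;   // scalar force divided by r
             *
             * with sh6 = sh_vdw_invrcut6 = 1/rvdw^6, and the result applied only where
             * rsq < rcutoff2 via cutoff_mask.
             */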
372
373             /**************************
374              * CALCULATE INTERACTIONS *
375              **************************/
376
377             if (gmx_mm256_any_lt(rsq11,rcutoff2))
378             {
379
380             r11              = _mm256_mul_pd(rsq11,rinv11);
381
382             /* EWALD ELECTROSTATICS */
383
384             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
385             ewrt             = _mm256_mul_pd(r11,ewtabscale);
386             ewitab           = _mm256_cvttpd_epi32(ewrt);
387             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
388             ewitab           = _mm_slli_epi32(ewitab,2);
389             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
390             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
391             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
392             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
393             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
394             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
395             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
396             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
397             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
398
399             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
400
401             /* Update potential sum for this i atom from the interaction with this j atom. */
402             velec            = _mm256_and_pd(velec,cutoff_mask);
403             velecsum         = _mm256_add_pd(velecsum,velec);
404
405             fscal            = felec;
406
407             fscal            = _mm256_and_pd(fscal,cutoff_mask);
408
409             /* Calculate temporary vectorial force */
410             tx               = _mm256_mul_pd(fscal,dx11);
411             ty               = _mm256_mul_pd(fscal,dy11);
412             tz               = _mm256_mul_pd(fscal,dz11);
413
414             /* Update vectorial force */
415             fix1             = _mm256_add_pd(fix1,tx);
416             fiy1             = _mm256_add_pd(fiy1,ty);
417             fiz1             = _mm256_add_pd(fiz1,tz);
418
419             fjx1             = _mm256_add_pd(fjx1,tx);
420             fjy1             = _mm256_add_pd(fjy1,ty);
421             fjz1             = _mm256_add_pd(fjz1,tz);
422
423             }
424
425             /**************************
426              * CALCULATE INTERACTIONS *
427              **************************/
428
429             if (gmx_mm256_any_lt(rsq12,rcutoff2))
430             {
431
432             r12              = _mm256_mul_pd(rsq12,rinv12);
433
434             /* EWALD ELECTROSTATICS */
435
436             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
437             ewrt             = _mm256_mul_pd(r12,ewtabscale);
438             ewitab           = _mm256_cvttpd_epi32(ewrt);
439             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
440             ewitab           = _mm_slli_epi32(ewitab,2);
441             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
442             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
443             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
444             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
445             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
446             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
447             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
448             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
449             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
450
451             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
452
453             /* Update potential sum for this i atom from the interaction with this j atom. */
454             velec            = _mm256_and_pd(velec,cutoff_mask);
455             velecsum         = _mm256_add_pd(velecsum,velec);
456
457             fscal            = felec;
458
459             fscal            = _mm256_and_pd(fscal,cutoff_mask);
460
461             /* Calculate temporary vectorial force */
462             tx               = _mm256_mul_pd(fscal,dx12);
463             ty               = _mm256_mul_pd(fscal,dy12);
464             tz               = _mm256_mul_pd(fscal,dz12);
465
466             /* Update vectorial force */
467             fix1             = _mm256_add_pd(fix1,tx);
468             fiy1             = _mm256_add_pd(fiy1,ty);
469             fiz1             = _mm256_add_pd(fiz1,tz);
470
471             fjx2             = _mm256_add_pd(fjx2,tx);
472             fjy2             = _mm256_add_pd(fjy2,ty);
473             fjz2             = _mm256_add_pd(fjz2,tz);
474
475             }
476
477             /**************************
478              * CALCULATE INTERACTIONS *
479              **************************/
480
481             if (gmx_mm256_any_lt(rsq13,rcutoff2))
482             {
483
484             r13              = _mm256_mul_pd(rsq13,rinv13);
485
486             /* EWALD ELECTROSTATICS */
487
488             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
489             ewrt             = _mm256_mul_pd(r13,ewtabscale);
490             ewitab           = _mm256_cvttpd_epi32(ewrt);
491             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
492             ewitab           = _mm_slli_epi32(ewitab,2);
493             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
494             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
495             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
496             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
497             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
498             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
499             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
500             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_sub_pd(rinv13,sh_ewald),velec));
501             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
502
503             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
504
505             /* Update potential sum for this i atom from the interaction with this j atom. */
506             velec            = _mm256_and_pd(velec,cutoff_mask);
507             velecsum         = _mm256_add_pd(velecsum,velec);
508
509             fscal            = felec;
510
511             fscal            = _mm256_and_pd(fscal,cutoff_mask);
512
513             /* Calculate temporary vectorial force */
514             tx               = _mm256_mul_pd(fscal,dx13);
515             ty               = _mm256_mul_pd(fscal,dy13);
516             tz               = _mm256_mul_pd(fscal,dz13);
517
518             /* Update vectorial force */
519             fix1             = _mm256_add_pd(fix1,tx);
520             fiy1             = _mm256_add_pd(fiy1,ty);
521             fiz1             = _mm256_add_pd(fiz1,tz);
522
523             fjx3             = _mm256_add_pd(fjx3,tx);
524             fjy3             = _mm256_add_pd(fjy3,ty);
525             fjz3             = _mm256_add_pd(fjz3,tz);
526
527             }
528
529             /**************************
530              * CALCULATE INTERACTIONS *
531              **************************/
532
533             if (gmx_mm256_any_lt(rsq21,rcutoff2))
534             {
535
536             r21              = _mm256_mul_pd(rsq21,rinv21);
537
538             /* EWALD ELECTROSTATICS */
539
540             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
541             ewrt             = _mm256_mul_pd(r21,ewtabscale);
542             ewitab           = _mm256_cvttpd_epi32(ewrt);
543             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
544             ewitab           = _mm_slli_epi32(ewitab,2);
545             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
546             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
547             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
548             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
549             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
550             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
551             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
552             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
553             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
554
555             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
556
557             /* Update potential sum for this i atom from the interaction with this j atom. */
558             velec            = _mm256_and_pd(velec,cutoff_mask);
559             velecsum         = _mm256_add_pd(velecsum,velec);
560
561             fscal            = felec;
562
563             fscal            = _mm256_and_pd(fscal,cutoff_mask);
564
565             /* Calculate temporary vectorial force */
566             tx               = _mm256_mul_pd(fscal,dx21);
567             ty               = _mm256_mul_pd(fscal,dy21);
568             tz               = _mm256_mul_pd(fscal,dz21);
569
570             /* Update vectorial force */
571             fix2             = _mm256_add_pd(fix2,tx);
572             fiy2             = _mm256_add_pd(fiy2,ty);
573             fiz2             = _mm256_add_pd(fiz2,tz);
574
575             fjx1             = _mm256_add_pd(fjx1,tx);
576             fjy1             = _mm256_add_pd(fjy1,ty);
577             fjz1             = _mm256_add_pd(fjz1,tz);
578
579             }
580
581             /**************************
582              * CALCULATE INTERACTIONS *
583              **************************/
584
585             if (gmx_mm256_any_lt(rsq22,rcutoff2))
586             {
587
588             r22              = _mm256_mul_pd(rsq22,rinv22);
589
590             /* EWALD ELECTROSTATICS */
591
592             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
593             ewrt             = _mm256_mul_pd(r22,ewtabscale);
594             ewitab           = _mm256_cvttpd_epi32(ewrt);
595             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
596             ewitab           = _mm_slli_epi32(ewitab,2);
597             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
598             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
599             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
600             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
601             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
602             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
603             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
604             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
605             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
606
607             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
608
609             /* Update potential sum for this i atom from the interaction with this j atom. */
610             velec            = _mm256_and_pd(velec,cutoff_mask);
611             velecsum         = _mm256_add_pd(velecsum,velec);
612
613             fscal            = felec;
614
615             fscal            = _mm256_and_pd(fscal,cutoff_mask);
616
617             /* Calculate temporary vectorial force */
618             tx               = _mm256_mul_pd(fscal,dx22);
619             ty               = _mm256_mul_pd(fscal,dy22);
620             tz               = _mm256_mul_pd(fscal,dz22);
621
622             /* Update vectorial force */
623             fix2             = _mm256_add_pd(fix2,tx);
624             fiy2             = _mm256_add_pd(fiy2,ty);
625             fiz2             = _mm256_add_pd(fiz2,tz);
626
627             fjx2             = _mm256_add_pd(fjx2,tx);
628             fjy2             = _mm256_add_pd(fjy2,ty);
629             fjz2             = _mm256_add_pd(fjz2,tz);
630
631             }
632
633             /**************************
634              * CALCULATE INTERACTIONS *
635              **************************/
636
637             if (gmx_mm256_any_lt(rsq23,rcutoff2))
638             {
639
640             r23              = _mm256_mul_pd(rsq23,rinv23);
641
642             /* EWALD ELECTROSTATICS */
643
644             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
645             ewrt             = _mm256_mul_pd(r23,ewtabscale);
646             ewitab           = _mm256_cvttpd_epi32(ewrt);
647             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
648             ewitab           = _mm_slli_epi32(ewitab,2);
649             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
650             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
651             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
652             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
653             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
654             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
655             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
656             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_sub_pd(rinv23,sh_ewald),velec));
657             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
658
659             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
660
661             /* Update potential sum for this i atom from the interaction with this j atom. */
662             velec            = _mm256_and_pd(velec,cutoff_mask);
663             velecsum         = _mm256_add_pd(velecsum,velec);
664
665             fscal            = felec;
666
667             fscal            = _mm256_and_pd(fscal,cutoff_mask);
668
669             /* Calculate temporary vectorial force */
670             tx               = _mm256_mul_pd(fscal,dx23);
671             ty               = _mm256_mul_pd(fscal,dy23);
672             tz               = _mm256_mul_pd(fscal,dz23);
673
674             /* Update vectorial force */
675             fix2             = _mm256_add_pd(fix2,tx);
676             fiy2             = _mm256_add_pd(fiy2,ty);
677             fiz2             = _mm256_add_pd(fiz2,tz);
678
679             fjx3             = _mm256_add_pd(fjx3,tx);
680             fjy3             = _mm256_add_pd(fjy3,ty);
681             fjz3             = _mm256_add_pd(fjz3,tz);
682
683             }
684
685             /**************************
686              * CALCULATE INTERACTIONS *
687              **************************/
688
689             if (gmx_mm256_any_lt(rsq31,rcutoff2))
690             {
691
692             r31              = _mm256_mul_pd(rsq31,rinv31);
693
694             /* EWALD ELECTROSTATICS */
695
696             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
697             ewrt             = _mm256_mul_pd(r31,ewtabscale);
698             ewitab           = _mm256_cvttpd_epi32(ewrt);
699             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
700             ewitab           = _mm_slli_epi32(ewitab,2);
701             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
702             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
703             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
704             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
705             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
706             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
707             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
708             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_sub_pd(rinv31,sh_ewald),velec));
709             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
710
711             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
712
713             /* Update potential sum for this i atom from the interaction with this j atom. */
714             velec            = _mm256_and_pd(velec,cutoff_mask);
715             velecsum         = _mm256_add_pd(velecsum,velec);
716
717             fscal            = felec;
718
719             fscal            = _mm256_and_pd(fscal,cutoff_mask);
720
721             /* Calculate temporary vectorial force */
722             tx               = _mm256_mul_pd(fscal,dx31);
723             ty               = _mm256_mul_pd(fscal,dy31);
724             tz               = _mm256_mul_pd(fscal,dz31);
725
726             /* Update vectorial force */
727             fix3             = _mm256_add_pd(fix3,tx);
728             fiy3             = _mm256_add_pd(fiy3,ty);
729             fiz3             = _mm256_add_pd(fiz3,tz);
730
731             fjx1             = _mm256_add_pd(fjx1,tx);
732             fjy1             = _mm256_add_pd(fjy1,ty);
733             fjz1             = _mm256_add_pd(fjz1,tz);
734
735             }
736
737             /**************************
738              * CALCULATE INTERACTIONS *
739              **************************/
740
741             if (gmx_mm256_any_lt(rsq32,rcutoff2))
742             {
743
744             r32              = _mm256_mul_pd(rsq32,rinv32);
745
746             /* EWALD ELECTROSTATICS */
747
748             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
749             ewrt             = _mm256_mul_pd(r32,ewtabscale);
750             ewitab           = _mm256_cvttpd_epi32(ewrt);
751             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
752             ewitab           = _mm_slli_epi32(ewitab,2);
753             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
754             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
755             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
756             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
757             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
758             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
759             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
760             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_sub_pd(rinv32,sh_ewald),velec));
761             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
762
763             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
764
765             /* Update potential sum for this i atom from the interaction with this j atom. */
766             velec            = _mm256_and_pd(velec,cutoff_mask);
767             velecsum         = _mm256_add_pd(velecsum,velec);
768
769             fscal            = felec;
770
771             fscal            = _mm256_and_pd(fscal,cutoff_mask);
772
773             /* Calculate temporary vectorial force */
774             tx               = _mm256_mul_pd(fscal,dx32);
775             ty               = _mm256_mul_pd(fscal,dy32);
776             tz               = _mm256_mul_pd(fscal,dz32);
777
778             /* Update vectorial force */
779             fix3             = _mm256_add_pd(fix3,tx);
780             fiy3             = _mm256_add_pd(fiy3,ty);
781             fiz3             = _mm256_add_pd(fiz3,tz);
782
783             fjx2             = _mm256_add_pd(fjx2,tx);
784             fjy2             = _mm256_add_pd(fjy2,ty);
785             fjz2             = _mm256_add_pd(fjz2,tz);
786
787             }
788
789             /**************************
790              * CALCULATE INTERACTIONS *
791              **************************/
792
793             if (gmx_mm256_any_lt(rsq33,rcutoff2))
794             {
795
796             r33              = _mm256_mul_pd(rsq33,rinv33);
797
798             /* EWALD ELECTROSTATICS */
799
800             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
801             ewrt             = _mm256_mul_pd(r33,ewtabscale);
802             ewitab           = _mm256_cvttpd_epi32(ewrt);
803             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
804             ewitab           = _mm_slli_epi32(ewitab,2);
805             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
806             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
807             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
808             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
809             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
810             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
811             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
812             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_sub_pd(rinv33,sh_ewald),velec));
813             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
814
815             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
816
817             /* Update potential sum for this i atom from the interaction with this j atom. */
818             velec            = _mm256_and_pd(velec,cutoff_mask);
819             velecsum         = _mm256_add_pd(velecsum,velec);
820
821             fscal            = felec;
822
823             fscal            = _mm256_and_pd(fscal,cutoff_mask);
824
825             /* Calculate temporary vectorial force */
826             tx               = _mm256_mul_pd(fscal,dx33);
827             ty               = _mm256_mul_pd(fscal,dy33);
828             tz               = _mm256_mul_pd(fscal,dz33);
829
830             /* Update vectorial force */
831             fix3             = _mm256_add_pd(fix3,tx);
832             fiy3             = _mm256_add_pd(fiy3,ty);
833             fiz3             = _mm256_add_pd(fiz3,tz);
834
835             fjx3             = _mm256_add_pd(fjx3,tx);
836             fjy3             = _mm256_add_pd(fjy3,ty);
837             fjz3             = _mm256_add_pd(fjz3,tz);
838
839             }
840
841             fjptrA             = f+j_coord_offsetA;
842             fjptrB             = f+j_coord_offsetB;
843             fjptrC             = f+j_coord_offsetC;
844             fjptrD             = f+j_coord_offsetD;
845
846             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
847                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
848                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
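            /* Newton's third law: the j-site forces accumulated in fjx0..fjz3 are
             * subtracted (decremented) from the force-array entries of the four j
             * waters in one transposed scatter.
             */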
849
850             /* Inner loop uses 458 flops */
851         }
852
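        /* The vectorized loop above handled j atoms in complete groups of four
         * (it stops as soon as jjnr[jidx+3] is negative). The block below processes
         * the final partial group of 1-3 real j atoms, with padding entries screened
         * out by dummy_mask.
         */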
853         if(jidx<j_index_end)
854         {
855
856             /* Get j neighbor index, and coordinate index */
857             jnrlistA         = jjnr[jidx];
858             jnrlistB         = jjnr[jidx+1];
859             jnrlistC         = jjnr[jidx+2];
860             jnrlistD         = jjnr[jidx+3];
861             /* Sign of each element will be negative for non-real atoms.
862              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
863              * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
864              */
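            /* For example, if the last chunk of the list is jjnr = {12, 57, -1, -1}
             * (indices chosen purely for illustration), the two negative entries give
             * all-ones mask lanes; the andnot operations below zero their contributions,
             * while the coordinate loads still read safely from atom 0.
             */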
865             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
866
867             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
868             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
869             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
870
871             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
872             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
873             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
874             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
875             j_coord_offsetA  = DIM*jnrA;
876             j_coord_offsetB  = DIM*jnrB;
877             j_coord_offsetC  = DIM*jnrC;
878             j_coord_offsetD  = DIM*jnrD;
879
880             /* load j atom coordinates */
881             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
882                                                  x+j_coord_offsetC,x+j_coord_offsetD,
883                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
884                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
885
886             /* Calculate displacement vector */
887             dx00             = _mm256_sub_pd(ix0,jx0);
888             dy00             = _mm256_sub_pd(iy0,jy0);
889             dz00             = _mm256_sub_pd(iz0,jz0);
890             dx11             = _mm256_sub_pd(ix1,jx1);
891             dy11             = _mm256_sub_pd(iy1,jy1);
892             dz11             = _mm256_sub_pd(iz1,jz1);
893             dx12             = _mm256_sub_pd(ix1,jx2);
894             dy12             = _mm256_sub_pd(iy1,jy2);
895             dz12             = _mm256_sub_pd(iz1,jz2);
896             dx13             = _mm256_sub_pd(ix1,jx3);
897             dy13             = _mm256_sub_pd(iy1,jy3);
898             dz13             = _mm256_sub_pd(iz1,jz3);
899             dx21             = _mm256_sub_pd(ix2,jx1);
900             dy21             = _mm256_sub_pd(iy2,jy1);
901             dz21             = _mm256_sub_pd(iz2,jz1);
902             dx22             = _mm256_sub_pd(ix2,jx2);
903             dy22             = _mm256_sub_pd(iy2,jy2);
904             dz22             = _mm256_sub_pd(iz2,jz2);
905             dx23             = _mm256_sub_pd(ix2,jx3);
906             dy23             = _mm256_sub_pd(iy2,jy3);
907             dz23             = _mm256_sub_pd(iz2,jz3);
908             dx31             = _mm256_sub_pd(ix3,jx1);
909             dy31             = _mm256_sub_pd(iy3,jy1);
910             dz31             = _mm256_sub_pd(iz3,jz1);
911             dx32             = _mm256_sub_pd(ix3,jx2);
912             dy32             = _mm256_sub_pd(iy3,jy2);
913             dz32             = _mm256_sub_pd(iz3,jz2);
914             dx33             = _mm256_sub_pd(ix3,jx3);
915             dy33             = _mm256_sub_pd(iy3,jy3);
916             dz33             = _mm256_sub_pd(iz3,jz3);
917
918             /* Calculate squared distance and things based on it */
919             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
920             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
921             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
922             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
923             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
924             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
925             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
926             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
927             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
928             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
929
930             rinv11           = avx256_invsqrt_d(rsq11);
931             rinv12           = avx256_invsqrt_d(rsq12);
932             rinv13           = avx256_invsqrt_d(rsq13);
933             rinv21           = avx256_invsqrt_d(rsq21);
934             rinv22           = avx256_invsqrt_d(rsq22);
935             rinv23           = avx256_invsqrt_d(rsq23);
936             rinv31           = avx256_invsqrt_d(rsq31);
937             rinv32           = avx256_invsqrt_d(rsq32);
938             rinv33           = avx256_invsqrt_d(rsq33);
939
940             rinvsq00         = avx256_inv_d(rsq00);
941             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
942             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
943             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
944             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
945             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
946             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
947             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
948             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
949             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
950
951             fjx0             = _mm256_setzero_pd();
952             fjy0             = _mm256_setzero_pd();
953             fjz0             = _mm256_setzero_pd();
954             fjx1             = _mm256_setzero_pd();
955             fjy1             = _mm256_setzero_pd();
956             fjz1             = _mm256_setzero_pd();
957             fjx2             = _mm256_setzero_pd();
958             fjy2             = _mm256_setzero_pd();
959             fjz2             = _mm256_setzero_pd();
960             fjx3             = _mm256_setzero_pd();
961             fjy3             = _mm256_setzero_pd();
962             fjz3             = _mm256_setzero_pd();
963
964             /**************************
965              * CALCULATE INTERACTIONS *
966              **************************/
967
968             if (gmx_mm256_any_lt(rsq00,rcutoff2))
969             {
970
971             /* LENNARD-JONES DISPERSION/REPULSION */
972
973             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
974             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
975             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
976             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
977                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
978             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
979
980             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
981
982             /* Update potential sum for this i atom from the interaction with this j atom. */
983             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
984             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
985             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
986
987             fscal            = fvdw;
988
989             fscal            = _mm256_and_pd(fscal,cutoff_mask);
990
991             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
992
993             /* Calculate temporary vectorial force */
994             tx               = _mm256_mul_pd(fscal,dx00);
995             ty               = _mm256_mul_pd(fscal,dy00);
996             tz               = _mm256_mul_pd(fscal,dz00);
997
998             /* Update vectorial force */
999             fix0             = _mm256_add_pd(fix0,tx);
1000             fiy0             = _mm256_add_pd(fiy0,ty);
1001             fiz0             = _mm256_add_pd(fiz0,tz);
1002
1003             fjx0             = _mm256_add_pd(fjx0,tx);
1004             fjy0             = _mm256_add_pd(fjy0,ty);
1005             fjz0             = _mm256_add_pd(fjz0,tz);
1006
1007             }
1008
1009             /**************************
1010              * CALCULATE INTERACTIONS *
1011              **************************/
1012
1013             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1014             {
1015
1016             r11              = _mm256_mul_pd(rsq11,rinv11);
1017             r11              = _mm256_andnot_pd(dummy_mask,r11);
1018
1019             /* EWALD ELECTROSTATICS */
1020
1021             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
1022             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1023             ewitab           = _mm256_cvttpd_epi32(ewrt);
1024             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1025             ewitab           = _mm_slli_epi32(ewitab,2);
1026             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1027             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1028             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1029             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1030             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1031             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1032             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1033             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
1034             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1035
1036             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1037
1038             /* Update potential sum for this i atom from the interaction with this j atom. */
1039             velec            = _mm256_and_pd(velec,cutoff_mask);
1040             velec            = _mm256_andnot_pd(dummy_mask,velec);
1041             velecsum         = _mm256_add_pd(velecsum,velec);
1042
1043             fscal            = felec;
1044
1045             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1046
1047             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1048
1049             /* Calculate temporary vectorial force */
1050             tx               = _mm256_mul_pd(fscal,dx11);
1051             ty               = _mm256_mul_pd(fscal,dy11);
1052             tz               = _mm256_mul_pd(fscal,dz11);
1053
1054             /* Update vectorial force */
1055             fix1             = _mm256_add_pd(fix1,tx);
1056             fiy1             = _mm256_add_pd(fiy1,ty);
1057             fiz1             = _mm256_add_pd(fiz1,tz);
1058
1059             fjx1             = _mm256_add_pd(fjx1,tx);
1060             fjy1             = _mm256_add_pd(fjy1,ty);
1061             fjz1             = _mm256_add_pd(fjz1,tz);
1062
1063             }
1064
1065             /**************************
1066              * CALCULATE INTERACTIONS *
1067              **************************/
1068
1069             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1070             {
1071
1072             r12              = _mm256_mul_pd(rsq12,rinv12);
1073             r12              = _mm256_andnot_pd(dummy_mask,r12);
1074
1075             /* EWALD ELECTROSTATICS */
1076
1077             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
1078             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1079             ewitab           = _mm256_cvttpd_epi32(ewrt);
1080             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1081             ewitab           = _mm_slli_epi32(ewitab,2);
1082             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1083             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1084             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1085             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1086             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1087             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1088             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1089             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
1090             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1091
1092             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1093
1094             /* Update potential sum for this i atom from the interaction with this j atom. */
1095             velec            = _mm256_and_pd(velec,cutoff_mask);
1096             velec            = _mm256_andnot_pd(dummy_mask,velec);
1097             velecsum         = _mm256_add_pd(velecsum,velec);
1098
1099             fscal            = felec;
1100
1101             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1102
1103             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1104
1105             /* Calculate temporary vectorial force */
1106             tx               = _mm256_mul_pd(fscal,dx12);
1107             ty               = _mm256_mul_pd(fscal,dy12);
1108             tz               = _mm256_mul_pd(fscal,dz12);
1109
1110             /* Update vectorial force */
1111             fix1             = _mm256_add_pd(fix1,tx);
1112             fiy1             = _mm256_add_pd(fiy1,ty);
1113             fiz1             = _mm256_add_pd(fiz1,tz);
1114
1115             fjx2             = _mm256_add_pd(fjx2,tx);
1116             fjy2             = _mm256_add_pd(fjy2,ty);
1117             fjz2             = _mm256_add_pd(fjz2,tz);
1118
1119             }
1120
1121             /**************************
1122              * CALCULATE INTERACTIONS *
1123              **************************/
1124
1125             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1126             {
1127
1128             r13              = _mm256_mul_pd(rsq13,rinv13);
1129             r13              = _mm256_andnot_pd(dummy_mask,r13);
1130
1131             /* EWALD ELECTROSTATICS */
1132
1133             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1134             ewrt             = _mm256_mul_pd(r13,ewtabscale);
1135             ewitab           = _mm256_cvttpd_epi32(ewrt);
1136             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1137             ewitab           = _mm_slli_epi32(ewitab,2);
1138             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1139             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1140             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1141             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1142             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1143             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1144             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1145             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_sub_pd(rinv13,sh_ewald),velec));
1146             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1147
1148             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1149
1150             /* Update potential sum for this i atom from the interaction with this j atom. */
1151             velec            = _mm256_and_pd(velec,cutoff_mask);
1152             velec            = _mm256_andnot_pd(dummy_mask,velec);
1153             velecsum         = _mm256_add_pd(velecsum,velec);
1154
1155             fscal            = felec;
1156
1157             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1158
1159             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1160
1161             /* Calculate temporary vectorial force */
1162             tx               = _mm256_mul_pd(fscal,dx13);
1163             ty               = _mm256_mul_pd(fscal,dy13);
1164             tz               = _mm256_mul_pd(fscal,dz13);
1165
1166             /* Update vectorial force */
1167             fix1             = _mm256_add_pd(fix1,tx);
1168             fiy1             = _mm256_add_pd(fiy1,ty);
1169             fiz1             = _mm256_add_pd(fiz1,tz);
1170
1171             fjx3             = _mm256_add_pd(fjx3,tx);
1172             fjy3             = _mm256_add_pd(fjy3,ty);
1173             fjz3             = _mm256_add_pd(fjz3,tz);
1174
1175             }
1176
1177             /**************************
1178              * CALCULATE INTERACTIONS *
1179              **************************/
1180
1181             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1182             {
1183
1184             r21              = _mm256_mul_pd(rsq21,rinv21);
1185             r21              = _mm256_andnot_pd(dummy_mask,r21);
1186
1187             /* EWALD ELECTROSTATICS */
1188
1189             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1190             ewrt             = _mm256_mul_pd(r21,ewtabscale);
1191             ewitab           = _mm256_cvttpd_epi32(ewrt);
1192             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1193             ewitab           = _mm_slli_epi32(ewitab,2);
1194             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1195             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1196             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1197             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1198             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1199             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1200             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1201             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
1202             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1203
1204             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1205
1206             /* Update potential sum for this i atom from the interaction with this j atom. */
1207             velec            = _mm256_and_pd(velec,cutoff_mask);
1208             velec            = _mm256_andnot_pd(dummy_mask,velec);
1209             velecsum         = _mm256_add_pd(velecsum,velec);
1210
1211             fscal            = felec;
1212
1213             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1214
1215             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1216
1217             /* Calculate temporary vectorial force */
1218             tx               = _mm256_mul_pd(fscal,dx21);
1219             ty               = _mm256_mul_pd(fscal,dy21);
1220             tz               = _mm256_mul_pd(fscal,dz21);
1221
1222             /* Update vectorial force */
1223             fix2             = _mm256_add_pd(fix2,tx);
1224             fiy2             = _mm256_add_pd(fiy2,ty);
1225             fiz2             = _mm256_add_pd(fiz2,tz);
1226
1227             fjx1             = _mm256_add_pd(fjx1,tx);
1228             fjy1             = _mm256_add_pd(fjy1,ty);
1229             fjz1             = _mm256_add_pd(fjz1,tz);
1230
1231             }
1232
1233             /**************************
1234              * CALCULATE INTERACTIONS *
1235              **************************/
1236
1237             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1238             {
1239
1240             r22              = _mm256_mul_pd(rsq22,rinv22);
1241             r22              = _mm256_andnot_pd(dummy_mask,r22);
1242
1243             /* EWALD ELECTROSTATICS */
1244
1245             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1246             ewrt             = _mm256_mul_pd(r22,ewtabscale);
1247             ewitab           = _mm256_cvttpd_epi32(ewrt);
1248             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1249             ewitab           = _mm_slli_epi32(ewitab,2);
1250             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1251             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1252             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1253             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1254             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1255             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1256             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1257             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
1258             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1259
1260             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1261
1262             /* Update potential sum for this i atom from the interaction with this j atom. */
1263             velec            = _mm256_and_pd(velec,cutoff_mask);
1264             velec            = _mm256_andnot_pd(dummy_mask,velec);
1265             velecsum         = _mm256_add_pd(velecsum,velec);
1266
1267             fscal            = felec;
1268
1269             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1270
1271             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1272
1273             /* Calculate temporary vectorial force */
1274             tx               = _mm256_mul_pd(fscal,dx22);
1275             ty               = _mm256_mul_pd(fscal,dy22);
1276             tz               = _mm256_mul_pd(fscal,dz22);
1277
1278             /* Update vectorial force */
1279             fix2             = _mm256_add_pd(fix2,tx);
1280             fiy2             = _mm256_add_pd(fiy2,ty);
1281             fiz2             = _mm256_add_pd(fiz2,tz);
1282
1283             fjx2             = _mm256_add_pd(fjx2,tx);
1284             fjy2             = _mm256_add_pd(fjy2,ty);
1285             fjz2             = _mm256_add_pd(fjz2,tz);
1286
1287             }
1288
1289             /**************************
1290              * CALCULATE INTERACTIONS *
1291              **************************/
1292
1293             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1294             {
1295
1296             r23              = _mm256_mul_pd(rsq23,rinv23);
1297             r23              = _mm256_andnot_pd(dummy_mask,r23);
1298
1299             /* EWALD ELECTROSTATICS */
1300
1301             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1302             ewrt             = _mm256_mul_pd(r23,ewtabscale);
1303             ewitab           = _mm256_cvttpd_epi32(ewrt);
1304             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1305             ewitab           = _mm_slli_epi32(ewitab,2);
1306             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1307             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1308             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1309             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1310             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1311             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1312             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1313             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_sub_pd(rinv23,sh_ewald),velec));
1314             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1315
1316             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1317
1318             /* Update potential sum for this i atom from the interaction with this j atom. */
1319             velec            = _mm256_and_pd(velec,cutoff_mask);
1320             velec            = _mm256_andnot_pd(dummy_mask,velec);
1321             velecsum         = _mm256_add_pd(velecsum,velec);
1322
1323             fscal            = felec;
1324
1325             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1326
1327             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1328
1329             /* Calculate temporary vectorial force */
1330             tx               = _mm256_mul_pd(fscal,dx23);
1331             ty               = _mm256_mul_pd(fscal,dy23);
1332             tz               = _mm256_mul_pd(fscal,dz23);
1333
1334             /* Update vectorial force */
1335             fix2             = _mm256_add_pd(fix2,tx);
1336             fiy2             = _mm256_add_pd(fiy2,ty);
1337             fiz2             = _mm256_add_pd(fiz2,tz);
1338
1339             fjx3             = _mm256_add_pd(fjx3,tx);
1340             fjy3             = _mm256_add_pd(fjy3,ty);
1341             fjz3             = _mm256_add_pd(fjz3,tz);
1342
1343             }
1344
1345             /**************************
1346              * CALCULATE INTERACTIONS *
1347              **************************/
1348
1349             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1350             {
1351
1352             r31              = _mm256_mul_pd(rsq31,rinv31);
1353             r31              = _mm256_andnot_pd(dummy_mask,r31);
1354
1355             /* EWALD ELECTROSTATICS */
1356
1357             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1358             ewrt             = _mm256_mul_pd(r31,ewtabscale);
1359             ewitab           = _mm256_cvttpd_epi32(ewrt);
1360             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1361             ewitab           = _mm_slli_epi32(ewitab,2);
1362             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1363             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1364             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1365             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1366             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1367             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1368             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1369             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_sub_pd(rinv31,sh_ewald),velec));
1370             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1371
1372             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1373
1374             /* Update potential sum for this i atom from the interaction with this j atom. */
1375             velec            = _mm256_and_pd(velec,cutoff_mask);
1376             velec            = _mm256_andnot_pd(dummy_mask,velec);
1377             velecsum         = _mm256_add_pd(velecsum,velec);
1378
1379             fscal            = felec;
1380
1381             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1382
1383             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1384
1385             /* Calculate temporary vectorial force */
1386             tx               = _mm256_mul_pd(fscal,dx31);
1387             ty               = _mm256_mul_pd(fscal,dy31);
1388             tz               = _mm256_mul_pd(fscal,dz31);
1389
1390             /* Update vectorial force */
1391             fix3             = _mm256_add_pd(fix3,tx);
1392             fiy3             = _mm256_add_pd(fiy3,ty);
1393             fiz3             = _mm256_add_pd(fiz3,tz);
1394
1395             fjx1             = _mm256_add_pd(fjx1,tx);
1396             fjy1             = _mm256_add_pd(fjy1,ty);
1397             fjz1             = _mm256_add_pd(fjz1,tz);
1398
1399             }
1400
1401             /**************************
1402              * CALCULATE INTERACTIONS *
1403              **************************/
1404
1405             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1406             {
1407
1408             r32              = _mm256_mul_pd(rsq32,rinv32);
1409             r32              = _mm256_andnot_pd(dummy_mask,r32);
1410
1411             /* EWALD ELECTROSTATICS */
1412
1413             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1414             ewrt             = _mm256_mul_pd(r32,ewtabscale);
1415             ewitab           = _mm256_cvttpd_epi32(ewrt);
1416             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1417             ewitab           = _mm_slli_epi32(ewitab,2);
1418             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1419             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1420             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1421             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1422             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1423             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1424             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1425             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_sub_pd(rinv32,sh_ewald),velec));
1426             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1427
1428             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1429
1430             /* Update potential sum for this i atom from the interaction with this j atom. */
1431             velec            = _mm256_and_pd(velec,cutoff_mask);
1432             velec            = _mm256_andnot_pd(dummy_mask,velec);
1433             velecsum         = _mm256_add_pd(velecsum,velec);
1434
1435             fscal            = felec;
1436
1437             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1438
1439             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1440
1441             /* Calculate temporary vectorial force */
1442             tx               = _mm256_mul_pd(fscal,dx32);
1443             ty               = _mm256_mul_pd(fscal,dy32);
1444             tz               = _mm256_mul_pd(fscal,dz32);
1445
1446             /* Update vectorial force */
1447             fix3             = _mm256_add_pd(fix3,tx);
1448             fiy3             = _mm256_add_pd(fiy3,ty);
1449             fiz3             = _mm256_add_pd(fiz3,tz);
1450
1451             fjx2             = _mm256_add_pd(fjx2,tx);
1452             fjy2             = _mm256_add_pd(fjy2,ty);
1453             fjz2             = _mm256_add_pd(fjz2,tz);
1454
1455             }
1456
1457             /**************************
1458              * CALCULATE INTERACTIONS *
1459              **************************/
1460
1461             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1462             {
1463
1464             r33              = _mm256_mul_pd(rsq33,rinv33);
1465             r33              = _mm256_andnot_pd(dummy_mask,r33);
1466
1467             /* EWALD ELECTROSTATICS */
1468
1469             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1470             ewrt             = _mm256_mul_pd(r33,ewtabscale);
1471             ewitab           = _mm256_cvttpd_epi32(ewrt);
1472             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1473             ewitab           = _mm_slli_epi32(ewitab,2);
1474             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1475             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1476             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1477             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1478             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1479             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1480             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1481             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_sub_pd(rinv33,sh_ewald),velec));
1482             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1483
1484             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1485
1486             /* Update potential sum for this i atom from the interaction with this j atom. */
1487             velec            = _mm256_and_pd(velec,cutoff_mask);
1488             velec            = _mm256_andnot_pd(dummy_mask,velec);
1489             velecsum         = _mm256_add_pd(velecsum,velec);
1490
1491             fscal            = felec;
1492
1493             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1494
1495             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1496
1497             /* Calculate temporary vectorial force */
1498             tx               = _mm256_mul_pd(fscal,dx33);
1499             ty               = _mm256_mul_pd(fscal,dy33);
1500             tz               = _mm256_mul_pd(fscal,dz33);
1501
1502             /* Update vectorial force */
1503             fix3             = _mm256_add_pd(fix3,tx);
1504             fiy3             = _mm256_add_pd(fiy3,ty);
1505             fiz3             = _mm256_add_pd(fiz3,tz);
1506
1507             fjx3             = _mm256_add_pd(fjx3,tx);
1508             fjy3             = _mm256_add_pd(fjy3,ty);
1509             fjz3             = _mm256_add_pd(fjz3,tz);
1510
1511             }
1512
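            /* Padding entries carry a negative j index, so their accumulated forces
             * are routed to the local scratch buffer instead of the force array;
             * the decrement call then subtracts the per-atom j-force accumulators
             * from the selected targets.
             */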
1513             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1514             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1515             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1516             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1517
1518             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1519                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1520                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1521
1522             /* Inner loop uses 467 flops */
1523         }
1524
1525         /* End of innermost loop */
1526
1527         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1528                                                  f+i_coord_offset,fshift+i_shift_offset);
1529
1530         ggid                        = gid[iidx];
1531         /* Update potential energies */
1532         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1533         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
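        /* Each update_1pot call sums the four lanes of the corresponding potential
         * accumulator and adds the result to the energy bucket selected by the
         * energy-group id ggid of this i entry.
         */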
1534
1535         /* Increment number of inner iterations */
1536         inneriter                  += j_index_end - j_index_start;
1537
1538         /* Outer loop uses 26 flops */
1539     }
1540
1541     /* Increment number of outer iterations */
1542     outeriter        += nri;
1543
1544     /* Update outer/inner flops */
1545
1546     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*467);
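    /* The arguments to inc_nrnb reflect the per-loop flop estimates noted in the
     * comments above (26 flops per outer and 467 per inner iteration); they feed
     * the performance accounting only and do not affect forces or energies.
     */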
1547 }
1548 /*
1549  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_double
1550  * Electrostatics interaction: Ewald
1551  * VdW interaction:            LennardJones
1552  * Geometry:                   Water4-Water4
1553  * Calculate force/pot:        Force
1554  */
1555 void
1556 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_double
1557                     (t_nblist                    * gmx_restrict       nlist,
1558                      rvec                        * gmx_restrict          xx,
1559                      rvec                        * gmx_restrict          ff,
1560                      struct t_forcerec           * gmx_restrict          fr,
1561                      t_mdatoms                   * gmx_restrict     mdatoms,
1562                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1563                      t_nrnb                      * gmx_restrict        nrnb)
1564 {
1565     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1566      * just 0 for non-waters.
1567      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1568      * jnr indices corresponding to data put in the four positions in the SIMD register.
1569      */
1570     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1571     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1572     int              jnrA,jnrB,jnrC,jnrD;
1573     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1574     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1575     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1576     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1577     real             rcutoff_scalar;
1578     real             *shiftvec,*fshift,*x,*f;
1579     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1580     real             scratch[4*DIM];
1581     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1582     real *           vdwioffsetptr0;
1583     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1584     real *           vdwioffsetptr1;
1585     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1586     real *           vdwioffsetptr2;
1587     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1588     real *           vdwioffsetptr3;
1589     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1590     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1591     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1592     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1593     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1594     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1595     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1596     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1597     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1598     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1599     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1600     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1601     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1602     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1603     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1604     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1605     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1606     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1607     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1608     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1609     real             *charge;
1610     int              nvdwtype;
1611     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1612     int              *vdwtype;
1613     real             *vdwparam;
1614     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1615     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1616     __m128i          ewitab;
1617     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1618     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1619     real             *ewtab;
1620     __m256d          dummy_mask,cutoff_mask;
1621     __m128           tmpmask0,tmpmask1;
1622     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1623     __m256d          one     = _mm256_set1_pd(1.0);
1624     __m256d          two     = _mm256_set1_pd(2.0);
1625     x                = xx[0];
1626     f                = ff[0];
1627
1628     nri              = nlist->nri;
1629     iinr             = nlist->iinr;
1630     jindex           = nlist->jindex;
1631     jjnr             = nlist->jjnr;
1632     shiftidx         = nlist->shift;
1633     gid              = nlist->gid;
1634     shiftvec         = fr->shift_vec[0];
1635     fshift           = fr->fshift[0];
1636     facel            = _mm256_set1_pd(fr->ic->epsfac);
1637     charge           = mdatoms->chargeA;
1638     nvdwtype         = fr->ntype;
1639     vdwparam         = fr->nbfp;
1640     vdwtype          = mdatoms->typeA;
1641
1642     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
1643     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1644     beta2            = _mm256_mul_pd(beta,beta);
1645     beta3            = _mm256_mul_pd(beta,beta2);
1646
1647     ewtab            = fr->ic->tabq_coul_F;
1648     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
1649     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1650
1651     /* Setup water-specific parameters */
1652     inr              = nlist->iinr[0];
1653     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1654     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1655     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1656     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1657
1658     jq1              = _mm256_set1_pd(charge[inr+1]);
1659     jq2              = _mm256_set1_pd(charge[inr+2]);
1660     jq3              = _mm256_set1_pd(charge[inr+3]);
1661     vdwjidx0A        = 2*vdwtype[inr+0];
1662     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1663     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1664     qq11             = _mm256_mul_pd(iq1,jq1);
1665     qq12             = _mm256_mul_pd(iq1,jq2);
1666     qq13             = _mm256_mul_pd(iq1,jq3);
1667     qq21             = _mm256_mul_pd(iq2,jq1);
1668     qq22             = _mm256_mul_pd(iq2,jq2);
1669     qq23             = _mm256_mul_pd(iq2,jq3);
1670     qq31             = _mm256_mul_pd(iq3,jq1);
1671     qq32             = _mm256_mul_pd(iq3,jq2);
1672     qq33             = _mm256_mul_pd(iq3,jq3);
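    /* All Water4 molecules are assumed to share the same topology, so the charges
     * and LJ parameters are read once from the first i entry and broadcast, and the
     * nine charge products qq11..qq33 are precomputed outside the loops. Atom 0
     * contributes only the LJ pair (c6_00/c12_00); the Coulomb terms involve atoms 1-3.
     */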
1673
1674     /* When we use explicit cutoffs the value must be identical for elec and VdW, so we use the electrostatics cutoff as an arbitrary choice */
1675     rcutoff_scalar   = fr->ic->rcoulomb;
1676     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1677     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1678
1679     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
1680     rvdw             = _mm256_set1_pd(fr->ic->rvdw);
1681
1682     /* Avoid stupid compiler warnings */
1683     jnrA = jnrB = jnrC = jnrD = 0;
1684     j_coord_offsetA = 0;
1685     j_coord_offsetB = 0;
1686     j_coord_offsetC = 0;
1687     j_coord_offsetD = 0;
1688
1689     outeriter        = 0;
1690     inneriter        = 0;
1691
1692     for(iidx=0;iidx<4*DIM;iidx++)
1693     {
1694         scratch[iidx] = 0.0;
1695     }
1696
1697     /* Start outer loop over neighborlists */
1698     for(iidx=0; iidx<nri; iidx++)
1699     {
1700         /* Load shift vector for this list */
1701         i_shift_offset   = DIM*shiftidx[iidx];
1702
1703         /* Load limits for loop over neighbors */
1704         j_index_start    = jindex[iidx];
1705         j_index_end      = jindex[iidx+1];
1706
1707         /* Get outer coordinate index */
1708         inr              = iinr[iidx];
1709         i_coord_offset   = DIM*inr;
1710
1711         /* Load i particle coords and add shift vector */
1712         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1713                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1714
1715         fix0             = _mm256_setzero_pd();
1716         fiy0             = _mm256_setzero_pd();
1717         fiz0             = _mm256_setzero_pd();
1718         fix1             = _mm256_setzero_pd();
1719         fiy1             = _mm256_setzero_pd();
1720         fiz1             = _mm256_setzero_pd();
1721         fix2             = _mm256_setzero_pd();
1722         fiy2             = _mm256_setzero_pd();
1723         fiz2             = _mm256_setzero_pd();
1724         fix3             = _mm256_setzero_pd();
1725         fiy3             = _mm256_setzero_pd();
1726         fiz3             = _mm256_setzero_pd();
1727
1728         /* Start inner kernel loop */
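        /* The loop below processes four j waters per iteration. When the list length
         * is not a multiple of four, the trailing entries are padded with negative
         * indices, which ends this loop early and leaves the final, partly filled
         * quadruplet to the masked remainder code further down.
         */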
1729         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1730         {
1731
1732             /* Get j neighbor index, and coordinate index */
1733             jnrA             = jjnr[jidx];
1734             jnrB             = jjnr[jidx+1];
1735             jnrC             = jjnr[jidx+2];
1736             jnrD             = jjnr[jidx+3];
1737             j_coord_offsetA  = DIM*jnrA;
1738             j_coord_offsetB  = DIM*jnrB;
1739             j_coord_offsetC  = DIM*jnrC;
1740             j_coord_offsetD  = DIM*jnrD;
1741
1742             /* load j atom coordinates */
1743             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1744                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1745                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1746                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1747
1748             /* Calculate displacement vector */
1749             dx00             = _mm256_sub_pd(ix0,jx0);
1750             dy00             = _mm256_sub_pd(iy0,jy0);
1751             dz00             = _mm256_sub_pd(iz0,jz0);
1752             dx11             = _mm256_sub_pd(ix1,jx1);
1753             dy11             = _mm256_sub_pd(iy1,jy1);
1754             dz11             = _mm256_sub_pd(iz1,jz1);
1755             dx12             = _mm256_sub_pd(ix1,jx2);
1756             dy12             = _mm256_sub_pd(iy1,jy2);
1757             dz12             = _mm256_sub_pd(iz1,jz2);
1758             dx13             = _mm256_sub_pd(ix1,jx3);
1759             dy13             = _mm256_sub_pd(iy1,jy3);
1760             dz13             = _mm256_sub_pd(iz1,jz3);
1761             dx21             = _mm256_sub_pd(ix2,jx1);
1762             dy21             = _mm256_sub_pd(iy2,jy1);
1763             dz21             = _mm256_sub_pd(iz2,jz1);
1764             dx22             = _mm256_sub_pd(ix2,jx2);
1765             dy22             = _mm256_sub_pd(iy2,jy2);
1766             dz22             = _mm256_sub_pd(iz2,jz2);
1767             dx23             = _mm256_sub_pd(ix2,jx3);
1768             dy23             = _mm256_sub_pd(iy2,jy3);
1769             dz23             = _mm256_sub_pd(iz2,jz3);
1770             dx31             = _mm256_sub_pd(ix3,jx1);
1771             dy31             = _mm256_sub_pd(iy3,jy1);
1772             dz31             = _mm256_sub_pd(iz3,jz1);
1773             dx32             = _mm256_sub_pd(ix3,jx2);
1774             dy32             = _mm256_sub_pd(iy3,jy2);
1775             dz32             = _mm256_sub_pd(iz3,jz2);
1776             dx33             = _mm256_sub_pd(ix3,jx3);
1777             dy33             = _mm256_sub_pd(iy3,jy3);
1778             dz33             = _mm256_sub_pd(iz3,jz3);
1779
1780             /* Calculate squared distance and things based on it */
1781             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1782             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1783             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1784             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1785             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1786             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1787             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1788             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1789             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1790             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1791
1792             rinv11           = avx256_invsqrt_d(rsq11);
1793             rinv12           = avx256_invsqrt_d(rsq12);
1794             rinv13           = avx256_invsqrt_d(rsq13);
1795             rinv21           = avx256_invsqrt_d(rsq21);
1796             rinv22           = avx256_invsqrt_d(rsq22);
1797             rinv23           = avx256_invsqrt_d(rsq23);
1798             rinv31           = avx256_invsqrt_d(rsq31);
1799             rinv32           = avx256_invsqrt_d(rsq32);
1800             rinv33           = avx256_invsqrt_d(rsq33);
1801
1802             rinvsq00         = avx256_inv_d(rsq00);
1803             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1804             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1805             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1806             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1807             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1808             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1809             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1810             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1811             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
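            /* Only the 0-0 (LJ-only) pair needs 1/r^2, so rinvsq00 is computed
             * directly from rsq00; the charged pairs also need 1/r for the Ewald
             * term, so their rinvsq values are formed by squaring rinv.
             */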
1812
1813             fjx0             = _mm256_setzero_pd();
1814             fjy0             = _mm256_setzero_pd();
1815             fjz0             = _mm256_setzero_pd();
1816             fjx1             = _mm256_setzero_pd();
1817             fjy1             = _mm256_setzero_pd();
1818             fjz1             = _mm256_setzero_pd();
1819             fjx2             = _mm256_setzero_pd();
1820             fjy2             = _mm256_setzero_pd();
1821             fjz2             = _mm256_setzero_pd();
1822             fjx3             = _mm256_setzero_pd();
1823             fjy3             = _mm256_setzero_pd();
1824             fjz3             = _mm256_setzero_pd();
1825
1826             /**************************
1827              * CALCULATE INTERACTIONS *
1828              **************************/
1829
1830             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1831             {
1832
1833             /* LENNARD-JONES DISPERSION/REPULSION */
1834
1835             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1836             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
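            /* Scalar form of the Lennard-Jones force term above:
             *   fscal = (c12/r^12 - c6/r^6) / r^2
             * i.e. the LJ force divided by r (the derivative factors 6 and 12 are
             * presumably already folded into the stored c6/c12 parameters). The
             * potential-shift constant does not enter a force-only kernel, since
             * shifting the potential by a constant inside the cutoff leaves the
             * force unchanged.
             */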
1837
1838             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1839
1840             fscal            = fvdw;
1841
1842             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1843
1844             /* Calculate temporary vectorial force */
1845             tx               = _mm256_mul_pd(fscal,dx00);
1846             ty               = _mm256_mul_pd(fscal,dy00);
1847             tz               = _mm256_mul_pd(fscal,dz00);
1848
1849             /* Update vectorial force */
1850             fix0             = _mm256_add_pd(fix0,tx);
1851             fiy0             = _mm256_add_pd(fiy0,ty);
1852             fiz0             = _mm256_add_pd(fiz0,tz);
1853
1854             fjx0             = _mm256_add_pd(fjx0,tx);
1855             fjy0             = _mm256_add_pd(fjy0,ty);
1856             fjz0             = _mm256_add_pd(fjz0,tz);
1857
1858             }
1859
1860             /**************************
1861              * CALCULATE INTERACTIONS *
1862              **************************/
1863
1864             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1865             {
1866
1867             r11              = _mm256_mul_pd(rsq11,rinv11);
1868
1869             /* EWALD ELECTROSTATICS */
1870
1871             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1872             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1873             ewitab           = _mm256_cvttpd_epi32(ewrt);
1874             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1875             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1876                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1877                                             &ewtabF,&ewtabFn);
1878             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1879             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
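            /* In this force-only kernel the Coulomb table holds a single force value
             * per point (hence no index scaling), and the two neighboring entries
             * are blended linearly:
             *   F     = (1-eps)*tab[i] + eps*tab[i+1]
             *   felec = qq*(1/r)*(1/r^2 - F)
             * The same pattern repeats for every charged pair below.
             */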
1880
1881             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1882
1883             fscal            = felec;
1884
1885             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1886
1887             /* Calculate temporary vectorial force */
1888             tx               = _mm256_mul_pd(fscal,dx11);
1889             ty               = _mm256_mul_pd(fscal,dy11);
1890             tz               = _mm256_mul_pd(fscal,dz11);
1891
1892             /* Update vectorial force */
1893             fix1             = _mm256_add_pd(fix1,tx);
1894             fiy1             = _mm256_add_pd(fiy1,ty);
1895             fiz1             = _mm256_add_pd(fiz1,tz);
1896
1897             fjx1             = _mm256_add_pd(fjx1,tx);
1898             fjy1             = _mm256_add_pd(fjy1,ty);
1899             fjz1             = _mm256_add_pd(fjz1,tz);
1900
1901             }
1902
1903             /**************************
1904              * CALCULATE INTERACTIONS *
1905              **************************/
1906
1907             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1908             {
1909
1910             r12              = _mm256_mul_pd(rsq12,rinv12);
1911
1912             /* EWALD ELECTROSTATICS */
1913
1914             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1915             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1916             ewitab           = _mm256_cvttpd_epi32(ewrt);
1917             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1918             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1919                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1920                                             &ewtabF,&ewtabFn);
1921             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1922             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1923
1924             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1925
1926             fscal            = felec;
1927
1928             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1929
1930             /* Calculate temporary vectorial force */
1931             tx               = _mm256_mul_pd(fscal,dx12);
1932             ty               = _mm256_mul_pd(fscal,dy12);
1933             tz               = _mm256_mul_pd(fscal,dz12);
1934
1935             /* Update vectorial force */
1936             fix1             = _mm256_add_pd(fix1,tx);
1937             fiy1             = _mm256_add_pd(fiy1,ty);
1938             fiz1             = _mm256_add_pd(fiz1,tz);
1939
1940             fjx2             = _mm256_add_pd(fjx2,tx);
1941             fjy2             = _mm256_add_pd(fjy2,ty);
1942             fjz2             = _mm256_add_pd(fjz2,tz);
1943
1944             }
1945
1946             /**************************
1947              * CALCULATE INTERACTIONS *
1948              **************************/
1949
1950             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1951             {
1952
1953             r13              = _mm256_mul_pd(rsq13,rinv13);
1954
1955             /* EWALD ELECTROSTATICS */
1956
1957             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1958             ewrt             = _mm256_mul_pd(r13,ewtabscale);
1959             ewitab           = _mm256_cvttpd_epi32(ewrt);
1960             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1961             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1962                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1963                                             &ewtabF,&ewtabFn);
1964             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1965             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1966
1967             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1968
1969             fscal            = felec;
1970
1971             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1972
1973             /* Calculate temporary vectorial force */
1974             tx               = _mm256_mul_pd(fscal,dx13);
1975             ty               = _mm256_mul_pd(fscal,dy13);
1976             tz               = _mm256_mul_pd(fscal,dz13);
1977
1978             /* Update vectorial force */
1979             fix1             = _mm256_add_pd(fix1,tx);
1980             fiy1             = _mm256_add_pd(fiy1,ty);
1981             fiz1             = _mm256_add_pd(fiz1,tz);
1982
1983             fjx3             = _mm256_add_pd(fjx3,tx);
1984             fjy3             = _mm256_add_pd(fjy3,ty);
1985             fjz3             = _mm256_add_pd(fjz3,tz);
1986
1987             }
1988
1989             /**************************
1990              * CALCULATE INTERACTIONS *
1991              **************************/
1992
1993             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1994             {
1995
1996             r21              = _mm256_mul_pd(rsq21,rinv21);
1997
1998             /* EWALD ELECTROSTATICS */
1999
2000             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2001             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2002             ewitab           = _mm256_cvttpd_epi32(ewrt);
2003             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2004             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2005                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2006                                             &ewtabF,&ewtabFn);
2007             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2008             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2009
2010             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2011
2012             fscal            = felec;
2013
2014             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2015
2016             /* Calculate temporary vectorial force */
2017             tx               = _mm256_mul_pd(fscal,dx21);
2018             ty               = _mm256_mul_pd(fscal,dy21);
2019             tz               = _mm256_mul_pd(fscal,dz21);
2020
2021             /* Update vectorial force */
2022             fix2             = _mm256_add_pd(fix2,tx);
2023             fiy2             = _mm256_add_pd(fiy2,ty);
2024             fiz2             = _mm256_add_pd(fiz2,tz);
2025
2026             fjx1             = _mm256_add_pd(fjx1,tx);
2027             fjy1             = _mm256_add_pd(fjy1,ty);
2028             fjz1             = _mm256_add_pd(fjz1,tz);
2029
2030             }
2031
2032             /**************************
2033              * CALCULATE INTERACTIONS *
2034              **************************/
2035
2036             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2037             {
2038
2039             r22              = _mm256_mul_pd(rsq22,rinv22);
2040
2041             /* EWALD ELECTROSTATICS */
2042
2043             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2044             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2045             ewitab           = _mm256_cvttpd_epi32(ewrt);
2046             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2047             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2048                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2049                                             &ewtabF,&ewtabFn);
2050             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2051             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2052
2053             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2054
2055             fscal            = felec;
2056
2057             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2058
2059             /* Calculate temporary vectorial force */
2060             tx               = _mm256_mul_pd(fscal,dx22);
2061             ty               = _mm256_mul_pd(fscal,dy22);
2062             tz               = _mm256_mul_pd(fscal,dz22);
2063
2064             /* Update vectorial force */
2065             fix2             = _mm256_add_pd(fix2,tx);
2066             fiy2             = _mm256_add_pd(fiy2,ty);
2067             fiz2             = _mm256_add_pd(fiz2,tz);
2068
2069             fjx2             = _mm256_add_pd(fjx2,tx);
2070             fjy2             = _mm256_add_pd(fjy2,ty);
2071             fjz2             = _mm256_add_pd(fjz2,tz);
2072
2073             }
2074
2075             /**************************
2076              * CALCULATE INTERACTIONS *
2077              **************************/
2078
2079             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2080             {
2081
2082             r23              = _mm256_mul_pd(rsq23,rinv23);
2083
2084             /* EWALD ELECTROSTATICS */
2085
2086             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2087             ewrt             = _mm256_mul_pd(r23,ewtabscale);
2088             ewitab           = _mm256_cvttpd_epi32(ewrt);
2089             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2090             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2091                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2092                                             &ewtabF,&ewtabFn);
2093             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2094             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2095
2096             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2097
2098             fscal            = felec;
2099
2100             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2101
2102             /* Calculate temporary vectorial force */
2103             tx               = _mm256_mul_pd(fscal,dx23);
2104             ty               = _mm256_mul_pd(fscal,dy23);
2105             tz               = _mm256_mul_pd(fscal,dz23);
2106
2107             /* Update vectorial force */
2108             fix2             = _mm256_add_pd(fix2,tx);
2109             fiy2             = _mm256_add_pd(fiy2,ty);
2110             fiz2             = _mm256_add_pd(fiz2,tz);
2111
2112             fjx3             = _mm256_add_pd(fjx3,tx);
2113             fjy3             = _mm256_add_pd(fjy3,ty);
2114             fjz3             = _mm256_add_pd(fjz3,tz);
2115
2116             }
2117
2118             /**************************
2119              * CALCULATE INTERACTIONS *
2120              **************************/
2121
2122             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2123             {
2124
2125             r31              = _mm256_mul_pd(rsq31,rinv31);
2126
2127             /* EWALD ELECTROSTATICS */
2128
2129             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2130             ewrt             = _mm256_mul_pd(r31,ewtabscale);
2131             ewitab           = _mm256_cvttpd_epi32(ewrt);
2132             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2133             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2134                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2135                                             &ewtabF,&ewtabFn);
2136             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2137             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2138
2139             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2140
2141             fscal            = felec;
2142
2143             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2144
2145             /* Calculate temporary vectorial force */
2146             tx               = _mm256_mul_pd(fscal,dx31);
2147             ty               = _mm256_mul_pd(fscal,dy31);
2148             tz               = _mm256_mul_pd(fscal,dz31);
2149
2150             /* Update vectorial force */
2151             fix3             = _mm256_add_pd(fix3,tx);
2152             fiy3             = _mm256_add_pd(fiy3,ty);
2153             fiz3             = _mm256_add_pd(fiz3,tz);
2154
2155             fjx1             = _mm256_add_pd(fjx1,tx);
2156             fjy1             = _mm256_add_pd(fjy1,ty);
2157             fjz1             = _mm256_add_pd(fjz1,tz);
2158
2159             }
2160
2161             /**************************
2162              * CALCULATE INTERACTIONS *
2163              **************************/
2164
2165             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2166             {
2167
2168             r32              = _mm256_mul_pd(rsq32,rinv32);
2169
2170             /* EWALD ELECTROSTATICS */
2171
2172             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2173             ewrt             = _mm256_mul_pd(r32,ewtabscale);
2174             ewitab           = _mm256_cvttpd_epi32(ewrt);
2175             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2176             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2177                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2178                                             &ewtabF,&ewtabFn);
2179             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2180             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2181
2182             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2183
2184             fscal            = felec;
2185
2186             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2187
2188             /* Calculate temporary vectorial force */
2189             tx               = _mm256_mul_pd(fscal,dx32);
2190             ty               = _mm256_mul_pd(fscal,dy32);
2191             tz               = _mm256_mul_pd(fscal,dz32);
2192
2193             /* Update vectorial force */
2194             fix3             = _mm256_add_pd(fix3,tx);
2195             fiy3             = _mm256_add_pd(fiy3,ty);
2196             fiz3             = _mm256_add_pd(fiz3,tz);
2197
2198             fjx2             = _mm256_add_pd(fjx2,tx);
2199             fjy2             = _mm256_add_pd(fjy2,ty);
2200             fjz2             = _mm256_add_pd(fjz2,tz);
2201
2202             }
2203
2204             /**************************
2205              * CALCULATE INTERACTIONS *
2206              **************************/
2207
2208             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2209             {
2210
2211             r33              = _mm256_mul_pd(rsq33,rinv33);
2212
2213             /* EWALD ELECTROSTATICS */
2214
2215             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to integer */
2216             ewrt             = _mm256_mul_pd(r33,ewtabscale);
2217             ewitab           = _mm256_cvttpd_epi32(ewrt);
2218             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2219             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2220                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2221                                             &ewtabF,&ewtabFn);
2222             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2223             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2224
2225             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2226
2227             fscal            = felec;
2228
2229             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2230
2231             /* Calculate temporary vectorial force */
2232             tx               = _mm256_mul_pd(fscal,dx33);
2233             ty               = _mm256_mul_pd(fscal,dy33);
2234             tz               = _mm256_mul_pd(fscal,dz33);
2235
2236             /* Update vectorial force */
2237             fix3             = _mm256_add_pd(fix3,tx);
2238             fiy3             = _mm256_add_pd(fiy3,ty);
2239             fiz3             = _mm256_add_pd(fiz3,tz);
2240
2241             fjx3             = _mm256_add_pd(fjx3,tx);
2242             fjy3             = _mm256_add_pd(fjy3,ty);
2243             fjz3             = _mm256_add_pd(fjz3,tz);
2244
2245             }
2246
2247             fjptrA             = f+j_coord_offsetA;
2248             fjptrB             = f+j_coord_offsetB;
2249             fjptrC             = f+j_coord_offsetC;
2250             fjptrD             = f+j_coord_offsetD;
2251
2252             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2253                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2254                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
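                 /* The accumulated j forces are subtracted from the force array here: each fj
                  * accumulator collected the same +tx/+ty/+tz that was added to the i atoms, and
                  * by Newton's third law the j atoms must receive the opposite force.
                  */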
2255
2256             /* Inner loop uses 384 flops */
2257         }
2258
2259         if(jidx<j_index_end)
2260         {
2261
2262             /* Get j neighbor index, and coordinate index */
2263             jnrlistA         = jjnr[jidx];
2264             jnrlistB         = jjnr[jidx+1];
2265             jnrlistC         = jjnr[jidx+2];
2266             jnrlistD         = jjnr[jidx+3];
2267             /* Sign of each element will be negative for non-real (padding) atoms.
2268              * The resulting mask has all bits set for dummy entries and none for real ones,
2269              * so it can be used as val = _mm256_andnot_pd(dummy_mask,val) to clear dummy entries.
2270              */
2271             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2272
2273             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2274             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2275             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
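                 /* The four 32-bit comparison results are duplicated pairwise and the two halves
                  * combined into one 256-bit register, widening each 32-bit mask into a 64-bit
                  * lane mask that lines up with the double-precision SIMD lanes.
                  */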
2276
2277             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2278             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2279             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2280             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2281             j_coord_offsetA  = DIM*jnrA;
2282             j_coord_offsetB  = DIM*jnrB;
2283             j_coord_offsetC  = DIM*jnrC;
2284             j_coord_offsetD  = DIM*jnrD;
2285
2286             /* load j atom coordinates */
2287             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2288                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2289                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2290                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
2291
2292             /* Calculate displacement vector */
2293             dx00             = _mm256_sub_pd(ix0,jx0);
2294             dy00             = _mm256_sub_pd(iy0,jy0);
2295             dz00             = _mm256_sub_pd(iz0,jz0);
2296             dx11             = _mm256_sub_pd(ix1,jx1);
2297             dy11             = _mm256_sub_pd(iy1,jy1);
2298             dz11             = _mm256_sub_pd(iz1,jz1);
2299             dx12             = _mm256_sub_pd(ix1,jx2);
2300             dy12             = _mm256_sub_pd(iy1,jy2);
2301             dz12             = _mm256_sub_pd(iz1,jz2);
2302             dx13             = _mm256_sub_pd(ix1,jx3);
2303             dy13             = _mm256_sub_pd(iy1,jy3);
2304             dz13             = _mm256_sub_pd(iz1,jz3);
2305             dx21             = _mm256_sub_pd(ix2,jx1);
2306             dy21             = _mm256_sub_pd(iy2,jy1);
2307             dz21             = _mm256_sub_pd(iz2,jz1);
2308             dx22             = _mm256_sub_pd(ix2,jx2);
2309             dy22             = _mm256_sub_pd(iy2,jy2);
2310             dz22             = _mm256_sub_pd(iz2,jz2);
2311             dx23             = _mm256_sub_pd(ix2,jx3);
2312             dy23             = _mm256_sub_pd(iy2,jy3);
2313             dz23             = _mm256_sub_pd(iz2,jz3);
2314             dx31             = _mm256_sub_pd(ix3,jx1);
2315             dy31             = _mm256_sub_pd(iy3,jy1);
2316             dz31             = _mm256_sub_pd(iz3,jz1);
2317             dx32             = _mm256_sub_pd(ix3,jx2);
2318             dy32             = _mm256_sub_pd(iy3,jy2);
2319             dz32             = _mm256_sub_pd(iz3,jz2);
2320             dx33             = _mm256_sub_pd(ix3,jx3);
2321             dy33             = _mm256_sub_pd(iy3,jy3);
2322             dz33             = _mm256_sub_pd(iz3,jz3);
2323
2324             /* Calculate squared distance and things based on it */
2325             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2326             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2327             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2328             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2329             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2330             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2331             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2332             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2333             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2334             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2335
2336             rinv11           = avx256_invsqrt_d(rsq11);
2337             rinv12           = avx256_invsqrt_d(rsq12);
2338             rinv13           = avx256_invsqrt_d(rsq13);
2339             rinv21           = avx256_invsqrt_d(rsq21);
2340             rinv22           = avx256_invsqrt_d(rsq22);
2341             rinv23           = avx256_invsqrt_d(rsq23);
2342             rinv31           = avx256_invsqrt_d(rsq31);
2343             rinv32           = avx256_invsqrt_d(rsq32);
2344             rinv33           = avx256_invsqrt_d(rsq33);
2345
2346             rinvsq00         = avx256_inv_d(rsq00);
2347             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
2348             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
2349             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
2350             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
2351             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
2352             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
2353             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
2354             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
2355             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
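                 /* Site 0 only has Lennard-Jones interactions in this Water4-Water4 kernel, so the
                  * 0-0 pair needs 1/r^2 directly and uses a full reciprocal; the charged sites 1-3
                  * need 1/r for the Ewald table lookup, so they use invsqrt and obtain 1/r^2 by
                  * squaring.
                  */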
2356
2357             fjx0             = _mm256_setzero_pd();
2358             fjy0             = _mm256_setzero_pd();
2359             fjz0             = _mm256_setzero_pd();
2360             fjx1             = _mm256_setzero_pd();
2361             fjy1             = _mm256_setzero_pd();
2362             fjz1             = _mm256_setzero_pd();
2363             fjx2             = _mm256_setzero_pd();
2364             fjy2             = _mm256_setzero_pd();
2365             fjz2             = _mm256_setzero_pd();
2366             fjx3             = _mm256_setzero_pd();
2367             fjy3             = _mm256_setzero_pd();
2368             fjz3             = _mm256_setzero_pd();
2369
2370             /**************************
2371              * CALCULATE INTERACTIONS *
2372              **************************/
2373
2374             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2375             {
2376
2377             /* LENNARD-JONES DISPERSION/REPULSION */
2378
2379             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2380             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
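                 /* A scalar sketch of the two lines above (illustrative names, not kernel
                  * variables):
                  *
                  *   rinv6 = rinvsq*rinvsq*rinvsq;
                  *   fvdw  = (c12*rinv6 - c6)*rinv6*rinvsq;
                  *
                  * which, with the c6/c12 parameters as prepared by the caller, is the
                  * Lennard-Jones force divided by r, ready to be scaled onto dx00/dy00/dz00.
                  */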
2381
2382             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2383
2384             fscal            = fvdw;
2385
2386             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2387
2388             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2389
2390             /* Calculate temporary vectorial force */
2391             tx               = _mm256_mul_pd(fscal,dx00);
2392             ty               = _mm256_mul_pd(fscal,dy00);
2393             tz               = _mm256_mul_pd(fscal,dz00);
2394
2395             /* Update vectorial force */
2396             fix0             = _mm256_add_pd(fix0,tx);
2397             fiy0             = _mm256_add_pd(fiy0,ty);
2398             fiz0             = _mm256_add_pd(fiz0,tz);
2399
2400             fjx0             = _mm256_add_pd(fjx0,tx);
2401             fjy0             = _mm256_add_pd(fjy0,ty);
2402             fjz0             = _mm256_add_pd(fjz0,tz);
2403
2404             }
2405
2406             /**************************
2407              * CALCULATE INTERACTIONS *
2408              **************************/
2409
2410             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2411             {
2412
2413             r11              = _mm256_mul_pd(rsq11,rinv11);
2414             r11              = _mm256_andnot_pd(dummy_mask,r11);
2415
2416             /* EWALD ELECTROSTATICS */
2417
2418             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to integer */
2419             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2420             ewitab           = _mm256_cvttpd_epi32(ewrt);
2421             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2422             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2423                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2424                                             &ewtabF,&ewtabFn);
2425             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2426             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2427
2428             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2429
2430             fscal            = felec;
2431
2432             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2433
2434             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2435
2436             /* Calculate temporary vectorial force */
2437             tx               = _mm256_mul_pd(fscal,dx11);
2438             ty               = _mm256_mul_pd(fscal,dy11);
2439             tz               = _mm256_mul_pd(fscal,dz11);
2440
2441             /* Update vectorial force */
2442             fix1             = _mm256_add_pd(fix1,tx);
2443             fiy1             = _mm256_add_pd(fiy1,ty);
2444             fiz1             = _mm256_add_pd(fiz1,tz);
2445
2446             fjx1             = _mm256_add_pd(fjx1,tx);
2447             fjy1             = _mm256_add_pd(fjy1,ty);
2448             fjz1             = _mm256_add_pd(fjz1,tz);
2449
2450             }
2451
2452             /**************************
2453              * CALCULATE INTERACTIONS *
2454              **************************/
2455
2456             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2457             {
2458
2459             r12              = _mm256_mul_pd(rsq12,rinv12);
2460             r12              = _mm256_andnot_pd(dummy_mask,r12);
2461
2462             /* EWALD ELECTROSTATICS */
2463
2464             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to integer */
2465             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2466             ewitab           = _mm256_cvttpd_epi32(ewrt);
2467             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2468             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2469                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2470                                             &ewtabF,&ewtabFn);
2471             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2472             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2473
2474             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2475
2476             fscal            = felec;
2477
2478             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2479
2480             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2481
2482             /* Calculate temporary vectorial force */
2483             tx               = _mm256_mul_pd(fscal,dx12);
2484             ty               = _mm256_mul_pd(fscal,dy12);
2485             tz               = _mm256_mul_pd(fscal,dz12);
2486
2487             /* Update vectorial force */
2488             fix1             = _mm256_add_pd(fix1,tx);
2489             fiy1             = _mm256_add_pd(fiy1,ty);
2490             fiz1             = _mm256_add_pd(fiz1,tz);
2491
2492             fjx2             = _mm256_add_pd(fjx2,tx);
2493             fjy2             = _mm256_add_pd(fjy2,ty);
2494             fjz2             = _mm256_add_pd(fjz2,tz);
2495
2496             }
2497
2498             /**************************
2499              * CALCULATE INTERACTIONS *
2500              **************************/
2501
2502             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2503             {
2504
2505             r13              = _mm256_mul_pd(rsq13,rinv13);
2506             r13              = _mm256_andnot_pd(dummy_mask,r13);
2507
2508             /* EWALD ELECTROSTATICS */
2509
2510             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to integer */
2511             ewrt             = _mm256_mul_pd(r13,ewtabscale);
2512             ewitab           = _mm256_cvttpd_epi32(ewrt);
2513             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2514             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2515                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2516                                             &ewtabF,&ewtabFn);
2517             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2518             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2519
2520             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2521
2522             fscal            = felec;
2523
2524             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2525
2526             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2527
2528             /* Calculate temporary vectorial force */
2529             tx               = _mm256_mul_pd(fscal,dx13);
2530             ty               = _mm256_mul_pd(fscal,dy13);
2531             tz               = _mm256_mul_pd(fscal,dz13);
2532
2533             /* Update vectorial force */
2534             fix1             = _mm256_add_pd(fix1,tx);
2535             fiy1             = _mm256_add_pd(fiy1,ty);
2536             fiz1             = _mm256_add_pd(fiz1,tz);
2537
2538             fjx3             = _mm256_add_pd(fjx3,tx);
2539             fjy3             = _mm256_add_pd(fjy3,ty);
2540             fjz3             = _mm256_add_pd(fjz3,tz);
2541
2542             }
2543
2544             /**************************
2545              * CALCULATE INTERACTIONS *
2546              **************************/
2547
2548             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2549             {
2550
2551             r21              = _mm256_mul_pd(rsq21,rinv21);
2552             r21              = _mm256_andnot_pd(dummy_mask,r21);
2553
2554             /* EWALD ELECTROSTATICS */
2555
2556             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to integer */
2557             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2558             ewitab           = _mm256_cvttpd_epi32(ewrt);
2559             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2560             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2561                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2562                                             &ewtabF,&ewtabFn);
2563             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2564             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2565
2566             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2567
2568             fscal            = felec;
2569
2570             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2571
2572             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2573
2574             /* Calculate temporary vectorial force */
2575             tx               = _mm256_mul_pd(fscal,dx21);
2576             ty               = _mm256_mul_pd(fscal,dy21);
2577             tz               = _mm256_mul_pd(fscal,dz21);
2578
2579             /* Update vectorial force */
2580             fix2             = _mm256_add_pd(fix2,tx);
2581             fiy2             = _mm256_add_pd(fiy2,ty);
2582             fiz2             = _mm256_add_pd(fiz2,tz);
2583
2584             fjx1             = _mm256_add_pd(fjx1,tx);
2585             fjy1             = _mm256_add_pd(fjy1,ty);
2586             fjz1             = _mm256_add_pd(fjz1,tz);
2587
2588             }
2589
2590             /**************************
2591              * CALCULATE INTERACTIONS *
2592              **************************/
2593
2594             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2595             {
2596
2597             r22              = _mm256_mul_pd(rsq22,rinv22);
2598             r22              = _mm256_andnot_pd(dummy_mask,r22);
2599
2600             /* EWALD ELECTROSTATICS */
2601
2602             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to integer */
2603             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2604             ewitab           = _mm256_cvttpd_epi32(ewrt);
2605             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2606             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2607                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2608                                             &ewtabF,&ewtabFn);
2609             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2610             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2611
2612             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2613
2614             fscal            = felec;
2615
2616             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2617
2618             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2619
2620             /* Calculate temporary vectorial force */
2621             tx               = _mm256_mul_pd(fscal,dx22);
2622             ty               = _mm256_mul_pd(fscal,dy22);
2623             tz               = _mm256_mul_pd(fscal,dz22);
2624
2625             /* Update vectorial force */
2626             fix2             = _mm256_add_pd(fix2,tx);
2627             fiy2             = _mm256_add_pd(fiy2,ty);
2628             fiz2             = _mm256_add_pd(fiz2,tz);
2629
2630             fjx2             = _mm256_add_pd(fjx2,tx);
2631             fjy2             = _mm256_add_pd(fjy2,ty);
2632             fjz2             = _mm256_add_pd(fjz2,tz);
2633
2634             }
2635
2636             /**************************
2637              * CALCULATE INTERACTIONS *
2638              **************************/
2639
2640             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2641             {
2642
2643             r23              = _mm256_mul_pd(rsq23,rinv23);
2644             r23              = _mm256_andnot_pd(dummy_mask,r23);
2645
2646             /* EWALD ELECTROSTATICS */
2647
2648             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to integer */
2649             ewrt             = _mm256_mul_pd(r23,ewtabscale);
2650             ewitab           = _mm256_cvttpd_epi32(ewrt);
2651             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2652             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2653                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2654                                             &ewtabF,&ewtabFn);
2655             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2656             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2657
2658             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2659
2660             fscal            = felec;
2661
2662             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2663
2664             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2665
2666             /* Calculate temporary vectorial force */
2667             tx               = _mm256_mul_pd(fscal,dx23);
2668             ty               = _mm256_mul_pd(fscal,dy23);
2669             tz               = _mm256_mul_pd(fscal,dz23);
2670
2671             /* Update vectorial force */
2672             fix2             = _mm256_add_pd(fix2,tx);
2673             fiy2             = _mm256_add_pd(fiy2,ty);
2674             fiz2             = _mm256_add_pd(fiz2,tz);
2675
2676             fjx3             = _mm256_add_pd(fjx3,tx);
2677             fjy3             = _mm256_add_pd(fjy3,ty);
2678             fjz3             = _mm256_add_pd(fjz3,tz);
2679
2680             }
2681
2682             /**************************
2683              * CALCULATE INTERACTIONS *
2684              **************************/
2685
2686             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2687             {
2688
2689             r31              = _mm256_mul_pd(rsq31,rinv31);
2690             r31              = _mm256_andnot_pd(dummy_mask,r31);
2691
2692             /* EWALD ELECTROSTATICS */
2693
2694             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to integer */
2695             ewrt             = _mm256_mul_pd(r31,ewtabscale);
2696             ewitab           = _mm256_cvttpd_epi32(ewrt);
2697             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2698             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2699                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2700                                             &ewtabF,&ewtabFn);
2701             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2702             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2703
2704             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2705
2706             fscal            = felec;
2707
2708             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2709
2710             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2711
2712             /* Calculate temporary vectorial force */
2713             tx               = _mm256_mul_pd(fscal,dx31);
2714             ty               = _mm256_mul_pd(fscal,dy31);
2715             tz               = _mm256_mul_pd(fscal,dz31);
2716
2717             /* Update vectorial force */
2718             fix3             = _mm256_add_pd(fix3,tx);
2719             fiy3             = _mm256_add_pd(fiy3,ty);
2720             fiz3             = _mm256_add_pd(fiz3,tz);
2721
2722             fjx1             = _mm256_add_pd(fjx1,tx);
2723             fjy1             = _mm256_add_pd(fjy1,ty);
2724             fjz1             = _mm256_add_pd(fjz1,tz);
2725
2726             }
2727
2728             /**************************
2729              * CALCULATE INTERACTIONS *
2730              **************************/
2731
2732             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2733             {
2734
2735             r32              = _mm256_mul_pd(rsq32,rinv32);
2736             r32              = _mm256_andnot_pd(dummy_mask,r32);
2737
2738             /* EWALD ELECTROSTATICS */
2739
2740             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to integer */
2741             ewrt             = _mm256_mul_pd(r32,ewtabscale);
2742             ewitab           = _mm256_cvttpd_epi32(ewrt);
2743             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2744             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2745                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2746                                             &ewtabF,&ewtabFn);
2747             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2748             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2749
2750             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2751
2752             fscal            = felec;
2753
2754             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2755
2756             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2757
2758             /* Calculate temporary vectorial force */
2759             tx               = _mm256_mul_pd(fscal,dx32);
2760             ty               = _mm256_mul_pd(fscal,dy32);
2761             tz               = _mm256_mul_pd(fscal,dz32);
2762
2763             /* Update vectorial force */
2764             fix3             = _mm256_add_pd(fix3,tx);
2765             fiy3             = _mm256_add_pd(fiy3,ty);
2766             fiz3             = _mm256_add_pd(fiz3,tz);
2767
2768             fjx2             = _mm256_add_pd(fjx2,tx);
2769             fjy2             = _mm256_add_pd(fjy2,ty);
2770             fjz2             = _mm256_add_pd(fjz2,tz);
2771
2772             }
2773
2774             /**************************
2775              * CALCULATE INTERACTIONS *
2776              **************************/
2777
2778             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2779             {
2780
2781             r33              = _mm256_mul_pd(rsq33,rinv33);
2782             r33              = _mm256_andnot_pd(dummy_mask,r33);
2783
2784             /* EWALD ELECTROSTATICS */
2785
2786             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to integer */
2787             ewrt             = _mm256_mul_pd(r33,ewtabscale);
2788             ewitab           = _mm256_cvttpd_epi32(ewrt);
2789             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2790             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2791                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2792                                             &ewtabF,&ewtabFn);
2793             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2794             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2795
2796             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2797
2798             fscal            = felec;
2799
2800             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2801
2802             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2803
2804             /* Calculate temporary vectorial force */
2805             tx               = _mm256_mul_pd(fscal,dx33);
2806             ty               = _mm256_mul_pd(fscal,dy33);
2807             tz               = _mm256_mul_pd(fscal,dz33);
2808
2809             /* Update vectorial force */
2810             fix3             = _mm256_add_pd(fix3,tx);
2811             fiy3             = _mm256_add_pd(fiy3,ty);
2812             fiz3             = _mm256_add_pd(fiz3,tz);
2813
2814             fjx3             = _mm256_add_pd(fjx3,tx);
2815             fjy3             = _mm256_add_pd(fjy3,ty);
2816             fjz3             = _mm256_add_pd(fjz3,tz);
2817
2818             }
2819
2820             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2821             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2822             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2823             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
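                 /* Dummy j entries (negative jnrlist values) are redirected to the scratch buffer,
                  * so the decrements for padded lanes, whose forces were already zeroed through
                  * dummy_mask, never touch memory belonging to real atoms.
                  */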
2824
2825             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2826                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2827                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2828
2829             /* Inner loop uses 393 flops */
2830         }
2831
2832         /* End of innermost loop */
2833
2834         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2835                                                  f+i_coord_offset,fshift+i_shift_offset);
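             /* The per-lane accumulators for the four i atoms are reduced across SIMD lanes and
              * added to the force array; the summed i forces are also added to the shift-force
              * array, which GROMACS uses for the virial.
              */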
2836
2837         /* Increment number of inner iterations */
2838         inneriter                  += j_index_end - j_index_start;
2839
2840         /* Outer loop uses 24 flops */
2841     }
2842
2843     /* Increment number of outer iterations */
2844     outeriter        += nri;
2845
2846     /* Update outer/inner flops */
2847
2848     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*393);
2849 }