[alexxy/gromacs.git] src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_double
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
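    /* Overview of the pair interactions evaluated below, assuming site 0 is the
     * LJ-only site (e.g. a TIP4P-style oxygen) and sites 1-3 carry the partial
     * charges:
     *
     *   pair 0-0              -> Lennard-Jones only      (c6_00, c12_00)
     *   pairs {1,2,3}x{1,2,3} -> Ewald electrostatics    (qq11 .. qq33)
     */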
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82     real             scratch[4*DIM];
83     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84     real *           vdwioffsetptr0;
85     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86     real *           vdwioffsetptr1;
87     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88     real *           vdwioffsetptr2;
89     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90     real *           vdwioffsetptr3;
91     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
92     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
93     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
95     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
97     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
99     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
100     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
101     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
104     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
107     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
108     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
109     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
110     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
111     real             *charge;
112     int              nvdwtype;
113     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
114     int              *vdwtype;
115     real             *vdwparam;
116     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
117     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
118     __m128i          ewitab;
119     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
120     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
121     real             *ewtab;
122     __m256d          dummy_mask,cutoff_mask;
123     __m128           tmpmask0,tmpmask1;
124     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
125     __m256d          one     = _mm256_set1_pd(1.0);
126     __m256d          two     = _mm256_set1_pd(2.0);
127     x                = xx[0];
128     f                = ff[0];
129
130     nri              = nlist->nri;
131     iinr             = nlist->iinr;
132     jindex           = nlist->jindex;
133     jjnr             = nlist->jjnr;
134     shiftidx         = nlist->shift;
135     gid              = nlist->gid;
136     shiftvec         = fr->shift_vec[0];
137     fshift           = fr->fshift[0];
138     facel            = _mm256_set1_pd(fr->epsfac);
139     charge           = mdatoms->chargeA;
140     nvdwtype         = fr->ntype;
141     vdwparam         = fr->nbfp;
142     vdwtype          = mdatoms->typeA;
143
144     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
145     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
146     beta2            = _mm256_mul_pd(beta,beta);
147     beta3            = _mm256_mul_pd(beta,beta2);
148
149     ewtab            = fr->ic->tabq_coul_FDV0;
150     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
151     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
152
153     /* Setup water-specific parameters */
154     inr              = nlist->iinr[0];
155     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
156     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
157     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
158     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
159
160     jq1              = _mm256_set1_pd(charge[inr+1]);
161     jq2              = _mm256_set1_pd(charge[inr+2]);
162     jq3              = _mm256_set1_pd(charge[inr+3]);
163     vdwjidx0A        = 2*vdwtype[inr+0];
164     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
165     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
166     qq11             = _mm256_mul_pd(iq1,jq1);
167     qq12             = _mm256_mul_pd(iq1,jq2);
168     qq13             = _mm256_mul_pd(iq1,jq3);
169     qq21             = _mm256_mul_pd(iq2,jq1);
170     qq22             = _mm256_mul_pd(iq2,jq2);
171     qq23             = _mm256_mul_pd(iq2,jq3);
172     qq31             = _mm256_mul_pd(iq3,jq1);
173     qq32             = _mm256_mul_pd(iq3,jq2);
174     qq33             = _mm256_mul_pd(iq3,jq3);
175
176     /* When we use explicit cutoffs the value must be identical for elec and VdW, so we arbitrarily use the electrostatics cutoff */
177     rcutoff_scalar   = fr->rcoulomb;
178     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
179     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
180
181     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
182     rvdw             = _mm256_set1_pd(fr->rvdw);
183
184     /* Avoid compiler warnings about possibly uninitialized variables */
185     jnrA = jnrB = jnrC = jnrD = 0;
186     j_coord_offsetA = 0;
187     j_coord_offsetB = 0;
188     j_coord_offsetC = 0;
189     j_coord_offsetD = 0;
190
191     outeriter        = 0;
192     inneriter        = 0;
193
194     for(iidx=0;iidx<4*DIM;iidx++)
195     {
196         scratch[iidx] = 0.0;
197     }
198
199     /* Start outer loop over neighborlists */
200     for(iidx=0; iidx<nri; iidx++)
201     {
202         /* Load shift vector for this list */
203         i_shift_offset   = DIM*shiftidx[iidx];
204
205         /* Load limits for loop over neighbors */
206         j_index_start    = jindex[iidx];
207         j_index_end      = jindex[iidx+1];
208
209         /* Get outer coordinate index */
210         inr              = iinr[iidx];
211         i_coord_offset   = DIM*inr;
212
213         /* Load i particle coords and add shift vector */
214         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
215                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
216
217         fix0             = _mm256_setzero_pd();
218         fiy0             = _mm256_setzero_pd();
219         fiz0             = _mm256_setzero_pd();
220         fix1             = _mm256_setzero_pd();
221         fiy1             = _mm256_setzero_pd();
222         fiz1             = _mm256_setzero_pd();
223         fix2             = _mm256_setzero_pd();
224         fiy2             = _mm256_setzero_pd();
225         fiz2             = _mm256_setzero_pd();
226         fix3             = _mm256_setzero_pd();
227         fiy3             = _mm256_setzero_pd();
228         fiz3             = _mm256_setzero_pd();
229
230         /* Reset potential sums */
231         velecsum         = _mm256_setzero_pd();
232         vvdwsum          = _mm256_setzero_pd();
233
234         /* Start inner kernel loop */
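        /* The loop below only processes full quadruplets of real j atoms; the neighbor
         * list is padded with negative indices, so a partially filled final quadruplet
         * drops out of this loop and is handled by the masked epilogue further down.
         */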
235         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
236         {
237
238             /* Get j neighbor index, and coordinate index */
239             jnrA             = jjnr[jidx];
240             jnrB             = jjnr[jidx+1];
241             jnrC             = jjnr[jidx+2];
242             jnrD             = jjnr[jidx+3];
243             j_coord_offsetA  = DIM*jnrA;
244             j_coord_offsetB  = DIM*jnrB;
245             j_coord_offsetC  = DIM*jnrC;
246             j_coord_offsetD  = DIM*jnrD;
247
248             /* load j atom coordinates */
249             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
250                                                  x+j_coord_offsetC,x+j_coord_offsetD,
251                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
252                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
253
254             /* Calculate displacement vector */
255             dx00             = _mm256_sub_pd(ix0,jx0);
256             dy00             = _mm256_sub_pd(iy0,jy0);
257             dz00             = _mm256_sub_pd(iz0,jz0);
258             dx11             = _mm256_sub_pd(ix1,jx1);
259             dy11             = _mm256_sub_pd(iy1,jy1);
260             dz11             = _mm256_sub_pd(iz1,jz1);
261             dx12             = _mm256_sub_pd(ix1,jx2);
262             dy12             = _mm256_sub_pd(iy1,jy2);
263             dz12             = _mm256_sub_pd(iz1,jz2);
264             dx13             = _mm256_sub_pd(ix1,jx3);
265             dy13             = _mm256_sub_pd(iy1,jy3);
266             dz13             = _mm256_sub_pd(iz1,jz3);
267             dx21             = _mm256_sub_pd(ix2,jx1);
268             dy21             = _mm256_sub_pd(iy2,jy1);
269             dz21             = _mm256_sub_pd(iz2,jz1);
270             dx22             = _mm256_sub_pd(ix2,jx2);
271             dy22             = _mm256_sub_pd(iy2,jy2);
272             dz22             = _mm256_sub_pd(iz2,jz2);
273             dx23             = _mm256_sub_pd(ix2,jx3);
274             dy23             = _mm256_sub_pd(iy2,jy3);
275             dz23             = _mm256_sub_pd(iz2,jz3);
276             dx31             = _mm256_sub_pd(ix3,jx1);
277             dy31             = _mm256_sub_pd(iy3,jy1);
278             dz31             = _mm256_sub_pd(iz3,jz1);
279             dx32             = _mm256_sub_pd(ix3,jx2);
280             dy32             = _mm256_sub_pd(iy3,jy2);
281             dz32             = _mm256_sub_pd(iz3,jz2);
282             dx33             = _mm256_sub_pd(ix3,jx3);
283             dy33             = _mm256_sub_pd(iy3,jy3);
284             dz33             = _mm256_sub_pd(iz3,jz3);
285
286             /* Calculate squared distance and things based on it */
287             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
288             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
289             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
290             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
291             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
292             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
293             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
294             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
295             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
296             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
297
298             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
299             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
300             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
301             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
302             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
303             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
304             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
305             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
306             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
307
308             rinvsq00         = gmx_mm256_inv_pd(rsq00);
309             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
310             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
311             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
312             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
313             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
314             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
315             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
316             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
317             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
318
319             fjx0             = _mm256_setzero_pd();
320             fjy0             = _mm256_setzero_pd();
321             fjz0             = _mm256_setzero_pd();
322             fjx1             = _mm256_setzero_pd();
323             fjy1             = _mm256_setzero_pd();
324             fjz1             = _mm256_setzero_pd();
325             fjx2             = _mm256_setzero_pd();
326             fjy2             = _mm256_setzero_pd();
327             fjz2             = _mm256_setzero_pd();
328             fjx3             = _mm256_setzero_pd();
329             fjy3             = _mm256_setzero_pd();
330             fjz3             = _mm256_setzero_pd();
331
332             /**************************
333              * CALCULATE INTERACTIONS *
334              **************************/
335
336             if (gmx_mm256_any_lt(rsq00,rcutoff2))
337             {
338
339             /* LENNARD-JONES DISPERSION/REPULSION */
340
341             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
342             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
343             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
344             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
345                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
346             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
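            /* Scalar sketch of the potential-shifted LJ above, assuming (as in the other
             * group-scheme kernels) that c6/c12 are stored pre-multiplied by 6 and 12,
             * which is why only the potential carries the 1/6 and 1/12 factors:
             *
             *   Vvdw  = (c12*r^-12 - c12*rc^-12)/12 - (c6*r^-6 - c6*rc^-6)/6
             *   Fscal = (c12*r^-12 - c6*r^-6) / r^2
             *
             * i.e. the potential is shifted to zero at the cutoff rc (sh_vdw_invrcut6 =
             * 1/rc^6), while the force is left unshifted.
             */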
347
348             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
349
350             /* Update potential sum for this i atom from the interaction with this j atom. */
351             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
352             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
353
354             fscal            = fvdw;
355
356             fscal            = _mm256_and_pd(fscal,cutoff_mask);
357
358             /* Calculate temporary vectorial force */
359             tx               = _mm256_mul_pd(fscal,dx00);
360             ty               = _mm256_mul_pd(fscal,dy00);
361             tz               = _mm256_mul_pd(fscal,dz00);
362
363             /* Update vectorial force */
364             fix0             = _mm256_add_pd(fix0,tx);
365             fiy0             = _mm256_add_pd(fiy0,ty);
366             fiz0             = _mm256_add_pd(fiz0,tz);
367
368             fjx0             = _mm256_add_pd(fjx0,tx);
369             fjy0             = _mm256_add_pd(fjy0,ty);
370             fjz0             = _mm256_add_pd(fjz0,tz);
371
372             }
373
374             /**************************
375              * CALCULATE INTERACTIONS *
376              **************************/
377
378             if (gmx_mm256_any_lt(rsq11,rcutoff2))
379             {
380
381             r11              = _mm256_mul_pd(rsq11,rinv11);
382
383             /* EWALD ELECTROSTATICS */
384
385             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
386             ewrt             = _mm256_mul_pd(r11,ewtabscale);
387             ewitab           = _mm256_cvttpd_epi32(ewrt);
388             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
389             ewitab           = _mm_slli_epi32(ewitab,2);
390             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
391             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
392             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
393             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
394             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
395             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
396             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
397             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
398             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
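            /* Sketch of the table interpolation above: the FDV0 table stores four values
             * per point (hence the shift of ewitab by 2): a force value F, a difference
             * term D, the potential V, and a fourth element unused after the transpose.
             * Per SIMD lane, with eps the fractional table position and h = 1/tabq_scale:
             *
             *   F(r)  = F[i] + eps*D[i]
             *   V(r)  = V[i] - 0.5*h*eps*(F[i] + F(r))
             *   velec = qq*(rinv - sh_ewald - V(r))
             *   felec = qq*rinv*(rinv^2 - F(r))
             *
             * where subtracting sh_ewald shifts the potential to zero at the cutoff.
             */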
399
400             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
401
402             /* Update potential sum for this i atom from the interaction with this j atom. */
403             velec            = _mm256_and_pd(velec,cutoff_mask);
404             velecsum         = _mm256_add_pd(velecsum,velec);
405
406             fscal            = felec;
407
408             fscal            = _mm256_and_pd(fscal,cutoff_mask);
409
410             /* Calculate temporary vectorial force */
411             tx               = _mm256_mul_pd(fscal,dx11);
412             ty               = _mm256_mul_pd(fscal,dy11);
413             tz               = _mm256_mul_pd(fscal,dz11);
414
415             /* Update vectorial force */
416             fix1             = _mm256_add_pd(fix1,tx);
417             fiy1             = _mm256_add_pd(fiy1,ty);
418             fiz1             = _mm256_add_pd(fiz1,tz);
419
420             fjx1             = _mm256_add_pd(fjx1,tx);
421             fjy1             = _mm256_add_pd(fjy1,ty);
422             fjz1             = _mm256_add_pd(fjz1,tz);
423
424             }
425
426             /**************************
427              * CALCULATE INTERACTIONS *
428              **************************/
429
430             if (gmx_mm256_any_lt(rsq12,rcutoff2))
431             {
432
433             r12              = _mm256_mul_pd(rsq12,rinv12);
434
435             /* EWALD ELECTROSTATICS */
436
437             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
438             ewrt             = _mm256_mul_pd(r12,ewtabscale);
439             ewitab           = _mm256_cvttpd_epi32(ewrt);
440             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
441             ewitab           = _mm_slli_epi32(ewitab,2);
442             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
443             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
444             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
445             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
446             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
447             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
448             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
449             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
450             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
451
452             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
453
454             /* Update potential sum for this i atom from the interaction with this j atom. */
455             velec            = _mm256_and_pd(velec,cutoff_mask);
456             velecsum         = _mm256_add_pd(velecsum,velec);
457
458             fscal            = felec;
459
460             fscal            = _mm256_and_pd(fscal,cutoff_mask);
461
462             /* Calculate temporary vectorial force */
463             tx               = _mm256_mul_pd(fscal,dx12);
464             ty               = _mm256_mul_pd(fscal,dy12);
465             tz               = _mm256_mul_pd(fscal,dz12);
466
467             /* Update vectorial force */
468             fix1             = _mm256_add_pd(fix1,tx);
469             fiy1             = _mm256_add_pd(fiy1,ty);
470             fiz1             = _mm256_add_pd(fiz1,tz);
471
472             fjx2             = _mm256_add_pd(fjx2,tx);
473             fjy2             = _mm256_add_pd(fjy2,ty);
474             fjz2             = _mm256_add_pd(fjz2,tz);
475
476             }
477
478             /**************************
479              * CALCULATE INTERACTIONS *
480              **************************/
481
482             if (gmx_mm256_any_lt(rsq13,rcutoff2))
483             {
484
485             r13              = _mm256_mul_pd(rsq13,rinv13);
486
487             /* EWALD ELECTROSTATICS */
488
489             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
490             ewrt             = _mm256_mul_pd(r13,ewtabscale);
491             ewitab           = _mm256_cvttpd_epi32(ewrt);
492             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
493             ewitab           = _mm_slli_epi32(ewitab,2);
494             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
495             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
496             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
497             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
498             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
499             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
500             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
501             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_sub_pd(rinv13,sh_ewald),velec));
502             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
503
504             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
505
506             /* Update potential sum for this i atom from the interaction with this j atom. */
507             velec            = _mm256_and_pd(velec,cutoff_mask);
508             velecsum         = _mm256_add_pd(velecsum,velec);
509
510             fscal            = felec;
511
512             fscal            = _mm256_and_pd(fscal,cutoff_mask);
513
514             /* Calculate temporary vectorial force */
515             tx               = _mm256_mul_pd(fscal,dx13);
516             ty               = _mm256_mul_pd(fscal,dy13);
517             tz               = _mm256_mul_pd(fscal,dz13);
518
519             /* Update vectorial force */
520             fix1             = _mm256_add_pd(fix1,tx);
521             fiy1             = _mm256_add_pd(fiy1,ty);
522             fiz1             = _mm256_add_pd(fiz1,tz);
523
524             fjx3             = _mm256_add_pd(fjx3,tx);
525             fjy3             = _mm256_add_pd(fjy3,ty);
526             fjz3             = _mm256_add_pd(fjz3,tz);
527
528             }
529
530             /**************************
531              * CALCULATE INTERACTIONS *
532              **************************/
533
534             if (gmx_mm256_any_lt(rsq21,rcutoff2))
535             {
536
537             r21              = _mm256_mul_pd(rsq21,rinv21);
538
539             /* EWALD ELECTROSTATICS */
540
541             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
542             ewrt             = _mm256_mul_pd(r21,ewtabscale);
543             ewitab           = _mm256_cvttpd_epi32(ewrt);
544             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
545             ewitab           = _mm_slli_epi32(ewitab,2);
546             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
547             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
548             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
549             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
550             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
551             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
552             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
553             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
554             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
555
556             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
557
558             /* Update potential sum for this i atom from the interaction with this j atom. */
559             velec            = _mm256_and_pd(velec,cutoff_mask);
560             velecsum         = _mm256_add_pd(velecsum,velec);
561
562             fscal            = felec;
563
564             fscal            = _mm256_and_pd(fscal,cutoff_mask);
565
566             /* Calculate temporary vectorial force */
567             tx               = _mm256_mul_pd(fscal,dx21);
568             ty               = _mm256_mul_pd(fscal,dy21);
569             tz               = _mm256_mul_pd(fscal,dz21);
570
571             /* Update vectorial force */
572             fix2             = _mm256_add_pd(fix2,tx);
573             fiy2             = _mm256_add_pd(fiy2,ty);
574             fiz2             = _mm256_add_pd(fiz2,tz);
575
576             fjx1             = _mm256_add_pd(fjx1,tx);
577             fjy1             = _mm256_add_pd(fjy1,ty);
578             fjz1             = _mm256_add_pd(fjz1,tz);
579
580             }
581
582             /**************************
583              * CALCULATE INTERACTIONS *
584              **************************/
585
586             if (gmx_mm256_any_lt(rsq22,rcutoff2))
587             {
588
589             r22              = _mm256_mul_pd(rsq22,rinv22);
590
591             /* EWALD ELECTROSTATICS */
592
593             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
594             ewrt             = _mm256_mul_pd(r22,ewtabscale);
595             ewitab           = _mm256_cvttpd_epi32(ewrt);
596             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
597             ewitab           = _mm_slli_epi32(ewitab,2);
598             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
599             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
600             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
601             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
602             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
603             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
604             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
605             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
606             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
607
608             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
609
610             /* Update potential sum for this i atom from the interaction with this j atom. */
611             velec            = _mm256_and_pd(velec,cutoff_mask);
612             velecsum         = _mm256_add_pd(velecsum,velec);
613
614             fscal            = felec;
615
616             fscal            = _mm256_and_pd(fscal,cutoff_mask);
617
618             /* Calculate temporary vectorial force */
619             tx               = _mm256_mul_pd(fscal,dx22);
620             ty               = _mm256_mul_pd(fscal,dy22);
621             tz               = _mm256_mul_pd(fscal,dz22);
622
623             /* Update vectorial force */
624             fix2             = _mm256_add_pd(fix2,tx);
625             fiy2             = _mm256_add_pd(fiy2,ty);
626             fiz2             = _mm256_add_pd(fiz2,tz);
627
628             fjx2             = _mm256_add_pd(fjx2,tx);
629             fjy2             = _mm256_add_pd(fjy2,ty);
630             fjz2             = _mm256_add_pd(fjz2,tz);
631
632             }
633
634             /**************************
635              * CALCULATE INTERACTIONS *
636              **************************/
637
638             if (gmx_mm256_any_lt(rsq23,rcutoff2))
639             {
640
641             r23              = _mm256_mul_pd(rsq23,rinv23);
642
643             /* EWALD ELECTROSTATICS */
644
645             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
646             ewrt             = _mm256_mul_pd(r23,ewtabscale);
647             ewitab           = _mm256_cvttpd_epi32(ewrt);
648             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
649             ewitab           = _mm_slli_epi32(ewitab,2);
650             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
651             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
652             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
653             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
654             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
655             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
656             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
657             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_sub_pd(rinv23,sh_ewald),velec));
658             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
659
660             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
661
662             /* Update potential sum for this i atom from the interaction with this j atom. */
663             velec            = _mm256_and_pd(velec,cutoff_mask);
664             velecsum         = _mm256_add_pd(velecsum,velec);
665
666             fscal            = felec;
667
668             fscal            = _mm256_and_pd(fscal,cutoff_mask);
669
670             /* Calculate temporary vectorial force */
671             tx               = _mm256_mul_pd(fscal,dx23);
672             ty               = _mm256_mul_pd(fscal,dy23);
673             tz               = _mm256_mul_pd(fscal,dz23);
674
675             /* Update vectorial force */
676             fix2             = _mm256_add_pd(fix2,tx);
677             fiy2             = _mm256_add_pd(fiy2,ty);
678             fiz2             = _mm256_add_pd(fiz2,tz);
679
680             fjx3             = _mm256_add_pd(fjx3,tx);
681             fjy3             = _mm256_add_pd(fjy3,ty);
682             fjz3             = _mm256_add_pd(fjz3,tz);
683
684             }
685
686             /**************************
687              * CALCULATE INTERACTIONS *
688              **************************/
689
690             if (gmx_mm256_any_lt(rsq31,rcutoff2))
691             {
692
693             r31              = _mm256_mul_pd(rsq31,rinv31);
694
695             /* EWALD ELECTROSTATICS */
696
697             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
698             ewrt             = _mm256_mul_pd(r31,ewtabscale);
699             ewitab           = _mm256_cvttpd_epi32(ewrt);
700             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
701             ewitab           = _mm_slli_epi32(ewitab,2);
702             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
703             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
704             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
705             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
706             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
707             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
708             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
709             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_sub_pd(rinv31,sh_ewald),velec));
710             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
711
712             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
713
714             /* Update potential sum for this i atom from the interaction with this j atom. */
715             velec            = _mm256_and_pd(velec,cutoff_mask);
716             velecsum         = _mm256_add_pd(velecsum,velec);
717
718             fscal            = felec;
719
720             fscal            = _mm256_and_pd(fscal,cutoff_mask);
721
722             /* Calculate temporary vectorial force */
723             tx               = _mm256_mul_pd(fscal,dx31);
724             ty               = _mm256_mul_pd(fscal,dy31);
725             tz               = _mm256_mul_pd(fscal,dz31);
726
727             /* Update vectorial force */
728             fix3             = _mm256_add_pd(fix3,tx);
729             fiy3             = _mm256_add_pd(fiy3,ty);
730             fiz3             = _mm256_add_pd(fiz3,tz);
731
732             fjx1             = _mm256_add_pd(fjx1,tx);
733             fjy1             = _mm256_add_pd(fjy1,ty);
734             fjz1             = _mm256_add_pd(fjz1,tz);
735
736             }
737
738             /**************************
739              * CALCULATE INTERACTIONS *
740              **************************/
741
742             if (gmx_mm256_any_lt(rsq32,rcutoff2))
743             {
744
745             r32              = _mm256_mul_pd(rsq32,rinv32);
746
747             /* EWALD ELECTROSTATICS */
748
749             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
750             ewrt             = _mm256_mul_pd(r32,ewtabscale);
751             ewitab           = _mm256_cvttpd_epi32(ewrt);
752             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
753             ewitab           = _mm_slli_epi32(ewitab,2);
754             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
755             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
756             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
757             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
758             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
759             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
760             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
761             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_sub_pd(rinv32,sh_ewald),velec));
762             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
763
764             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
765
766             /* Update potential sum for this i atom from the interaction with this j atom. */
767             velec            = _mm256_and_pd(velec,cutoff_mask);
768             velecsum         = _mm256_add_pd(velecsum,velec);
769
770             fscal            = felec;
771
772             fscal            = _mm256_and_pd(fscal,cutoff_mask);
773
774             /* Calculate temporary vectorial force */
775             tx               = _mm256_mul_pd(fscal,dx32);
776             ty               = _mm256_mul_pd(fscal,dy32);
777             tz               = _mm256_mul_pd(fscal,dz32);
778
779             /* Update vectorial force */
780             fix3             = _mm256_add_pd(fix3,tx);
781             fiy3             = _mm256_add_pd(fiy3,ty);
782             fiz3             = _mm256_add_pd(fiz3,tz);
783
784             fjx2             = _mm256_add_pd(fjx2,tx);
785             fjy2             = _mm256_add_pd(fjy2,ty);
786             fjz2             = _mm256_add_pd(fjz2,tz);
787
788             }
789
790             /**************************
791              * CALCULATE INTERACTIONS *
792              **************************/
793
794             if (gmx_mm256_any_lt(rsq33,rcutoff2))
795             {
796
797             r33              = _mm256_mul_pd(rsq33,rinv33);
798
799             /* EWALD ELECTROSTATICS */
800
801             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
802             ewrt             = _mm256_mul_pd(r33,ewtabscale);
803             ewitab           = _mm256_cvttpd_epi32(ewrt);
804             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
805             ewitab           = _mm_slli_epi32(ewitab,2);
806             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
807             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
808             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
809             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
810             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
811             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
812             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
813             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_sub_pd(rinv33,sh_ewald),velec));
814             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
815
816             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
817
818             /* Update potential sum for this i atom from the interaction with this j atom. */
819             velec            = _mm256_and_pd(velec,cutoff_mask);
820             velecsum         = _mm256_add_pd(velecsum,velec);
821
822             fscal            = felec;
823
824             fscal            = _mm256_and_pd(fscal,cutoff_mask);
825
826             /* Calculate temporary vectorial force */
827             tx               = _mm256_mul_pd(fscal,dx33);
828             ty               = _mm256_mul_pd(fscal,dy33);
829             tz               = _mm256_mul_pd(fscal,dz33);
830
831             /* Update vectorial force */
832             fix3             = _mm256_add_pd(fix3,tx);
833             fiy3             = _mm256_add_pd(fiy3,ty);
834             fiz3             = _mm256_add_pd(fiz3,tz);
835
836             fjx3             = _mm256_add_pd(fjx3,tx);
837             fjy3             = _mm256_add_pd(fjy3,ty);
838             fjz3             = _mm256_add_pd(fjz3,tz);
839
840             }
841
842             fjptrA             = f+j_coord_offsetA;
843             fjptrB             = f+j_coord_offsetB;
844             fjptrC             = f+j_coord_offsetC;
845             fjptrD             = f+j_coord_offsetD;
846
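            /* Scatter the accumulated j forces: the swizzled store below transposes the
             * four force accumulators back to per-atom layout and subtracts each lane's
             * contribution from the corresponding j atom (Newton's third law), since the
             * same tx/ty/tz were added to the i-atom accumulators above.
             */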
847             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
848                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
849                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
850
851             /* Inner loop uses 458 flops */
852         }
853
854         if(jidx<j_index_end)
855         {
856
857             /* Get j neighbor index, and coordinate index */
858             jnrlistA         = jjnr[jidx];
859             jnrlistB         = jjnr[jidx+1];
860             jnrlistC         = jjnr[jidx+2];
861             jnrlistD         = jjnr[jidx+3];
862             /* Sign of each element will be negative for non-real atoms.
863              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
864              * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
865              */
866             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
867
868             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
869             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
870             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
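            /* The 128-bit compare above produces a 32-bit all-ones mask for each padded
             * (negative) j index; the two permutes duplicate each 32-bit mask pairwise so
             * that, after widening, every 64-bit lane of dummy_mask is all-ones for a
             * padded entry and zero for a real one, ready for _mm256_andnot_pd below.
             */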
871
872             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
873             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
874             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
875             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
876             j_coord_offsetA  = DIM*jnrA;
877             j_coord_offsetB  = DIM*jnrB;
878             j_coord_offsetC  = DIM*jnrC;
879             j_coord_offsetD  = DIM*jnrD;
880
881             /* load j atom coordinates */
882             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
883                                                  x+j_coord_offsetC,x+j_coord_offsetD,
884                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
885                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
886
887             /* Calculate displacement vector */
888             dx00             = _mm256_sub_pd(ix0,jx0);
889             dy00             = _mm256_sub_pd(iy0,jy0);
890             dz00             = _mm256_sub_pd(iz0,jz0);
891             dx11             = _mm256_sub_pd(ix1,jx1);
892             dy11             = _mm256_sub_pd(iy1,jy1);
893             dz11             = _mm256_sub_pd(iz1,jz1);
894             dx12             = _mm256_sub_pd(ix1,jx2);
895             dy12             = _mm256_sub_pd(iy1,jy2);
896             dz12             = _mm256_sub_pd(iz1,jz2);
897             dx13             = _mm256_sub_pd(ix1,jx3);
898             dy13             = _mm256_sub_pd(iy1,jy3);
899             dz13             = _mm256_sub_pd(iz1,jz3);
900             dx21             = _mm256_sub_pd(ix2,jx1);
901             dy21             = _mm256_sub_pd(iy2,jy1);
902             dz21             = _mm256_sub_pd(iz2,jz1);
903             dx22             = _mm256_sub_pd(ix2,jx2);
904             dy22             = _mm256_sub_pd(iy2,jy2);
905             dz22             = _mm256_sub_pd(iz2,jz2);
906             dx23             = _mm256_sub_pd(ix2,jx3);
907             dy23             = _mm256_sub_pd(iy2,jy3);
908             dz23             = _mm256_sub_pd(iz2,jz3);
909             dx31             = _mm256_sub_pd(ix3,jx1);
910             dy31             = _mm256_sub_pd(iy3,jy1);
911             dz31             = _mm256_sub_pd(iz3,jz1);
912             dx32             = _mm256_sub_pd(ix3,jx2);
913             dy32             = _mm256_sub_pd(iy3,jy2);
914             dz32             = _mm256_sub_pd(iz3,jz2);
915             dx33             = _mm256_sub_pd(ix3,jx3);
916             dy33             = _mm256_sub_pd(iy3,jy3);
917             dz33             = _mm256_sub_pd(iz3,jz3);
918
919             /* Calculate squared distance and things based on it */
920             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
921             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
922             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
923             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
924             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
925             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
926             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
927             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
928             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
929             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
930
931             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
932             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
933             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
934             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
935             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
936             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
937             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
938             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
939             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
940
941             rinvsq00         = gmx_mm256_inv_pd(rsq00);
942             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
943             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
944             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
945             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
946             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
947             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
948             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
949             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
950             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
951
952             fjx0             = _mm256_setzero_pd();
953             fjy0             = _mm256_setzero_pd();
954             fjz0             = _mm256_setzero_pd();
955             fjx1             = _mm256_setzero_pd();
956             fjy1             = _mm256_setzero_pd();
957             fjz1             = _mm256_setzero_pd();
958             fjx2             = _mm256_setzero_pd();
959             fjy2             = _mm256_setzero_pd();
960             fjz2             = _mm256_setzero_pd();
961             fjx3             = _mm256_setzero_pd();
962             fjy3             = _mm256_setzero_pd();
963             fjz3             = _mm256_setzero_pd();
964
965             /**************************
966              * CALCULATE INTERACTIONS *
967              **************************/
968
969             if (gmx_mm256_any_lt(rsq00,rcutoff2))
970             {
971
972             /* LENNARD-JONES DISPERSION/REPULSION */
973
974             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
975             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
976             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
977             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
978                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
979             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
980
981             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
982
983             /* Update potential sum for this i atom from the interaction with this j atom. */
984             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
985             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
986             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
987
988             fscal            = fvdw;
989
990             fscal            = _mm256_and_pd(fscal,cutoff_mask);
991
992             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
993
994             /* Calculate temporary vectorial force */
995             tx               = _mm256_mul_pd(fscal,dx00);
996             ty               = _mm256_mul_pd(fscal,dy00);
997             tz               = _mm256_mul_pd(fscal,dz00);
998
999             /* Update vectorial force */
1000             fix0             = _mm256_add_pd(fix0,tx);
1001             fiy0             = _mm256_add_pd(fiy0,ty);
1002             fiz0             = _mm256_add_pd(fiz0,tz);
1003
1004             fjx0             = _mm256_add_pd(fjx0,tx);
1005             fjy0             = _mm256_add_pd(fjy0,ty);
1006             fjz0             = _mm256_add_pd(fjz0,tz);
1007
1008             }
1009
1010             /**************************
1011              * CALCULATE INTERACTIONS *
1012              **************************/
1013
1014             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1015             {
1016
1017             r11              = _mm256_mul_pd(rsq11,rinv11);
1018             r11              = _mm256_andnot_pd(dummy_mask,r11);
1019
1020             /* EWALD ELECTROSTATICS */
1021
1022             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
1023             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1024             ewitab           = _mm256_cvttpd_epi32(ewrt);
1025             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1026             ewitab           = _mm_slli_epi32(ewitab,2);
1027             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1028             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1029             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1030             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1031             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1032             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1033             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1034             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
1035             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1036
1037             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1038
1039             /* Update potential sum for this i atom from the interaction with this j atom. */
1040             velec            = _mm256_and_pd(velec,cutoff_mask);
1041             velec            = _mm256_andnot_pd(dummy_mask,velec);
1042             velecsum         = _mm256_add_pd(velecsum,velec);
1043
1044             fscal            = felec;
1045
1046             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1047
1048             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1049
1050             /* Calculate temporary vectorial force */
1051             tx               = _mm256_mul_pd(fscal,dx11);
1052             ty               = _mm256_mul_pd(fscal,dy11);
1053             tz               = _mm256_mul_pd(fscal,dz11);
1054
1055             /* Update vectorial force */
1056             fix1             = _mm256_add_pd(fix1,tx);
1057             fiy1             = _mm256_add_pd(fiy1,ty);
1058             fiz1             = _mm256_add_pd(fiz1,tz);
1059
1060             fjx1             = _mm256_add_pd(fjx1,tx);
1061             fjy1             = _mm256_add_pd(fjy1,ty);
1062             fjz1             = _mm256_add_pd(fjz1,tz);
1063
1064             }
1065
1066             /**************************
1067              * CALCULATE INTERACTIONS *
1068              **************************/
1069
1070             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1071             {
1072
1073             r12              = _mm256_mul_pd(rsq12,rinv12);
1074             r12              = _mm256_andnot_pd(dummy_mask,r12);
1075
1076             /* EWALD ELECTROSTATICS */
1077
1078             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
1079             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1080             ewitab           = _mm256_cvttpd_epi32(ewrt);
1081             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1082             ewitab           = _mm_slli_epi32(ewitab,2);
1083             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1084             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1085             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1086             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1087             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1088             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1089             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1090             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
1091             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1092
1093             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1094
1095             /* Update potential sum for this i atom from the interaction with this j atom. */
1096             velec            = _mm256_and_pd(velec,cutoff_mask);
1097             velec            = _mm256_andnot_pd(dummy_mask,velec);
1098             velecsum         = _mm256_add_pd(velecsum,velec);
1099
1100             fscal            = felec;
1101
1102             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1103
1104             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1105
1106             /* Calculate temporary vectorial force */
1107             tx               = _mm256_mul_pd(fscal,dx12);
1108             ty               = _mm256_mul_pd(fscal,dy12);
1109             tz               = _mm256_mul_pd(fscal,dz12);
1110
1111             /* Update vectorial force */
1112             fix1             = _mm256_add_pd(fix1,tx);
1113             fiy1             = _mm256_add_pd(fiy1,ty);
1114             fiz1             = _mm256_add_pd(fiz1,tz);
1115
1116             fjx2             = _mm256_add_pd(fjx2,tx);
1117             fjy2             = _mm256_add_pd(fjy2,ty);
1118             fjz2             = _mm256_add_pd(fjz2,tz);
1119
1120             }
1121
1122             /**************************
1123              * CALCULATE INTERACTIONS *
1124              **************************/
1125
1126             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1127             {
1128
1129             r13              = _mm256_mul_pd(rsq13,rinv13);
1130             r13              = _mm256_andnot_pd(dummy_mask,r13);
1131
1132             /* EWALD ELECTROSTATICS */
1133
1134             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1135             ewrt             = _mm256_mul_pd(r13,ewtabscale);
1136             ewitab           = _mm256_cvttpd_epi32(ewrt);
1137             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1138             ewitab           = _mm_slli_epi32(ewitab,2);
1139             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1140             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1141             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1142             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1143             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1144             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1145             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1146             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_sub_pd(rinv13,sh_ewald),velec));
1147             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1148
1149             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1150
1151             /* Update potential sum for this i atom from the interaction with this j atom. */
1152             velec            = _mm256_and_pd(velec,cutoff_mask);
1153             velec            = _mm256_andnot_pd(dummy_mask,velec);
1154             velecsum         = _mm256_add_pd(velecsum,velec);
1155
1156             fscal            = felec;
1157
1158             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1159
1160             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1161
1162             /* Calculate temporary vectorial force */
1163             tx               = _mm256_mul_pd(fscal,dx13);
1164             ty               = _mm256_mul_pd(fscal,dy13);
1165             tz               = _mm256_mul_pd(fscal,dz13);
1166
1167             /* Update vectorial force */
1168             fix1             = _mm256_add_pd(fix1,tx);
1169             fiy1             = _mm256_add_pd(fiy1,ty);
1170             fiz1             = _mm256_add_pd(fiz1,tz);
1171
1172             fjx3             = _mm256_add_pd(fjx3,tx);
1173             fjy3             = _mm256_add_pd(fjy3,ty);
1174             fjz3             = _mm256_add_pd(fjz3,tz);
1175
1176             }
1177
1178             /**************************
1179              * CALCULATE INTERACTIONS *
1180              **************************/
1181
1182             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1183             {
1184
1185             r21              = _mm256_mul_pd(rsq21,rinv21);
1186             r21              = _mm256_andnot_pd(dummy_mask,r21);
1187
1188             /* EWALD ELECTROSTATICS */
1189
1190             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1191             ewrt             = _mm256_mul_pd(r21,ewtabscale);
1192             ewitab           = _mm256_cvttpd_epi32(ewrt);
1193             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1194             ewitab           = _mm_slli_epi32(ewitab,2);
1195             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1196             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1197             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1198             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1199             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1200             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1201             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1202             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
1203             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1204
1205             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1206
1207             /* Update potential sum for this i atom from the interaction with this j atom. */
1208             velec            = _mm256_and_pd(velec,cutoff_mask);
1209             velec            = _mm256_andnot_pd(dummy_mask,velec);
1210             velecsum         = _mm256_add_pd(velecsum,velec);
1211
1212             fscal            = felec;
1213
1214             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1215
1216             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1217
1218             /* Calculate temporary vectorial force */
1219             tx               = _mm256_mul_pd(fscal,dx21);
1220             ty               = _mm256_mul_pd(fscal,dy21);
1221             tz               = _mm256_mul_pd(fscal,dz21);
1222
1223             /* Update vectorial force */
1224             fix2             = _mm256_add_pd(fix2,tx);
1225             fiy2             = _mm256_add_pd(fiy2,ty);
1226             fiz2             = _mm256_add_pd(fiz2,tz);
1227
1228             fjx1             = _mm256_add_pd(fjx1,tx);
1229             fjy1             = _mm256_add_pd(fjy1,ty);
1230             fjz1             = _mm256_add_pd(fjz1,tz);
1231
1232             }
1233
1234             /**************************
1235              * CALCULATE INTERACTIONS *
1236              **************************/
1237
1238             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1239             {
1240
1241             r22              = _mm256_mul_pd(rsq22,rinv22);
1242             r22              = _mm256_andnot_pd(dummy_mask,r22);
1243
1244             /* EWALD ELECTROSTATICS */
1245
1246             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1247             ewrt             = _mm256_mul_pd(r22,ewtabscale);
1248             ewitab           = _mm256_cvttpd_epi32(ewrt);
1249             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1250             ewitab           = _mm_slli_epi32(ewitab,2);
1251             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1252             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1253             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1254             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1255             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1256             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1257             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1258             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
1259             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1260
1261             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1262
1263             /* Update potential sum for this i atom from the interaction with this j atom. */
1264             velec            = _mm256_and_pd(velec,cutoff_mask);
1265             velec            = _mm256_andnot_pd(dummy_mask,velec);
1266             velecsum         = _mm256_add_pd(velecsum,velec);
1267
1268             fscal            = felec;
1269
1270             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1271
1272             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1273
1274             /* Calculate temporary vectorial force */
1275             tx               = _mm256_mul_pd(fscal,dx22);
1276             ty               = _mm256_mul_pd(fscal,dy22);
1277             tz               = _mm256_mul_pd(fscal,dz22);
1278
1279             /* Update vectorial force */
1280             fix2             = _mm256_add_pd(fix2,tx);
1281             fiy2             = _mm256_add_pd(fiy2,ty);
1282             fiz2             = _mm256_add_pd(fiz2,tz);
1283
1284             fjx2             = _mm256_add_pd(fjx2,tx);
1285             fjy2             = _mm256_add_pd(fjy2,ty);
1286             fjz2             = _mm256_add_pd(fjz2,tz);
1287
1288             }
1289
1290             /**************************
1291              * CALCULATE INTERACTIONS *
1292              **************************/
1293
1294             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1295             {
1296
1297             r23              = _mm256_mul_pd(rsq23,rinv23);
1298             r23              = _mm256_andnot_pd(dummy_mask,r23);
1299
1300             /* EWALD ELECTROSTATICS */
1301
1302             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1303             ewrt             = _mm256_mul_pd(r23,ewtabscale);
1304             ewitab           = _mm256_cvttpd_epi32(ewrt);
1305             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1306             ewitab           = _mm_slli_epi32(ewitab,2);
1307             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1308             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1309             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1310             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1311             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1312             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1313             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1314             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_sub_pd(rinv23,sh_ewald),velec));
1315             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1316
1317             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1318
1319             /* Update potential sum for this i atom from the interaction with this j atom. */
1320             velec            = _mm256_and_pd(velec,cutoff_mask);
1321             velec            = _mm256_andnot_pd(dummy_mask,velec);
1322             velecsum         = _mm256_add_pd(velecsum,velec);
1323
1324             fscal            = felec;
1325
1326             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1327
1328             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1329
1330             /* Calculate temporary vectorial force */
1331             tx               = _mm256_mul_pd(fscal,dx23);
1332             ty               = _mm256_mul_pd(fscal,dy23);
1333             tz               = _mm256_mul_pd(fscal,dz23);
1334
1335             /* Update vectorial force */
1336             fix2             = _mm256_add_pd(fix2,tx);
1337             fiy2             = _mm256_add_pd(fiy2,ty);
1338             fiz2             = _mm256_add_pd(fiz2,tz);
1339
1340             fjx3             = _mm256_add_pd(fjx3,tx);
1341             fjy3             = _mm256_add_pd(fjy3,ty);
1342             fjz3             = _mm256_add_pd(fjz3,tz);
1343
1344             }
1345
1346             /**************************
1347              * CALCULATE INTERACTIONS *
1348              **************************/
1349
1350             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1351             {
1352
1353             r31              = _mm256_mul_pd(rsq31,rinv31);
1354             r31              = _mm256_andnot_pd(dummy_mask,r31);
1355
1356             /* EWALD ELECTROSTATICS */
1357
1358             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1359             ewrt             = _mm256_mul_pd(r31,ewtabscale);
1360             ewitab           = _mm256_cvttpd_epi32(ewrt);
1361             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1362             ewitab           = _mm_slli_epi32(ewitab,2);
1363             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1364             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1365             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1366             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1367             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1368             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1369             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1370             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_sub_pd(rinv31,sh_ewald),velec));
1371             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1372
1373             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1374
1375             /* Update potential sum for this i atom from the interaction with this j atom. */
1376             velec            = _mm256_and_pd(velec,cutoff_mask);
1377             velec            = _mm256_andnot_pd(dummy_mask,velec);
1378             velecsum         = _mm256_add_pd(velecsum,velec);
1379
1380             fscal            = felec;
1381
1382             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1383
1384             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1385
1386             /* Calculate temporary vectorial force */
1387             tx               = _mm256_mul_pd(fscal,dx31);
1388             ty               = _mm256_mul_pd(fscal,dy31);
1389             tz               = _mm256_mul_pd(fscal,dz31);
1390
1391             /* Update vectorial force */
1392             fix3             = _mm256_add_pd(fix3,tx);
1393             fiy3             = _mm256_add_pd(fiy3,ty);
1394             fiz3             = _mm256_add_pd(fiz3,tz);
1395
1396             fjx1             = _mm256_add_pd(fjx1,tx);
1397             fjy1             = _mm256_add_pd(fjy1,ty);
1398             fjz1             = _mm256_add_pd(fjz1,tz);
1399
1400             }
1401
1402             /**************************
1403              * CALCULATE INTERACTIONS *
1404              **************************/
1405
1406             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1407             {
1408
1409             r32              = _mm256_mul_pd(rsq32,rinv32);
1410             r32              = _mm256_andnot_pd(dummy_mask,r32);
1411
1412             /* EWALD ELECTROSTATICS */
1413
1414             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1415             ewrt             = _mm256_mul_pd(r32,ewtabscale);
1416             ewitab           = _mm256_cvttpd_epi32(ewrt);
1417             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1418             ewitab           = _mm_slli_epi32(ewitab,2);
1419             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1420             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1421             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1422             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1423             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1424             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1425             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1426             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_sub_pd(rinv32,sh_ewald),velec));
1427             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1428
1429             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1430
1431             /* Update potential sum for this i atom from the interaction with this j atom. */
1432             velec            = _mm256_and_pd(velec,cutoff_mask);
1433             velec            = _mm256_andnot_pd(dummy_mask,velec);
1434             velecsum         = _mm256_add_pd(velecsum,velec);
1435
1436             fscal            = felec;
1437
1438             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1439
1440             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1441
1442             /* Calculate temporary vectorial force */
1443             tx               = _mm256_mul_pd(fscal,dx32);
1444             ty               = _mm256_mul_pd(fscal,dy32);
1445             tz               = _mm256_mul_pd(fscal,dz32);
1446
1447             /* Update vectorial force */
1448             fix3             = _mm256_add_pd(fix3,tx);
1449             fiy3             = _mm256_add_pd(fiy3,ty);
1450             fiz3             = _mm256_add_pd(fiz3,tz);
1451
1452             fjx2             = _mm256_add_pd(fjx2,tx);
1453             fjy2             = _mm256_add_pd(fjy2,ty);
1454             fjz2             = _mm256_add_pd(fjz2,tz);
1455
1456             }
1457
1458             /**************************
1459              * CALCULATE INTERACTIONS *
1460              **************************/
1461
1462             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1463             {
1464
1465             r33              = _mm256_mul_pd(rsq33,rinv33);
1466             r33              = _mm256_andnot_pd(dummy_mask,r33);
1467
1468             /* EWALD ELECTROSTATICS */
1469
1470             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1471             ewrt             = _mm256_mul_pd(r33,ewtabscale);
1472             ewitab           = _mm256_cvttpd_epi32(ewrt);
1473             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1474             ewitab           = _mm_slli_epi32(ewitab,2);
1475             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1476             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1477             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1478             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1479             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1480             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1481             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1482             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_sub_pd(rinv33,sh_ewald),velec));
1483             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1484
1485             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1486
1487             /* Update potential sum for this i atom from the interaction with this j atom. */
1488             velec            = _mm256_and_pd(velec,cutoff_mask);
1489             velec            = _mm256_andnot_pd(dummy_mask,velec);
1490             velecsum         = _mm256_add_pd(velecsum,velec);
1491
1492             fscal            = felec;
1493
1494             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1495
1496             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1497
1498             /* Calculate temporary vectorial force */
1499             tx               = _mm256_mul_pd(fscal,dx33);
1500             ty               = _mm256_mul_pd(fscal,dy33);
1501             tz               = _mm256_mul_pd(fscal,dz33);
1502
1503             /* Update vectorial force */
1504             fix3             = _mm256_add_pd(fix3,tx);
1505             fiy3             = _mm256_add_pd(fiy3,ty);
1506             fiz3             = _mm256_add_pd(fiz3,tz);
1507
1508             fjx3             = _mm256_add_pd(fjx3,tx);
1509             fjy3             = _mm256_add_pd(fjy3,ty);
1510             fjz3             = _mm256_add_pd(fjz3,tz);
1511
1512             }
1513
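                 /* Padded j entries (negative list indices) scatter their forces into
                  * a local scratch buffer instead of the real force array. */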
1514             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1515             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1516             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1517             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1518
1519             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1520                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1521                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1522
1523             /* Inner loop uses 467 flops */
1524         }
1525
1526         /* End of innermost loop */
1527
1528         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1529                                                  f+i_coord_offset,fshift+i_shift_offset);
1530
1531         ggid                        = gid[iidx];
1532         /* Update potential energies */
1533         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1534         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1535
1536         /* Increment number of inner iterations */
1537         inneriter                  += j_index_end - j_index_start;
1538
1539         /* Outer loop uses 26 flops */
1540     }
1541
1542     /* Increment number of outer iterations */
1543     outeriter        += nri;
1544
1545     /* Update outer/inner flops */
1546
1547     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*467);
1548 }
1549 /*
1550  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_double
1551  * Electrostatics interaction: Ewald
1552  * VdW interaction:            LennardJones
1553  * Geometry:                   Water4-Water4
1554  * Calculate force/pot:        Force
1555  */
1556 void
1557 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_double
1558                     (t_nblist                    * gmx_restrict       nlist,
1559                      rvec                        * gmx_restrict          xx,
1560                      rvec                        * gmx_restrict          ff,
1561                      t_forcerec                  * gmx_restrict          fr,
1562                      t_mdatoms                   * gmx_restrict     mdatoms,
1563                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1564                      t_nrnb                      * gmx_restrict        nrnb)
1565 {
1566     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1567      * just 0 for non-waters.
1568      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1569      * jnr indices corresponding to data put in the four positions in the SIMD register.
1570      */
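         /* For example, dx12 below is the x displacement between i-water site 1 and
          * j-water site 2, evaluated for all four j waters A..D at once. */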
1571     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1572     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1573     int              jnrA,jnrB,jnrC,jnrD;
1574     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1575     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1576     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1577     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1578     real             rcutoff_scalar;
1579     real             *shiftvec,*fshift,*x,*f;
1580     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1581     real             scratch[4*DIM];
1582     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1583     real *           vdwioffsetptr0;
1584     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1585     real *           vdwioffsetptr1;
1586     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1587     real *           vdwioffsetptr2;
1588     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1589     real *           vdwioffsetptr3;
1590     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1591     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1592     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1593     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1594     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1595     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1596     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1597     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1598     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1599     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1600     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1601     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1602     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1603     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1604     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1605     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1606     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1607     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1608     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1609     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1610     real             *charge;
1611     int              nvdwtype;
1612     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1613     int              *vdwtype;
1614     real             *vdwparam;
1615     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1616     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1617     __m128i          ewitab;
1618     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1619     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1620     real             *ewtab;
1621     __m256d          dummy_mask,cutoff_mask;
1622     __m128           tmpmask0,tmpmask1;
1623     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1624     __m256d          one     = _mm256_set1_pd(1.0);
1625     __m256d          two     = _mm256_set1_pd(2.0);
1626     x                = xx[0];
1627     f                = ff[0];
1628
1629     nri              = nlist->nri;
1630     iinr             = nlist->iinr;
1631     jindex           = nlist->jindex;
1632     jjnr             = nlist->jjnr;
1633     shiftidx         = nlist->shift;
1634     gid              = nlist->gid;
1635     shiftvec         = fr->shift_vec[0];
1636     fshift           = fr->fshift[0];
1637     facel            = _mm256_set1_pd(fr->epsfac);
1638     charge           = mdatoms->chargeA;
1639     nvdwtype         = fr->ntype;
1640     vdwparam         = fr->nbfp;
1641     vdwtype          = mdatoms->typeA;
1642
1643     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
1644     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1645     beta2            = _mm256_mul_pd(beta,beta);
1646     beta3            = _mm256_mul_pd(beta,beta2);
1647
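         /* This force-only kernel reads the force-only Coulomb table (tabq_coul_F)
          * and interpolates it linearly in the inner loops below. */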
1648     ewtab            = fr->ic->tabq_coul_F;
1649     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
1650     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1651
1652     /* Setup water-specific parameters */
1653     inr              = nlist->iinr[0];
1654     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1655     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1656     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1657     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1658
1659     jq1              = _mm256_set1_pd(charge[inr+1]);
1660     jq2              = _mm256_set1_pd(charge[inr+2]);
1661     jq3              = _mm256_set1_pd(charge[inr+3]);
1662     vdwjidx0A        = 2*vdwtype[inr+0];
1663     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1664     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1665     qq11             = _mm256_mul_pd(iq1,jq1);
1666     qq12             = _mm256_mul_pd(iq1,jq2);
1667     qq13             = _mm256_mul_pd(iq1,jq3);
1668     qq21             = _mm256_mul_pd(iq2,jq1);
1669     qq22             = _mm256_mul_pd(iq2,jq2);
1670     qq23             = _mm256_mul_pd(iq2,jq3);
1671     qq31             = _mm256_mul_pd(iq3,jq1);
1672     qq32             = _mm256_mul_pd(iq3,jq2);
1673     qq33             = _mm256_mul_pd(iq3,jq3);
1674
1675     /* When we use explicit cutoffs, the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1676     rcutoff_scalar   = fr->rcoulomb;
1677     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1678     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1679
1680     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
1681     rvdw             = _mm256_set1_pd(fr->rvdw);
1682
1683     /* Avoid spurious compiler warnings about possibly uninitialized variables */
1684     jnrA = jnrB = jnrC = jnrD = 0;
1685     j_coord_offsetA = 0;
1686     j_coord_offsetB = 0;
1687     j_coord_offsetC = 0;
1688     j_coord_offsetD = 0;
1689
1690     outeriter        = 0;
1691     inneriter        = 0;
1692
1693     for(iidx=0;iidx<4*DIM;iidx++)
1694     {
1695         scratch[iidx] = 0.0;
1696     }
1697
1698     /* Start outer loop over neighborlists */
1699     for(iidx=0; iidx<nri; iidx++)
1700     {
1701         /* Load shift vector for this list */
1702         i_shift_offset   = DIM*shiftidx[iidx];
1703
1704         /* Load limits for loop over neighbors */
1705         j_index_start    = jindex[iidx];
1706         j_index_end      = jindex[iidx+1];
1707
1708         /* Get outer coordinate index */
1709         inr              = iinr[iidx];
1710         i_coord_offset   = DIM*inr;
1711
1712         /* Load i particle coords and add shift vector */
1713         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1714                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1715
1716         fix0             = _mm256_setzero_pd();
1717         fiy0             = _mm256_setzero_pd();
1718         fiz0             = _mm256_setzero_pd();
1719         fix1             = _mm256_setzero_pd();
1720         fiy1             = _mm256_setzero_pd();
1721         fiz1             = _mm256_setzero_pd();
1722         fix2             = _mm256_setzero_pd();
1723         fiy2             = _mm256_setzero_pd();
1724         fiz2             = _mm256_setzero_pd();
1725         fix3             = _mm256_setzero_pd();
1726         fiy3             = _mm256_setzero_pd();
1727         fiz3             = _mm256_setzero_pd();
1728
1729         /* Start inner kernel loop */
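             /* This unrolled loop handles full groups of four j atoms; a negative
              * jjnr entry flags padding and is processed in the masked epilogue
              * that follows. */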
1730         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1731         {
1732
1733             /* Get j neighbor index, and coordinate index */
1734             jnrA             = jjnr[jidx];
1735             jnrB             = jjnr[jidx+1];
1736             jnrC             = jjnr[jidx+2];
1737             jnrD             = jjnr[jidx+3];
1738             j_coord_offsetA  = DIM*jnrA;
1739             j_coord_offsetB  = DIM*jnrB;
1740             j_coord_offsetC  = DIM*jnrC;
1741             j_coord_offsetD  = DIM*jnrD;
1742
1743             /* load j atom coordinates */
1744             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1745                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1746                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1747                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1748
1749             /* Calculate displacement vector */
1750             dx00             = _mm256_sub_pd(ix0,jx0);
1751             dy00             = _mm256_sub_pd(iy0,jy0);
1752             dz00             = _mm256_sub_pd(iz0,jz0);
1753             dx11             = _mm256_sub_pd(ix1,jx1);
1754             dy11             = _mm256_sub_pd(iy1,jy1);
1755             dz11             = _mm256_sub_pd(iz1,jz1);
1756             dx12             = _mm256_sub_pd(ix1,jx2);
1757             dy12             = _mm256_sub_pd(iy1,jy2);
1758             dz12             = _mm256_sub_pd(iz1,jz2);
1759             dx13             = _mm256_sub_pd(ix1,jx3);
1760             dy13             = _mm256_sub_pd(iy1,jy3);
1761             dz13             = _mm256_sub_pd(iz1,jz3);
1762             dx21             = _mm256_sub_pd(ix2,jx1);
1763             dy21             = _mm256_sub_pd(iy2,jy1);
1764             dz21             = _mm256_sub_pd(iz2,jz1);
1765             dx22             = _mm256_sub_pd(ix2,jx2);
1766             dy22             = _mm256_sub_pd(iy2,jy2);
1767             dz22             = _mm256_sub_pd(iz2,jz2);
1768             dx23             = _mm256_sub_pd(ix2,jx3);
1769             dy23             = _mm256_sub_pd(iy2,jy3);
1770             dz23             = _mm256_sub_pd(iz2,jz3);
1771             dx31             = _mm256_sub_pd(ix3,jx1);
1772             dy31             = _mm256_sub_pd(iy3,jy1);
1773             dz31             = _mm256_sub_pd(iz3,jz1);
1774             dx32             = _mm256_sub_pd(ix3,jx2);
1775             dy32             = _mm256_sub_pd(iy3,jy2);
1776             dz32             = _mm256_sub_pd(iz3,jz2);
1777             dx33             = _mm256_sub_pd(ix3,jx3);
1778             dy33             = _mm256_sub_pd(iy3,jy3);
1779             dz33             = _mm256_sub_pd(iz3,jz3);
1780
1781             /* Calculate squared distance and things based on it */
1782             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1783             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1784             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1785             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1786             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1787             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1788             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1789             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1790             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1791             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1792
1793             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1794             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1795             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
1796             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1797             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1798             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
1799             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
1800             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
1801             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
1802
1803             rinvsq00         = gmx_mm256_inv_pd(rsq00);
1804             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1805             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1806             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1807             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1808             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1809             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1810             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1811             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1812             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
1813
1814             fjx0             = _mm256_setzero_pd();
1815             fjy0             = _mm256_setzero_pd();
1816             fjz0             = _mm256_setzero_pd();
1817             fjx1             = _mm256_setzero_pd();
1818             fjy1             = _mm256_setzero_pd();
1819             fjz1             = _mm256_setzero_pd();
1820             fjx2             = _mm256_setzero_pd();
1821             fjy2             = _mm256_setzero_pd();
1822             fjz2             = _mm256_setzero_pd();
1823             fjx3             = _mm256_setzero_pd();
1824             fjy3             = _mm256_setzero_pd();
1825             fjz3             = _mm256_setzero_pd();
1826
1827             /**************************
1828              * CALCULATE INTERACTIONS *
1829              **************************/
1830
1831             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1832             {
1833
1834             /* LENNARD-JONES DISPERSION/REPULSION */
1835
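                 /* Force-only LJ path: the potential shift does not affect the force,
                  * so only the plain dispersion/repulsion derivative is evaluated. */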
1836             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1837             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1838
1839             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1840
1841             fscal            = fvdw;
1842
1843             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1844
1845             /* Calculate temporary vectorial force */
1846             tx               = _mm256_mul_pd(fscal,dx00);
1847             ty               = _mm256_mul_pd(fscal,dy00);
1848             tz               = _mm256_mul_pd(fscal,dz00);
1849
1850             /* Update vectorial force */
1851             fix0             = _mm256_add_pd(fix0,tx);
1852             fiy0             = _mm256_add_pd(fiy0,ty);
1853             fiz0             = _mm256_add_pd(fiz0,tz);
1854
1855             fjx0             = _mm256_add_pd(fjx0,tx);
1856             fjy0             = _mm256_add_pd(fjy0,ty);
1857             fjz0             = _mm256_add_pd(fjz0,tz);
1858
1859             }
1860
1861             /**************************
1862              * CALCULATE INTERACTIONS *
1863              **************************/
1864
1865             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1866             {
1867
1868             r11              = _mm256_mul_pd(rsq11,rinv11);
1869
1870             /* EWALD ELECTROSTATICS */
1871
1872             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1873             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1874             ewitab           = _mm256_cvttpd_epi32(ewrt);
1875             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1876             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1877                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1878                                             &ewtabF,&ewtabFn);
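                 /* Linear interpolation of the tabulated force:
                  * F(r) ~= (1-eps)*F[i] + eps*F[i+1]. */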
1879             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1880             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1881
1882             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1883
1884             fscal            = felec;
1885
1886             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1887
1888             /* Calculate temporary vectorial force */
1889             tx               = _mm256_mul_pd(fscal,dx11);
1890             ty               = _mm256_mul_pd(fscal,dy11);
1891             tz               = _mm256_mul_pd(fscal,dz11);
1892
1893             /* Update vectorial force */
1894             fix1             = _mm256_add_pd(fix1,tx);
1895             fiy1             = _mm256_add_pd(fiy1,ty);
1896             fiz1             = _mm256_add_pd(fiz1,tz);
1897
1898             fjx1             = _mm256_add_pd(fjx1,tx);
1899             fjy1             = _mm256_add_pd(fjy1,ty);
1900             fjz1             = _mm256_add_pd(fjz1,tz);
1901
1902             }
1903
1904             /**************************
1905              * CALCULATE INTERACTIONS *
1906              **************************/
1907
1908             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1909             {
1910
1911             r12              = _mm256_mul_pd(rsq12,rinv12);
1912
1913             /* EWALD ELECTROSTATICS */
1914
1915             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1916             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1917             ewitab           = _mm256_cvttpd_epi32(ewrt);
1918             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1919             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1920                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1921                                             &ewtabF,&ewtabFn);
1922             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1923             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1924
1925             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1926
1927             fscal            = felec;
1928
1929             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1930
1931             /* Calculate temporary vectorial force */
1932             tx               = _mm256_mul_pd(fscal,dx12);
1933             ty               = _mm256_mul_pd(fscal,dy12);
1934             tz               = _mm256_mul_pd(fscal,dz12);
1935
1936             /* Update vectorial force */
1937             fix1             = _mm256_add_pd(fix1,tx);
1938             fiy1             = _mm256_add_pd(fiy1,ty);
1939             fiz1             = _mm256_add_pd(fiz1,tz);
1940
1941             fjx2             = _mm256_add_pd(fjx2,tx);
1942             fjy2             = _mm256_add_pd(fjy2,ty);
1943             fjz2             = _mm256_add_pd(fjz2,tz);
1944
1945             }
1946
1947             /**************************
1948              * CALCULATE INTERACTIONS *
1949              **************************/
1950
1951             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1952             {
1953
1954             r13              = _mm256_mul_pd(rsq13,rinv13);
1955
1956             /* EWALD ELECTROSTATICS */
1957
1958             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1959             ewrt             = _mm256_mul_pd(r13,ewtabscale);
1960             ewitab           = _mm256_cvttpd_epi32(ewrt);
1961             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1962             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1963                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1964                                             &ewtabF,&ewtabFn);
1965             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1966             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1967
1968             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1969
1970             fscal            = felec;
1971
1972             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1973
1974             /* Calculate temporary vectorial force */
1975             tx               = _mm256_mul_pd(fscal,dx13);
1976             ty               = _mm256_mul_pd(fscal,dy13);
1977             tz               = _mm256_mul_pd(fscal,dz13);
1978
1979             /* Update vectorial force */
1980             fix1             = _mm256_add_pd(fix1,tx);
1981             fiy1             = _mm256_add_pd(fiy1,ty);
1982             fiz1             = _mm256_add_pd(fiz1,tz);
1983
1984             fjx3             = _mm256_add_pd(fjx3,tx);
1985             fjy3             = _mm256_add_pd(fjy3,ty);
1986             fjz3             = _mm256_add_pd(fjz3,tz);
1987
1988             }
1989
1990             /**************************
1991              * CALCULATE INTERACTIONS *
1992              **************************/
1993
1994             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1995             {
1996
1997             r21              = _mm256_mul_pd(rsq21,rinv21);
1998
1999             /* EWALD ELECTROSTATICS */
2000
2001             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2002             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2003             ewitab           = _mm256_cvttpd_epi32(ewrt);
2004             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2005             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2006                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2007                                             &ewtabF,&ewtabFn);
2008             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2009             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2010
2011             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2012
2013             fscal            = felec;
2014
2015             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2016
2017             /* Calculate temporary vectorial force */
2018             tx               = _mm256_mul_pd(fscal,dx21);
2019             ty               = _mm256_mul_pd(fscal,dy21);
2020             tz               = _mm256_mul_pd(fscal,dz21);
2021
2022             /* Update vectorial force */
2023             fix2             = _mm256_add_pd(fix2,tx);
2024             fiy2             = _mm256_add_pd(fiy2,ty);
2025             fiz2             = _mm256_add_pd(fiz2,tz);
2026
2027             fjx1             = _mm256_add_pd(fjx1,tx);
2028             fjy1             = _mm256_add_pd(fjy1,ty);
2029             fjz1             = _mm256_add_pd(fjz1,tz);
2030
2031             }
2032
2033             /**************************
2034              * CALCULATE INTERACTIONS *
2035              **************************/
2036
2037             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2038             {
2039
2040             r22              = _mm256_mul_pd(rsq22,rinv22);
2041
2042             /* EWALD ELECTROSTATICS */
2043
2044             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2045             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2046             ewitab           = _mm256_cvttpd_epi32(ewrt);
2047             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2048             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2049                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2050                                             &ewtabF,&ewtabFn);
2051             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2052             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2053
2054             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2055
2056             fscal            = felec;
2057
2058             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2059
2060             /* Calculate temporary vectorial force */
2061             tx               = _mm256_mul_pd(fscal,dx22);
2062             ty               = _mm256_mul_pd(fscal,dy22);
2063             tz               = _mm256_mul_pd(fscal,dz22);
2064
2065             /* Update vectorial force */
2066             fix2             = _mm256_add_pd(fix2,tx);
2067             fiy2             = _mm256_add_pd(fiy2,ty);
2068             fiz2             = _mm256_add_pd(fiz2,tz);
2069
2070             fjx2             = _mm256_add_pd(fjx2,tx);
2071             fjy2             = _mm256_add_pd(fjy2,ty);
2072             fjz2             = _mm256_add_pd(fjz2,tz);
2073
2074             }
2075
2076             /**************************
2077              * CALCULATE INTERACTIONS *
2078              **************************/
2079
2080             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2081             {
2082
2083             r23              = _mm256_mul_pd(rsq23,rinv23);
2084
2085             /* EWALD ELECTROSTATICS */
2086
2087             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2088             ewrt             = _mm256_mul_pd(r23,ewtabscale);
2089             ewitab           = _mm256_cvttpd_epi32(ewrt);
2090             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2091             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2092                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2093                                             &ewtabF,&ewtabFn);
2094             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2095             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2096
2097             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2098
2099             fscal            = felec;
2100
2101             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2102
2103             /* Calculate temporary vectorial force */
2104             tx               = _mm256_mul_pd(fscal,dx23);
2105             ty               = _mm256_mul_pd(fscal,dy23);
2106             tz               = _mm256_mul_pd(fscal,dz23);
2107
2108             /* Update vectorial force */
2109             fix2             = _mm256_add_pd(fix2,tx);
2110             fiy2             = _mm256_add_pd(fiy2,ty);
2111             fiz2             = _mm256_add_pd(fiz2,tz);
2112
2113             fjx3             = _mm256_add_pd(fjx3,tx);
2114             fjy3             = _mm256_add_pd(fjy3,ty);
2115             fjz3             = _mm256_add_pd(fjz3,tz);
2116
2117             }
2118
2119             /**************************
2120              * CALCULATE INTERACTIONS *
2121              **************************/
2122
2123             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2124             {
2125
2126             r31              = _mm256_mul_pd(rsq31,rinv31);
2127
2128             /* EWALD ELECTROSTATICS */
2129
2130             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2131             ewrt             = _mm256_mul_pd(r31,ewtabscale);
2132             ewitab           = _mm256_cvttpd_epi32(ewrt);
2133             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2134             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2135                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2136                                             &ewtabF,&ewtabFn);
2137             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2138             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2139
2140             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2141
2142             fscal            = felec;
2143
2144             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2145
2146             /* Calculate temporary vectorial force */
2147             tx               = _mm256_mul_pd(fscal,dx31);
2148             ty               = _mm256_mul_pd(fscal,dy31);
2149             tz               = _mm256_mul_pd(fscal,dz31);
2150
2151             /* Update vectorial force */
2152             fix3             = _mm256_add_pd(fix3,tx);
2153             fiy3             = _mm256_add_pd(fiy3,ty);
2154             fiz3             = _mm256_add_pd(fiz3,tz);
2155
2156             fjx1             = _mm256_add_pd(fjx1,tx);
2157             fjy1             = _mm256_add_pd(fjy1,ty);
2158             fjz1             = _mm256_add_pd(fjz1,tz);
2159
2160             }
2161
2162             /**************************
2163              * CALCULATE INTERACTIONS *
2164              **************************/
2165
2166             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2167             {
2168
2169             r32              = _mm256_mul_pd(rsq32,rinv32);
2170
2171             /* EWALD ELECTROSTATICS */
2172
2173             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2174             ewrt             = _mm256_mul_pd(r32,ewtabscale);
2175             ewitab           = _mm256_cvttpd_epi32(ewrt);
2176             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2177             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2178                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2179                                             &ewtabF,&ewtabFn);
2180             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2181             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2182
2183             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2184
2185             fscal            = felec;
2186
2187             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2188
2189             /* Calculate temporary vectorial force */
2190             tx               = _mm256_mul_pd(fscal,dx32);
2191             ty               = _mm256_mul_pd(fscal,dy32);
2192             tz               = _mm256_mul_pd(fscal,dz32);
2193
2194             /* Update vectorial force */
2195             fix3             = _mm256_add_pd(fix3,tx);
2196             fiy3             = _mm256_add_pd(fiy3,ty);
2197             fiz3             = _mm256_add_pd(fiz3,tz);
2198
2199             fjx2             = _mm256_add_pd(fjx2,tx);
2200             fjy2             = _mm256_add_pd(fjy2,ty);
2201             fjz2             = _mm256_add_pd(fjz2,tz);
2202
2203             }
2204
2205             /**************************
2206              * CALCULATE INTERACTIONS *
2207              **************************/
2208
2209             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2210             {
2211
2212             r33              = _mm256_mul_pd(rsq33,rinv33);
2213
2214             /* EWALD ELECTROSTATICS */
2215
2216             /* Calculate Ewald table index by multiplying r by the table scale and truncating to an integer */
2217             ewrt             = _mm256_mul_pd(r33,ewtabscale);
2218             ewitab           = _mm256_cvttpd_epi32(ewrt);
2219             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2220             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2221                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2222                                             &ewtabF,&ewtabFn);
2223             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2224             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2225
2226             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2227
2228             fscal            = felec;
2229
2230             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2231
2232             /* Calculate temporary vectorial force */
2233             tx               = _mm256_mul_pd(fscal,dx33);
2234             ty               = _mm256_mul_pd(fscal,dy33);
2235             tz               = _mm256_mul_pd(fscal,dz33);
2236
2237             /* Update vectorial force */
2238             fix3             = _mm256_add_pd(fix3,tx);
2239             fiy3             = _mm256_add_pd(fiy3,ty);
2240             fiz3             = _mm256_add_pd(fiz3,tz);
2241
2242             fjx3             = _mm256_add_pd(fjx3,tx);
2243             fjy3             = _mm256_add_pd(fjy3,ty);
2244             fjz3             = _mm256_add_pd(fjz3,tz);
2245
2246             }
2247
2248             fjptrA             = f+j_coord_offsetA;
2249             fjptrB             = f+j_coord_offsetB;
2250             fjptrC             = f+j_coord_offsetC;
2251             fjptrD             = f+j_coord_offsetD;
2252
2253             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2254                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2255                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
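            /* The accumulated j forces (four atoms x three components, one water molecule
             * per SIMD lane) are transposed back to per-molecule layout and subtracted
             * from f[], the counterpart of the additions made to the i-force accumulators
             * above.
             */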
2256
2257             /* Inner loop uses 384 flops */
2258         }
2259
2260         if(jidx<j_index_end)
2261         {
2262
2263             /* Get j neighbor indices and coordinate offsets */
2264             jnrlistA         = jjnr[jidx];
2265             jnrlistB         = jjnr[jidx+1];
2266             jnrlistC         = jjnr[jidx+2];
2267             jnrlistD         = jjnr[jidx+3];
2268             /* Sign of each element will be negative for non-real (padding) atoms.
2269              * The resulting mask is 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2270              * so it can be used as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
2271              */
2272             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2273
2274             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2275             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2276             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
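            /* The four 32-bit comparison results are duplicated pairwise by the permutes
             * and the two 128-bit halves combined, so each 64-bit double lane of dummy_mask
             * carries the mask of the corresponding j entry.
             */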
2277
2278             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2279             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2280             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2281             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2282             j_coord_offsetA  = DIM*jnrA;
2283             j_coord_offsetB  = DIM*jnrB;
2284             j_coord_offsetC  = DIM*jnrC;
2285             j_coord_offsetD  = DIM*jnrD;
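            /* Negative (padding) indices are clamped to 0 so the coordinate gather below
             * stays in bounds; their spurious contributions are removed later via
             * dummy_mask and by routing their forces to the scratch buffer.
             */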
2286
2287             /* load j atom coordinates */
2288             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2289                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2290                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2291                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
2292
2293             /* Calculate displacement vectors */
2294             dx00             = _mm256_sub_pd(ix0,jx0);
2295             dy00             = _mm256_sub_pd(iy0,jy0);
2296             dz00             = _mm256_sub_pd(iz0,jz0);
2297             dx11             = _mm256_sub_pd(ix1,jx1);
2298             dy11             = _mm256_sub_pd(iy1,jy1);
2299             dz11             = _mm256_sub_pd(iz1,jz1);
2300             dx12             = _mm256_sub_pd(ix1,jx2);
2301             dy12             = _mm256_sub_pd(iy1,jy2);
2302             dz12             = _mm256_sub_pd(iz1,jz2);
2303             dx13             = _mm256_sub_pd(ix1,jx3);
2304             dy13             = _mm256_sub_pd(iy1,jy3);
2305             dz13             = _mm256_sub_pd(iz1,jz3);
2306             dx21             = _mm256_sub_pd(ix2,jx1);
2307             dy21             = _mm256_sub_pd(iy2,jy1);
2308             dz21             = _mm256_sub_pd(iz2,jz1);
2309             dx22             = _mm256_sub_pd(ix2,jx2);
2310             dy22             = _mm256_sub_pd(iy2,jy2);
2311             dz22             = _mm256_sub_pd(iz2,jz2);
2312             dx23             = _mm256_sub_pd(ix2,jx3);
2313             dy23             = _mm256_sub_pd(iy2,jy3);
2314             dz23             = _mm256_sub_pd(iz2,jz3);
2315             dx31             = _mm256_sub_pd(ix3,jx1);
2316             dy31             = _mm256_sub_pd(iy3,jy1);
2317             dz31             = _mm256_sub_pd(iz3,jz1);
2318             dx32             = _mm256_sub_pd(ix3,jx2);
2319             dy32             = _mm256_sub_pd(iy3,jy2);
2320             dz32             = _mm256_sub_pd(iz3,jz2);
2321             dx33             = _mm256_sub_pd(ix3,jx3);
2322             dy33             = _mm256_sub_pd(iy3,jy3);
2323             dz33             = _mm256_sub_pd(iz3,jz3);
2324
2325             /* Calculate squared distances and the quantities derived from them */
2326             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2327             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2328             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2329             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2330             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2331             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2332             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2333             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2334             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2335             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
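            /* Ten site-site pairs are handled per water-water interaction: the 0-0 pair
             * (typically the LJ-only oxygen sites) plus the nine pairs among sites 1-3,
             * which carry the charges in this four-site geometry.
             */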
2336
2337             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
2338             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
2339             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
2340             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
2341             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
2342             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
2343             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
2344             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
2345             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
2346
2347             rinvsq00         = gmx_mm256_inv_pd(rsq00);
2348             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
2349             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
2350             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
2351             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
2352             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
2353             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
2354             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
2355             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
2356             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
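            /* rinvsq00 is taken directly as 1/rsq00 rather than by squaring an inverse
             * square root: the 0-0 pair has only Lennard-Jones, which needs only even
             * powers of 1/r, whereas the charged pairs also need rinv for r and for the
             * Coulomb term.
             */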
2357
2358             fjx0             = _mm256_setzero_pd();
2359             fjy0             = _mm256_setzero_pd();
2360             fjz0             = _mm256_setzero_pd();
2361             fjx1             = _mm256_setzero_pd();
2362             fjy1             = _mm256_setzero_pd();
2363             fjz1             = _mm256_setzero_pd();
2364             fjx2             = _mm256_setzero_pd();
2365             fjy2             = _mm256_setzero_pd();
2366             fjz2             = _mm256_setzero_pd();
2367             fjx3             = _mm256_setzero_pd();
2368             fjy3             = _mm256_setzero_pd();
2369             fjz3             = _mm256_setzero_pd();
2370
2371             /**************************
2372              * CALCULATE INTERACTIONS *
2373              **************************/
2374
2375             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2376             {
2377
2378             /* LENNARD-JONES DISPERSION/REPULSION */
2379
2380             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2381             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
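            /* Scalar sketch of the line above: fvdw ~= (c12_00*rinv^12 - c6_00*rinv^6)*rinv^2,
             * i.e. the Lennard-Jones force divided by r (the c6/c12 parameters are assumed
             * to be stored with their 6/12 prefactors, as is the GROMACS convention).
             */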
2382
2383             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2384
2385             fscal            = fvdw;
2386
2387             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2388
2389             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
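            /* In this remainder loop dummy_mask additionally clears fscal for padding
             * lanes; the full inner loop above omits this step because all four j entries
             * are real there.
             */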
2390
2391             /* Calculate temporary vectorial force */
2392             tx               = _mm256_mul_pd(fscal,dx00);
2393             ty               = _mm256_mul_pd(fscal,dy00);
2394             tz               = _mm256_mul_pd(fscal,dz00);
2395
2396             /* Update vectorial force */
2397             fix0             = _mm256_add_pd(fix0,tx);
2398             fiy0             = _mm256_add_pd(fiy0,ty);
2399             fiz0             = _mm256_add_pd(fiz0,tz);
2400
2401             fjx0             = _mm256_add_pd(fjx0,tx);
2402             fjy0             = _mm256_add_pd(fjy0,ty);
2403             fjz0             = _mm256_add_pd(fjz0,tz);
2404
2405             }
2406
2407             /**************************
2408              * CALCULATE INTERACTIONS *
2409              **************************/
2410
2411             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2412             {
2413
2414             r11              = _mm256_mul_pd(rsq11,rinv11);
2415             r11              = _mm256_andnot_pd(dummy_mask,r11);
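            /* r is cleared for padding lanes before the table lookup so that garbage
             * coordinates cannot produce an out-of-range Ewald table index.
             */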
2416
2417             /* EWALD ELECTROSTATICS */
2418
2419             /* Calculate Ewald table index by multiplying r by the table scale and truncating to an integer */
2420             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2421             ewitab           = _mm256_cvttpd_epi32(ewrt);
2422             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2423             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2424                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2425                                             &ewtabF,&ewtabFn);
2426             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2427             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2428
2429             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2430
2431             fscal            = felec;
2432
2433             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2434
2435             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2436
2437             /* Calculate temporary vectorial force */
2438             tx               = _mm256_mul_pd(fscal,dx11);
2439             ty               = _mm256_mul_pd(fscal,dy11);
2440             tz               = _mm256_mul_pd(fscal,dz11);
2441
2442             /* Update vectorial force */
2443             fix1             = _mm256_add_pd(fix1,tx);
2444             fiy1             = _mm256_add_pd(fiy1,ty);
2445             fiz1             = _mm256_add_pd(fiz1,tz);
2446
2447             fjx1             = _mm256_add_pd(fjx1,tx);
2448             fjy1             = _mm256_add_pd(fjy1,ty);
2449             fjz1             = _mm256_add_pd(fjz1,tz);
2450
2451             }
2452
2453             /**************************
2454              * CALCULATE INTERACTIONS *
2455              **************************/
2456
2457             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2458             {
2459
2460             r12              = _mm256_mul_pd(rsq12,rinv12);
2461             r12              = _mm256_andnot_pd(dummy_mask,r12);
2462
2463             /* EWALD ELECTROSTATICS */
2464
2465             /* Calculate Ewald table index by multiplying r by the table scale and truncating to an integer */
2466             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2467             ewitab           = _mm256_cvttpd_epi32(ewrt);
2468             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2469             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2470                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2471                                             &ewtabF,&ewtabFn);
2472             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2473             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2474
2475             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2476
2477             fscal            = felec;
2478
2479             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2480
2481             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2482
2483             /* Calculate temporary vectorial force */
2484             tx               = _mm256_mul_pd(fscal,dx12);
2485             ty               = _mm256_mul_pd(fscal,dy12);
2486             tz               = _mm256_mul_pd(fscal,dz12);
2487
2488             /* Update vectorial force */
2489             fix1             = _mm256_add_pd(fix1,tx);
2490             fiy1             = _mm256_add_pd(fiy1,ty);
2491             fiz1             = _mm256_add_pd(fiz1,tz);
2492
2493             fjx2             = _mm256_add_pd(fjx2,tx);
2494             fjy2             = _mm256_add_pd(fjy2,ty);
2495             fjz2             = _mm256_add_pd(fjz2,tz);
2496
2497             }
2498
2499             /**************************
2500              * CALCULATE INTERACTIONS *
2501              **************************/
2502
2503             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2504             {
2505
2506             r13              = _mm256_mul_pd(rsq13,rinv13);
2507             r13              = _mm256_andnot_pd(dummy_mask,r13);
2508
2509             /* EWALD ELECTROSTATICS */
2510
2511             /* Calculate Ewald table index by multiplying r by the table scale and truncating to an integer */
2512             ewrt             = _mm256_mul_pd(r13,ewtabscale);
2513             ewitab           = _mm256_cvttpd_epi32(ewrt);
2514             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2515             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2516                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2517                                             &ewtabF,&ewtabFn);
2518             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2519             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2520
2521             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2522
2523             fscal            = felec;
2524
2525             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2526
2527             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2528
2529             /* Calculate temporary vectorial force */
2530             tx               = _mm256_mul_pd(fscal,dx13);
2531             ty               = _mm256_mul_pd(fscal,dy13);
2532             tz               = _mm256_mul_pd(fscal,dz13);
2533
2534             /* Update vectorial force */
2535             fix1             = _mm256_add_pd(fix1,tx);
2536             fiy1             = _mm256_add_pd(fiy1,ty);
2537             fiz1             = _mm256_add_pd(fiz1,tz);
2538
2539             fjx3             = _mm256_add_pd(fjx3,tx);
2540             fjy3             = _mm256_add_pd(fjy3,ty);
2541             fjz3             = _mm256_add_pd(fjz3,tz);
2542
2543             }
2544
2545             /**************************
2546              * CALCULATE INTERACTIONS *
2547              **************************/
2548
2549             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2550             {
2551
2552             r21              = _mm256_mul_pd(rsq21,rinv21);
2553             r21              = _mm256_andnot_pd(dummy_mask,r21);
2554
2555             /* EWALD ELECTROSTATICS */
2556
2557             /* Calculate Ewald table index by multiplying r by the table scale and truncating to an integer */
2558             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2559             ewitab           = _mm256_cvttpd_epi32(ewrt);
2560             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2561             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2562                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2563                                             &ewtabF,&ewtabFn);
2564             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2565             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2566
2567             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2568
2569             fscal            = felec;
2570
2571             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2572
2573             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2574
2575             /* Calculate temporary vectorial force */
2576             tx               = _mm256_mul_pd(fscal,dx21);
2577             ty               = _mm256_mul_pd(fscal,dy21);
2578             tz               = _mm256_mul_pd(fscal,dz21);
2579
2580             /* Update vectorial force */
2581             fix2             = _mm256_add_pd(fix2,tx);
2582             fiy2             = _mm256_add_pd(fiy2,ty);
2583             fiz2             = _mm256_add_pd(fiz2,tz);
2584
2585             fjx1             = _mm256_add_pd(fjx1,tx);
2586             fjy1             = _mm256_add_pd(fjy1,ty);
2587             fjz1             = _mm256_add_pd(fjz1,tz);
2588
2589             }
2590
2591             /**************************
2592              * CALCULATE INTERACTIONS *
2593              **************************/
2594
2595             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2596             {
2597
2598             r22              = _mm256_mul_pd(rsq22,rinv22);
2599             r22              = _mm256_andnot_pd(dummy_mask,r22);
2600
2601             /* EWALD ELECTROSTATICS */
2602
2603             /* Calculate Ewald table index by multiplying r by the table scale and truncating to an integer */
2604             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2605             ewitab           = _mm256_cvttpd_epi32(ewrt);
2606             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2607             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2608                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2609                                             &ewtabF,&ewtabFn);
2610             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2611             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2612
2613             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2614
2615             fscal            = felec;
2616
2617             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2618
2619             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2620
2621             /* Calculate temporary vectorial force */
2622             tx               = _mm256_mul_pd(fscal,dx22);
2623             ty               = _mm256_mul_pd(fscal,dy22);
2624             tz               = _mm256_mul_pd(fscal,dz22);
2625
2626             /* Update vectorial force */
2627             fix2             = _mm256_add_pd(fix2,tx);
2628             fiy2             = _mm256_add_pd(fiy2,ty);
2629             fiz2             = _mm256_add_pd(fiz2,tz);
2630
2631             fjx2             = _mm256_add_pd(fjx2,tx);
2632             fjy2             = _mm256_add_pd(fjy2,ty);
2633             fjz2             = _mm256_add_pd(fjz2,tz);
2634
2635             }
2636
2637             /**************************
2638              * CALCULATE INTERACTIONS *
2639              **************************/
2640
2641             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2642             {
2643
2644             r23              = _mm256_mul_pd(rsq23,rinv23);
2645             r23              = _mm256_andnot_pd(dummy_mask,r23);
2646
2647             /* EWALD ELECTROSTATICS */
2648
2649             /* Calculate Ewald table index by multiplying r by the table scale and truncating to an integer */
2650             ewrt             = _mm256_mul_pd(r23,ewtabscale);
2651             ewitab           = _mm256_cvttpd_epi32(ewrt);
2652             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2653             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2654                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2655                                             &ewtabF,&ewtabFn);
2656             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2657             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2658
2659             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2660
2661             fscal            = felec;
2662
2663             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2664
2665             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2666
2667             /* Calculate temporary vectorial force */
2668             tx               = _mm256_mul_pd(fscal,dx23);
2669             ty               = _mm256_mul_pd(fscal,dy23);
2670             tz               = _mm256_mul_pd(fscal,dz23);
2671
2672             /* Update vectorial force */
2673             fix2             = _mm256_add_pd(fix2,tx);
2674             fiy2             = _mm256_add_pd(fiy2,ty);
2675             fiz2             = _mm256_add_pd(fiz2,tz);
2676
2677             fjx3             = _mm256_add_pd(fjx3,tx);
2678             fjy3             = _mm256_add_pd(fjy3,ty);
2679             fjz3             = _mm256_add_pd(fjz3,tz);
2680
2681             }
2682
2683             /**************************
2684              * CALCULATE INTERACTIONS *
2685              **************************/
2686
2687             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2688             {
2689
2690             r31              = _mm256_mul_pd(rsq31,rinv31);
2691             r31              = _mm256_andnot_pd(dummy_mask,r31);
2692
2693             /* EWALD ELECTROSTATICS */
2694
2695             /* Calculate Ewald table index by multiplying r by the table scale and truncating to an integer */
2696             ewrt             = _mm256_mul_pd(r31,ewtabscale);
2697             ewitab           = _mm256_cvttpd_epi32(ewrt);
2698             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2699             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2700                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2701                                             &ewtabF,&ewtabFn);
2702             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2703             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2704
2705             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2706
2707             fscal            = felec;
2708
2709             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2710
2711             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2712
2713             /* Calculate temporary vectorial force */
2714             tx               = _mm256_mul_pd(fscal,dx31);
2715             ty               = _mm256_mul_pd(fscal,dy31);
2716             tz               = _mm256_mul_pd(fscal,dz31);
2717
2718             /* Update vectorial force */
2719             fix3             = _mm256_add_pd(fix3,tx);
2720             fiy3             = _mm256_add_pd(fiy3,ty);
2721             fiz3             = _mm256_add_pd(fiz3,tz);
2722
2723             fjx1             = _mm256_add_pd(fjx1,tx);
2724             fjy1             = _mm256_add_pd(fjy1,ty);
2725             fjz1             = _mm256_add_pd(fjz1,tz);
2726
2727             }
2728
2729             /**************************
2730              * CALCULATE INTERACTIONS *
2731              **************************/
2732
2733             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2734             {
2735
2736             r32              = _mm256_mul_pd(rsq32,rinv32);
2737             r32              = _mm256_andnot_pd(dummy_mask,r32);
2738
2739             /* EWALD ELECTROSTATICS */
2740
2741             /* Calculate Ewald table index by multiplying r by the table scale and truncating to an integer */
2742             ewrt             = _mm256_mul_pd(r32,ewtabscale);
2743             ewitab           = _mm256_cvttpd_epi32(ewrt);
2744             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2745             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2746                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2747                                             &ewtabF,&ewtabFn);
2748             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2749             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2750
2751             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2752
2753             fscal            = felec;
2754
2755             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2756
2757             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2758
2759             /* Calculate temporary vectorial force */
2760             tx               = _mm256_mul_pd(fscal,dx32);
2761             ty               = _mm256_mul_pd(fscal,dy32);
2762             tz               = _mm256_mul_pd(fscal,dz32);
2763
2764             /* Update vectorial force */
2765             fix3             = _mm256_add_pd(fix3,tx);
2766             fiy3             = _mm256_add_pd(fiy3,ty);
2767             fiz3             = _mm256_add_pd(fiz3,tz);
2768
2769             fjx2             = _mm256_add_pd(fjx2,tx);
2770             fjy2             = _mm256_add_pd(fjy2,ty);
2771             fjz2             = _mm256_add_pd(fjz2,tz);
2772
2773             }
2774
2775             /**************************
2776              * CALCULATE INTERACTIONS *
2777              **************************/
2778
2779             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2780             {
2781
2782             r33              = _mm256_mul_pd(rsq33,rinv33);
2783             r33              = _mm256_andnot_pd(dummy_mask,r33);
2784
2785             /* EWALD ELECTROSTATICS */
2786
2787             /* Calculate Ewald table index by multiplying r by the table scale and truncating to an integer */
2788             ewrt             = _mm256_mul_pd(r33,ewtabscale);
2789             ewitab           = _mm256_cvttpd_epi32(ewrt);
2790             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2791             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2792                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2793                                             &ewtabF,&ewtabFn);
2794             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2795             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2796
2797             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2798
2799             fscal            = felec;
2800
2801             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2802
2803             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2804
2805             /* Calculate temporary vectorial force */
2806             tx               = _mm256_mul_pd(fscal,dx33);
2807             ty               = _mm256_mul_pd(fscal,dy33);
2808             tz               = _mm256_mul_pd(fscal,dz33);
2809
2810             /* Update vectorial force */
2811             fix3             = _mm256_add_pd(fix3,tx);
2812             fiy3             = _mm256_add_pd(fiy3,ty);
2813             fiz3             = _mm256_add_pd(fiz3,tz);
2814
2815             fjx3             = _mm256_add_pd(fjx3,tx);
2816             fjy3             = _mm256_add_pd(fjy3,ty);
2817             fjz3             = _mm256_add_pd(fjz3,tz);
2818
2819             }
2820
2821             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2822             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2823             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2824             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
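            /* Padding lanes point at the scratch buffer instead of f[], so their force
             * contributions are written somewhere harmless and never touch real atoms.
             */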
2825
2826             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2827                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2828                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2829
2830             /* Inner loop uses 393 flops */
2831         }
2832
2833         /* End of innermost loop */
2834
2835         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2836                                                  f+i_coord_offset,fshift+i_shift_offset);
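        /* Each i-atom force accumulator is reduced across the four SIMD lanes and added
         * to f[] at the i coordinate offset, with the same sum added to the shift-force
         * array used for the virial.
         */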
2837
2838         /* Increment number of inner iterations */
2839         inneriter                  += j_index_end - j_index_start;
2840
2841         /* Outer loop uses 24 flops */
2842     }
2843
2844     /* Increment number of outer iterations */
2845     outeriter        += nri;
2846
2847     /* Update outer/inner flops */
2848
2849     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*393);
2850 }
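
/*
 * Illustrative only: a scalar sketch of the per-pair arithmetic that the SIMD
 * intrinsics in the kernels above implement.  It is guarded by #if 0 and never
 * compiled; the helper name scalar_pair_force() and its parameters are
 * placeholders rather than symbols used elsewhere in GROMACS, and the exact
 * contents of the Ewald force table are assumed, not verified here.
 */
#if 0
static double
scalar_pair_force(double rsq, double rinv, double qq, double c6, double c12,
                  double ewtabscale, const double *ewtabF, double rcutoff2)
{
    double fscal = 0.0;

    if (rsq < rcutoff2)
    {
        /* Ewald electrostatics: linear interpolation of the tabulated correction,
         * applied only to the charged site pairs in the kernels above.
         */
        double r     = rsq*rinv;
        double ewrt  = r*ewtabscale;
        int    idx   = (int)ewrt;
        double eps   = ewrt - idx;
        double ftab  = (1.0 - eps)*ewtabF[idx] + eps*ewtabF[idx+1];
        double felec = qq*rinv*(rinv*rinv - ftab);

        /* Lennard-Jones, applied only to the 0-0 pair in the kernels above; c6/c12
         * are assumed to carry their 6/12 prefactors as stored by GROMACS.
         */
        double rinvsix = (rinv*rinv)*(rinv*rinv)*(rinv*rinv);
        double fvdw    = (c12*rinvsix - c6)*rinvsix*rinv*rinv;

        /* fscal is F/r; multiplying by dx, dy, dz gives the Cartesian force. */
        fscal = felec + fvdw;
    }

    return fscal;
}
#endif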