Remove no-inline-max-size and suppress remark
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_double
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_double
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
81     real             rcutoff_scalar;
82     real             *shiftvec,*fshift,*x,*f;
83     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
84     real             scratch[4*DIM];
85     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86     real *           vdwioffsetptr0;
87     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88     real *           vdwioffsetptr1;
89     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92     real *           vdwioffsetptr3;
93     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
95     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
97     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
99     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
101     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
102     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
103     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
104     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
105     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
106     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
107     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
108     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
109     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
110     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
111     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
112     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
113     real             *charge;
114     int              nvdwtype;
115     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
116     int              *vdwtype;
117     real             *vdwparam;
118     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
119     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
120     __m128i          ewitab;
121     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
122     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
123     real             *ewtab;
124     __m256d          dummy_mask,cutoff_mask;
125     __m128           tmpmask0,tmpmask1;
126     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
127     __m256d          one     = _mm256_set1_pd(1.0);
128     __m256d          two     = _mm256_set1_pd(2.0);
129     x                = xx[0];
130     f                = ff[0];
131
132     nri              = nlist->nri;
133     iinr             = nlist->iinr;
134     jindex           = nlist->jindex;
135     jjnr             = nlist->jjnr;
136     shiftidx         = nlist->shift;
137     gid              = nlist->gid;
138     shiftvec         = fr->shift_vec[0];
139     fshift           = fr->fshift[0];
140     facel            = _mm256_set1_pd(fr->epsfac);
141     charge           = mdatoms->chargeA;
142     nvdwtype         = fr->ntype;
143     vdwparam         = fr->nbfp;
144     vdwtype          = mdatoms->typeA;
145
146     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
147     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
148     beta2            = _mm256_mul_pd(beta,beta);
149     beta3            = _mm256_mul_pd(beta,beta2);
150
151     ewtab            = fr->ic->tabq_coul_FDV0;
152     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
153     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
154
155     /* Setup water-specific parameters */
156     inr              = nlist->iinr[0];
157     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
158     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
159     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
160     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
161
162     jq1              = _mm256_set1_pd(charge[inr+1]);
163     jq2              = _mm256_set1_pd(charge[inr+2]);
164     jq3              = _mm256_set1_pd(charge[inr+3]);
165     vdwjidx0A        = 2*vdwtype[inr+0];
166     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
167     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
168     qq11             = _mm256_mul_pd(iq1,jq1);
169     qq12             = _mm256_mul_pd(iq1,jq2);
170     qq13             = _mm256_mul_pd(iq1,jq3);
171     qq21             = _mm256_mul_pd(iq2,jq1);
172     qq22             = _mm256_mul_pd(iq2,jq2);
173     qq23             = _mm256_mul_pd(iq2,jq3);
174     qq31             = _mm256_mul_pd(iq3,jq1);
175     qq32             = _mm256_mul_pd(iq3,jq2);
176     qq33             = _mm256_mul_pd(iq3,jq3);
177
178     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
179     rcutoff_scalar   = fr->rcoulomb;
180     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
181     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
182
183     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
184     rvdw             = _mm256_set1_pd(fr->rvdw);
185
186     /* Avoid stupid compiler warnings */
187     jnrA = jnrB = jnrC = jnrD = 0;
188     j_coord_offsetA = 0;
189     j_coord_offsetB = 0;
190     j_coord_offsetC = 0;
191     j_coord_offsetD = 0;
192
193     outeriter        = 0;
194     inneriter        = 0;
195
196     for(iidx=0;iidx<4*DIM;iidx++)
197     {
198         scratch[iidx] = 0.0;
199     }
200
201     /* Start outer loop over neighborlists */
202     for(iidx=0; iidx<nri; iidx++)
203     {
204         /* Load shift vector for this list */
205         i_shift_offset   = DIM*shiftidx[iidx];
206
207         /* Load limits for loop over neighbors */
208         j_index_start    = jindex[iidx];
209         j_index_end      = jindex[iidx+1];
210
211         /* Get outer coordinate index */
212         inr              = iinr[iidx];
213         i_coord_offset   = DIM*inr;
214
215         /* Load i particle coords and add shift vector */
216         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
217                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
218
219         fix0             = _mm256_setzero_pd();
220         fiy0             = _mm256_setzero_pd();
221         fiz0             = _mm256_setzero_pd();
222         fix1             = _mm256_setzero_pd();
223         fiy1             = _mm256_setzero_pd();
224         fiz1             = _mm256_setzero_pd();
225         fix2             = _mm256_setzero_pd();
226         fiy2             = _mm256_setzero_pd();
227         fiz2             = _mm256_setzero_pd();
228         fix3             = _mm256_setzero_pd();
229         fiy3             = _mm256_setzero_pd();
230         fiz3             = _mm256_setzero_pd();
231
232         /* Reset potential sums */
233         velecsum         = _mm256_setzero_pd();
234         vvdwsum          = _mm256_setzero_pd();
235
236         /* Start inner kernel loop */
237         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
238         {
239
240             /* Get j neighbor index, and coordinate index */
241             jnrA             = jjnr[jidx];
242             jnrB             = jjnr[jidx+1];
243             jnrC             = jjnr[jidx+2];
244             jnrD             = jjnr[jidx+3];
245             j_coord_offsetA  = DIM*jnrA;
246             j_coord_offsetB  = DIM*jnrB;
247             j_coord_offsetC  = DIM*jnrC;
248             j_coord_offsetD  = DIM*jnrD;
249
250             /* load j atom coordinates */
251             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
252                                                  x+j_coord_offsetC,x+j_coord_offsetD,
253                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
254                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
255
256             /* Calculate displacement vector */
257             dx00             = _mm256_sub_pd(ix0,jx0);
258             dy00             = _mm256_sub_pd(iy0,jy0);
259             dz00             = _mm256_sub_pd(iz0,jz0);
260             dx11             = _mm256_sub_pd(ix1,jx1);
261             dy11             = _mm256_sub_pd(iy1,jy1);
262             dz11             = _mm256_sub_pd(iz1,jz1);
263             dx12             = _mm256_sub_pd(ix1,jx2);
264             dy12             = _mm256_sub_pd(iy1,jy2);
265             dz12             = _mm256_sub_pd(iz1,jz2);
266             dx13             = _mm256_sub_pd(ix1,jx3);
267             dy13             = _mm256_sub_pd(iy1,jy3);
268             dz13             = _mm256_sub_pd(iz1,jz3);
269             dx21             = _mm256_sub_pd(ix2,jx1);
270             dy21             = _mm256_sub_pd(iy2,jy1);
271             dz21             = _mm256_sub_pd(iz2,jz1);
272             dx22             = _mm256_sub_pd(ix2,jx2);
273             dy22             = _mm256_sub_pd(iy2,jy2);
274             dz22             = _mm256_sub_pd(iz2,jz2);
275             dx23             = _mm256_sub_pd(ix2,jx3);
276             dy23             = _mm256_sub_pd(iy2,jy3);
277             dz23             = _mm256_sub_pd(iz2,jz3);
278             dx31             = _mm256_sub_pd(ix3,jx1);
279             dy31             = _mm256_sub_pd(iy3,jy1);
280             dz31             = _mm256_sub_pd(iz3,jz1);
281             dx32             = _mm256_sub_pd(ix3,jx2);
282             dy32             = _mm256_sub_pd(iy3,jy2);
283             dz32             = _mm256_sub_pd(iz3,jz2);
284             dx33             = _mm256_sub_pd(ix3,jx3);
285             dy33             = _mm256_sub_pd(iy3,jy3);
286             dz33             = _mm256_sub_pd(iz3,jz3);
287
288             /* Calculate squared distance and things based on it */
289             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
290             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
291             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
292             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
293             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
294             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
295             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
296             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
297             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
298             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
299
300             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
301             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
302             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
303             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
304             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
305             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
306             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
307             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
308             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
309
310             rinvsq00         = gmx_mm256_inv_pd(rsq00);
311             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
312             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
313             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
314             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
315             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
316             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
317             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
318             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
319             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
320
321             fjx0             = _mm256_setzero_pd();
322             fjy0             = _mm256_setzero_pd();
323             fjz0             = _mm256_setzero_pd();
324             fjx1             = _mm256_setzero_pd();
325             fjy1             = _mm256_setzero_pd();
326             fjz1             = _mm256_setzero_pd();
327             fjx2             = _mm256_setzero_pd();
328             fjy2             = _mm256_setzero_pd();
329             fjz2             = _mm256_setzero_pd();
330             fjx3             = _mm256_setzero_pd();
331             fjy3             = _mm256_setzero_pd();
332             fjz3             = _mm256_setzero_pd();
333
334             /**************************
335              * CALCULATE INTERACTIONS *
336              **************************/
337
338             if (gmx_mm256_any_lt(rsq00,rcutoff2))
339             {
340
341             /* LENNARD-JONES DISPERSION/REPULSION */
342
343             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
344             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
345             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
346             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
347                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
348             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
349
350             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
351
352             /* Update potential sum for this i atom from the interaction with this j atom. */
353             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
354             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
355
356             fscal            = fvdw;
357
358             fscal            = _mm256_and_pd(fscal,cutoff_mask);
359
360             /* Calculate temporary vectorial force */
361             tx               = _mm256_mul_pd(fscal,dx00);
362             ty               = _mm256_mul_pd(fscal,dy00);
363             tz               = _mm256_mul_pd(fscal,dz00);
364
365             /* Update vectorial force */
366             fix0             = _mm256_add_pd(fix0,tx);
367             fiy0             = _mm256_add_pd(fiy0,ty);
368             fiz0             = _mm256_add_pd(fiz0,tz);
369
370             fjx0             = _mm256_add_pd(fjx0,tx);
371             fjy0             = _mm256_add_pd(fjy0,ty);
372             fjz0             = _mm256_add_pd(fjz0,tz);
373
374             }
375
376             /**************************
377              * CALCULATE INTERACTIONS *
378              **************************/
379
380             if (gmx_mm256_any_lt(rsq11,rcutoff2))
381             {
382
383             r11              = _mm256_mul_pd(rsq11,rinv11);
384
385             /* EWALD ELECTROSTATICS */
386
387             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
388             ewrt             = _mm256_mul_pd(r11,ewtabscale);
389             ewitab           = _mm256_cvttpd_epi32(ewrt);
390             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
391             ewitab           = _mm_slli_epi32(ewitab,2);
392             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
393             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
394             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
395             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
396             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
397             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
398             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
399             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
400             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
401
402             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
403
404             /* Update potential sum for this i atom from the interaction with this j atom. */
405             velec            = _mm256_and_pd(velec,cutoff_mask);
406             velecsum         = _mm256_add_pd(velecsum,velec);
407
408             fscal            = felec;
409
410             fscal            = _mm256_and_pd(fscal,cutoff_mask);
411
412             /* Calculate temporary vectorial force */
413             tx               = _mm256_mul_pd(fscal,dx11);
414             ty               = _mm256_mul_pd(fscal,dy11);
415             tz               = _mm256_mul_pd(fscal,dz11);
416
417             /* Update vectorial force */
418             fix1             = _mm256_add_pd(fix1,tx);
419             fiy1             = _mm256_add_pd(fiy1,ty);
420             fiz1             = _mm256_add_pd(fiz1,tz);
421
422             fjx1             = _mm256_add_pd(fjx1,tx);
423             fjy1             = _mm256_add_pd(fjy1,ty);
424             fjz1             = _mm256_add_pd(fjz1,tz);
425
426             }
427
428             /**************************
429              * CALCULATE INTERACTIONS *
430              **************************/
431
432             if (gmx_mm256_any_lt(rsq12,rcutoff2))
433             {
434
435             r12              = _mm256_mul_pd(rsq12,rinv12);
436
437             /* EWALD ELECTROSTATICS */
438
439             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
440             ewrt             = _mm256_mul_pd(r12,ewtabscale);
441             ewitab           = _mm256_cvttpd_epi32(ewrt);
442             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
443             ewitab           = _mm_slli_epi32(ewitab,2);
444             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
445             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
446             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
447             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
448             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
449             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
450             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
451             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
452             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
453
454             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
455
456             /* Update potential sum for this i atom from the interaction with this j atom. */
457             velec            = _mm256_and_pd(velec,cutoff_mask);
458             velecsum         = _mm256_add_pd(velecsum,velec);
459
460             fscal            = felec;
461
462             fscal            = _mm256_and_pd(fscal,cutoff_mask);
463
464             /* Calculate temporary vectorial force */
465             tx               = _mm256_mul_pd(fscal,dx12);
466             ty               = _mm256_mul_pd(fscal,dy12);
467             tz               = _mm256_mul_pd(fscal,dz12);
468
469             /* Update vectorial force */
470             fix1             = _mm256_add_pd(fix1,tx);
471             fiy1             = _mm256_add_pd(fiy1,ty);
472             fiz1             = _mm256_add_pd(fiz1,tz);
473
474             fjx2             = _mm256_add_pd(fjx2,tx);
475             fjy2             = _mm256_add_pd(fjy2,ty);
476             fjz2             = _mm256_add_pd(fjz2,tz);
477
478             }
479
480             /**************************
481              * CALCULATE INTERACTIONS *
482              **************************/
483
484             if (gmx_mm256_any_lt(rsq13,rcutoff2))
485             {
486
487             r13              = _mm256_mul_pd(rsq13,rinv13);
488
489             /* EWALD ELECTROSTATICS */
490
491             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
492             ewrt             = _mm256_mul_pd(r13,ewtabscale);
493             ewitab           = _mm256_cvttpd_epi32(ewrt);
494             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
495             ewitab           = _mm_slli_epi32(ewitab,2);
496             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
497             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
498             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
499             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
500             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
501             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
502             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
503             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_sub_pd(rinv13,sh_ewald),velec));
504             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
505
506             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
507
508             /* Update potential sum for this i atom from the interaction with this j atom. */
509             velec            = _mm256_and_pd(velec,cutoff_mask);
510             velecsum         = _mm256_add_pd(velecsum,velec);
511
512             fscal            = felec;
513
514             fscal            = _mm256_and_pd(fscal,cutoff_mask);
515
516             /* Calculate temporary vectorial force */
517             tx               = _mm256_mul_pd(fscal,dx13);
518             ty               = _mm256_mul_pd(fscal,dy13);
519             tz               = _mm256_mul_pd(fscal,dz13);
520
521             /* Update vectorial force */
522             fix1             = _mm256_add_pd(fix1,tx);
523             fiy1             = _mm256_add_pd(fiy1,ty);
524             fiz1             = _mm256_add_pd(fiz1,tz);
525
526             fjx3             = _mm256_add_pd(fjx3,tx);
527             fjy3             = _mm256_add_pd(fjy3,ty);
528             fjz3             = _mm256_add_pd(fjz3,tz);
529
530             }
531
532             /**************************
533              * CALCULATE INTERACTIONS *
534              **************************/
535
536             if (gmx_mm256_any_lt(rsq21,rcutoff2))
537             {
538
539             r21              = _mm256_mul_pd(rsq21,rinv21);
540
541             /* EWALD ELECTROSTATICS */
542
543             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
544             ewrt             = _mm256_mul_pd(r21,ewtabscale);
545             ewitab           = _mm256_cvttpd_epi32(ewrt);
546             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
547             ewitab           = _mm_slli_epi32(ewitab,2);
548             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
549             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
550             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
551             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
552             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
553             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
554             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
555             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
556             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
557
558             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
559
560             /* Update potential sum for this i atom from the interaction with this j atom. */
561             velec            = _mm256_and_pd(velec,cutoff_mask);
562             velecsum         = _mm256_add_pd(velecsum,velec);
563
564             fscal            = felec;
565
566             fscal            = _mm256_and_pd(fscal,cutoff_mask);
567
568             /* Calculate temporary vectorial force */
569             tx               = _mm256_mul_pd(fscal,dx21);
570             ty               = _mm256_mul_pd(fscal,dy21);
571             tz               = _mm256_mul_pd(fscal,dz21);
572
573             /* Update vectorial force */
574             fix2             = _mm256_add_pd(fix2,tx);
575             fiy2             = _mm256_add_pd(fiy2,ty);
576             fiz2             = _mm256_add_pd(fiz2,tz);
577
578             fjx1             = _mm256_add_pd(fjx1,tx);
579             fjy1             = _mm256_add_pd(fjy1,ty);
580             fjz1             = _mm256_add_pd(fjz1,tz);
581
582             }
583
584             /**************************
585              * CALCULATE INTERACTIONS *
586              **************************/
587
588             if (gmx_mm256_any_lt(rsq22,rcutoff2))
589             {
590
591             r22              = _mm256_mul_pd(rsq22,rinv22);
592
593             /* EWALD ELECTROSTATICS */
594
595             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
596             ewrt             = _mm256_mul_pd(r22,ewtabscale);
597             ewitab           = _mm256_cvttpd_epi32(ewrt);
598             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
599             ewitab           = _mm_slli_epi32(ewitab,2);
600             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
601             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
602             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
603             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
604             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
605             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
606             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
607             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
608             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
609
610             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
611
612             /* Update potential sum for this i atom from the interaction with this j atom. */
613             velec            = _mm256_and_pd(velec,cutoff_mask);
614             velecsum         = _mm256_add_pd(velecsum,velec);
615
616             fscal            = felec;
617
618             fscal            = _mm256_and_pd(fscal,cutoff_mask);
619
620             /* Calculate temporary vectorial force */
621             tx               = _mm256_mul_pd(fscal,dx22);
622             ty               = _mm256_mul_pd(fscal,dy22);
623             tz               = _mm256_mul_pd(fscal,dz22);
624
625             /* Update vectorial force */
626             fix2             = _mm256_add_pd(fix2,tx);
627             fiy2             = _mm256_add_pd(fiy2,ty);
628             fiz2             = _mm256_add_pd(fiz2,tz);
629
630             fjx2             = _mm256_add_pd(fjx2,tx);
631             fjy2             = _mm256_add_pd(fjy2,ty);
632             fjz2             = _mm256_add_pd(fjz2,tz);
633
634             }
635
636             /**************************
637              * CALCULATE INTERACTIONS *
638              **************************/
639
640             if (gmx_mm256_any_lt(rsq23,rcutoff2))
641             {
642
643             r23              = _mm256_mul_pd(rsq23,rinv23);
644
645             /* EWALD ELECTROSTATICS */
646
647             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
648             ewrt             = _mm256_mul_pd(r23,ewtabscale);
649             ewitab           = _mm256_cvttpd_epi32(ewrt);
650             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
651             ewitab           = _mm_slli_epi32(ewitab,2);
652             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
653             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
654             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
655             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
656             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
657             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
658             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
659             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_sub_pd(rinv23,sh_ewald),velec));
660             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
661
662             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
663
664             /* Update potential sum for this i atom from the interaction with this j atom. */
665             velec            = _mm256_and_pd(velec,cutoff_mask);
666             velecsum         = _mm256_add_pd(velecsum,velec);
667
668             fscal            = felec;
669
670             fscal            = _mm256_and_pd(fscal,cutoff_mask);
671
672             /* Calculate temporary vectorial force */
673             tx               = _mm256_mul_pd(fscal,dx23);
674             ty               = _mm256_mul_pd(fscal,dy23);
675             tz               = _mm256_mul_pd(fscal,dz23);
676
677             /* Update vectorial force */
678             fix2             = _mm256_add_pd(fix2,tx);
679             fiy2             = _mm256_add_pd(fiy2,ty);
680             fiz2             = _mm256_add_pd(fiz2,tz);
681
682             fjx3             = _mm256_add_pd(fjx3,tx);
683             fjy3             = _mm256_add_pd(fjy3,ty);
684             fjz3             = _mm256_add_pd(fjz3,tz);
685
686             }
687
688             /**************************
689              * CALCULATE INTERACTIONS *
690              **************************/
691
692             if (gmx_mm256_any_lt(rsq31,rcutoff2))
693             {
694
695             r31              = _mm256_mul_pd(rsq31,rinv31);
696
697             /* EWALD ELECTROSTATICS */
698
699             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
700             ewrt             = _mm256_mul_pd(r31,ewtabscale);
701             ewitab           = _mm256_cvttpd_epi32(ewrt);
702             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
703             ewitab           = _mm_slli_epi32(ewitab,2);
704             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
705             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
706             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
707             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
708             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
709             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
710             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
711             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_sub_pd(rinv31,sh_ewald),velec));
712             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
713
714             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
715
716             /* Update potential sum for this i atom from the interaction with this j atom. */
717             velec            = _mm256_and_pd(velec,cutoff_mask);
718             velecsum         = _mm256_add_pd(velecsum,velec);
719
720             fscal            = felec;
721
722             fscal            = _mm256_and_pd(fscal,cutoff_mask);
723
724             /* Calculate temporary vectorial force */
725             tx               = _mm256_mul_pd(fscal,dx31);
726             ty               = _mm256_mul_pd(fscal,dy31);
727             tz               = _mm256_mul_pd(fscal,dz31);
728
729             /* Update vectorial force */
730             fix3             = _mm256_add_pd(fix3,tx);
731             fiy3             = _mm256_add_pd(fiy3,ty);
732             fiz3             = _mm256_add_pd(fiz3,tz);
733
734             fjx1             = _mm256_add_pd(fjx1,tx);
735             fjy1             = _mm256_add_pd(fjy1,ty);
736             fjz1             = _mm256_add_pd(fjz1,tz);
737
738             }
739
740             /**************************
741              * CALCULATE INTERACTIONS *
742              **************************/
743
744             if (gmx_mm256_any_lt(rsq32,rcutoff2))
745             {
746
747             r32              = _mm256_mul_pd(rsq32,rinv32);
748
749             /* EWALD ELECTROSTATICS */
750
751             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
752             ewrt             = _mm256_mul_pd(r32,ewtabscale);
753             ewitab           = _mm256_cvttpd_epi32(ewrt);
754             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
755             ewitab           = _mm_slli_epi32(ewitab,2);
756             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
757             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
758             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
759             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
760             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
761             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
762             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
763             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_sub_pd(rinv32,sh_ewald),velec));
764             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
765
766             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
767
768             /* Update potential sum for this i atom from the interaction with this j atom. */
769             velec            = _mm256_and_pd(velec,cutoff_mask);
770             velecsum         = _mm256_add_pd(velecsum,velec);
771
772             fscal            = felec;
773
774             fscal            = _mm256_and_pd(fscal,cutoff_mask);
775
776             /* Calculate temporary vectorial force */
777             tx               = _mm256_mul_pd(fscal,dx32);
778             ty               = _mm256_mul_pd(fscal,dy32);
779             tz               = _mm256_mul_pd(fscal,dz32);
780
781             /* Update vectorial force */
782             fix3             = _mm256_add_pd(fix3,tx);
783             fiy3             = _mm256_add_pd(fiy3,ty);
784             fiz3             = _mm256_add_pd(fiz3,tz);
785
786             fjx2             = _mm256_add_pd(fjx2,tx);
787             fjy2             = _mm256_add_pd(fjy2,ty);
788             fjz2             = _mm256_add_pd(fjz2,tz);
789
790             }
791
792             /**************************
793              * CALCULATE INTERACTIONS *
794              **************************/
795
796             if (gmx_mm256_any_lt(rsq33,rcutoff2))
797             {
798
799             r33              = _mm256_mul_pd(rsq33,rinv33);
800
801             /* EWALD ELECTROSTATICS */
802
803             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
804             ewrt             = _mm256_mul_pd(r33,ewtabscale);
805             ewitab           = _mm256_cvttpd_epi32(ewrt);
806             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
807             ewitab           = _mm_slli_epi32(ewitab,2);
808             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
809             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
810             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
811             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
812             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
813             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
814             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
815             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_sub_pd(rinv33,sh_ewald),velec));
816             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
817
818             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
819
820             /* Update potential sum for this i atom from the interaction with this j atom. */
821             velec            = _mm256_and_pd(velec,cutoff_mask);
822             velecsum         = _mm256_add_pd(velecsum,velec);
823
824             fscal            = felec;
825
826             fscal            = _mm256_and_pd(fscal,cutoff_mask);
827
828             /* Calculate temporary vectorial force */
829             tx               = _mm256_mul_pd(fscal,dx33);
830             ty               = _mm256_mul_pd(fscal,dy33);
831             tz               = _mm256_mul_pd(fscal,dz33);
832
833             /* Update vectorial force */
834             fix3             = _mm256_add_pd(fix3,tx);
835             fiy3             = _mm256_add_pd(fiy3,ty);
836             fiz3             = _mm256_add_pd(fiz3,tz);
837
838             fjx3             = _mm256_add_pd(fjx3,tx);
839             fjy3             = _mm256_add_pd(fjy3,ty);
840             fjz3             = _mm256_add_pd(fjz3,tz);
841
842             }
843
844             fjptrA             = f+j_coord_offsetA;
845             fjptrB             = f+j_coord_offsetB;
846             fjptrC             = f+j_coord_offsetC;
847             fjptrD             = f+j_coord_offsetD;
848
849             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
850                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
851                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
852
853             /* Inner loop uses 458 flops */
854         }
855
856         if(jidx<j_index_end)
857         {
858
859             /* Get j neighbor index, and coordinate index */
860             jnrlistA         = jjnr[jidx];
861             jnrlistB         = jjnr[jidx+1];
862             jnrlistC         = jjnr[jidx+2];
863             jnrlistD         = jjnr[jidx+3];
864             /* Sign of each element will be negative for non-real atoms.
865              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
866              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
867              */
868             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
869
870             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
871             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
872             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
873
874             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
875             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
876             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
877             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
878             j_coord_offsetA  = DIM*jnrA;
879             j_coord_offsetB  = DIM*jnrB;
880             j_coord_offsetC  = DIM*jnrC;
881             j_coord_offsetD  = DIM*jnrD;
882
883             /* load j atom coordinates */
884             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
885                                                  x+j_coord_offsetC,x+j_coord_offsetD,
886                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
887                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
888
889             /* Calculate displacement vector */
890             dx00             = _mm256_sub_pd(ix0,jx0);
891             dy00             = _mm256_sub_pd(iy0,jy0);
892             dz00             = _mm256_sub_pd(iz0,jz0);
893             dx11             = _mm256_sub_pd(ix1,jx1);
894             dy11             = _mm256_sub_pd(iy1,jy1);
895             dz11             = _mm256_sub_pd(iz1,jz1);
896             dx12             = _mm256_sub_pd(ix1,jx2);
897             dy12             = _mm256_sub_pd(iy1,jy2);
898             dz12             = _mm256_sub_pd(iz1,jz2);
899             dx13             = _mm256_sub_pd(ix1,jx3);
900             dy13             = _mm256_sub_pd(iy1,jy3);
901             dz13             = _mm256_sub_pd(iz1,jz3);
902             dx21             = _mm256_sub_pd(ix2,jx1);
903             dy21             = _mm256_sub_pd(iy2,jy1);
904             dz21             = _mm256_sub_pd(iz2,jz1);
905             dx22             = _mm256_sub_pd(ix2,jx2);
906             dy22             = _mm256_sub_pd(iy2,jy2);
907             dz22             = _mm256_sub_pd(iz2,jz2);
908             dx23             = _mm256_sub_pd(ix2,jx3);
909             dy23             = _mm256_sub_pd(iy2,jy3);
910             dz23             = _mm256_sub_pd(iz2,jz3);
911             dx31             = _mm256_sub_pd(ix3,jx1);
912             dy31             = _mm256_sub_pd(iy3,jy1);
913             dz31             = _mm256_sub_pd(iz3,jz1);
914             dx32             = _mm256_sub_pd(ix3,jx2);
915             dy32             = _mm256_sub_pd(iy3,jy2);
916             dz32             = _mm256_sub_pd(iz3,jz2);
917             dx33             = _mm256_sub_pd(ix3,jx3);
918             dy33             = _mm256_sub_pd(iy3,jy3);
919             dz33             = _mm256_sub_pd(iz3,jz3);
920
921             /* Calculate squared distance and things based on it */
922             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
923             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
924             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
925             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
926             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
927             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
928             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
929             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
930             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
931             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
932
933             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
934             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
935             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
936             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
937             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
938             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
939             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
940             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
941             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
942
943             rinvsq00         = gmx_mm256_inv_pd(rsq00);
944             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
945             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
946             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
947             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
948             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
949             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
950             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
951             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
952             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
953
954             fjx0             = _mm256_setzero_pd();
955             fjy0             = _mm256_setzero_pd();
956             fjz0             = _mm256_setzero_pd();
957             fjx1             = _mm256_setzero_pd();
958             fjy1             = _mm256_setzero_pd();
959             fjz1             = _mm256_setzero_pd();
960             fjx2             = _mm256_setzero_pd();
961             fjy2             = _mm256_setzero_pd();
962             fjz2             = _mm256_setzero_pd();
963             fjx3             = _mm256_setzero_pd();
964             fjy3             = _mm256_setzero_pd();
965             fjz3             = _mm256_setzero_pd();
966
967             /**************************
968              * CALCULATE INTERACTIONS *
969              **************************/
970
971             if (gmx_mm256_any_lt(rsq00,rcutoff2))
972             {
973
974             /* LENNARD-JONES DISPERSION/REPULSION */
975
976             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
977             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
978             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
979             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
980                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
981             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
982
983             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
984
985             /* Update potential sum for this i atom from the interaction with this j atom. */
986             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
987             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
988             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
989
990             fscal            = fvdw;
991
992             fscal            = _mm256_and_pd(fscal,cutoff_mask);
993
994             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
995
996             /* Calculate temporary vectorial force */
997             tx               = _mm256_mul_pd(fscal,dx00);
998             ty               = _mm256_mul_pd(fscal,dy00);
999             tz               = _mm256_mul_pd(fscal,dz00);
1000
1001             /* Update vectorial force */
1002             fix0             = _mm256_add_pd(fix0,tx);
1003             fiy0             = _mm256_add_pd(fiy0,ty);
1004             fiz0             = _mm256_add_pd(fiz0,tz);
1005
1006             fjx0             = _mm256_add_pd(fjx0,tx);
1007             fjy0             = _mm256_add_pd(fjy0,ty);
1008             fjz0             = _mm256_add_pd(fjz0,tz);
1009
1010             }
1011
1012             /**************************
1013              * CALCULATE INTERACTIONS *
1014              **************************/
1015
1016             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1017             {
1018
1019             r11              = _mm256_mul_pd(rsq11,rinv11);
1020             r11              = _mm256_andnot_pd(dummy_mask,r11);
1021
1022             /* EWALD ELECTROSTATICS */
1023
1024             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1025             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1026             ewitab           = _mm256_cvttpd_epi32(ewrt);
1027             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1028             ewitab           = _mm_slli_epi32(ewitab,2);
1029             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1030             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1031             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1032             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1033             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1034             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1035             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1036             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
1037             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1038
1039             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1040
1041             /* Update potential sum for this i atom from the interaction with this j atom. */
1042             velec            = _mm256_and_pd(velec,cutoff_mask);
1043             velec            = _mm256_andnot_pd(dummy_mask,velec);
1044             velecsum         = _mm256_add_pd(velecsum,velec);
1045
1046             fscal            = felec;
1047
1048             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1049
1050             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1051
1052             /* Calculate temporary vectorial force */
1053             tx               = _mm256_mul_pd(fscal,dx11);
1054             ty               = _mm256_mul_pd(fscal,dy11);
1055             tz               = _mm256_mul_pd(fscal,dz11);
1056
1057             /* Update vectorial force */
1058             fix1             = _mm256_add_pd(fix1,tx);
1059             fiy1             = _mm256_add_pd(fiy1,ty);
1060             fiz1             = _mm256_add_pd(fiz1,tz);
1061
1062             fjx1             = _mm256_add_pd(fjx1,tx);
1063             fjy1             = _mm256_add_pd(fjy1,ty);
1064             fjz1             = _mm256_add_pd(fjz1,tz);
1065
1066             }
1067
1068             /**************************
1069              * CALCULATE INTERACTIONS *
1070              **************************/
1071
1072             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1073             {
1074
1075             r12              = _mm256_mul_pd(rsq12,rinv12);
1076             r12              = _mm256_andnot_pd(dummy_mask,r12);
1077
1078             /* EWALD ELECTROSTATICS */
1079
1080             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1081             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1082             ewitab           = _mm256_cvttpd_epi32(ewrt);
1083             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1084             ewitab           = _mm_slli_epi32(ewitab,2);
1085             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1086             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1087             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1088             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1089             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1090             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1091             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1092             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
1093             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1094
1095             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1096
1097             /* Update potential sum for this i atom from the interaction with this j atom. */
1098             velec            = _mm256_and_pd(velec,cutoff_mask);
1099             velec            = _mm256_andnot_pd(dummy_mask,velec);
1100             velecsum         = _mm256_add_pd(velecsum,velec);
1101
1102             fscal            = felec;
1103
1104             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1105
1106             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1107
1108             /* Calculate temporary vectorial force */
1109             tx               = _mm256_mul_pd(fscal,dx12);
1110             ty               = _mm256_mul_pd(fscal,dy12);
1111             tz               = _mm256_mul_pd(fscal,dz12);
1112
1113             /* Update vectorial force */
1114             fix1             = _mm256_add_pd(fix1,tx);
1115             fiy1             = _mm256_add_pd(fiy1,ty);
1116             fiz1             = _mm256_add_pd(fiz1,tz);
1117
1118             fjx2             = _mm256_add_pd(fjx2,tx);
1119             fjy2             = _mm256_add_pd(fjy2,ty);
1120             fjz2             = _mm256_add_pd(fjz2,tz);
1121
1122             }
1123
1124             /**************************
1125              * CALCULATE INTERACTIONS *
1126              **************************/
1127
1128             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1129             {
1130
1131             r13              = _mm256_mul_pd(rsq13,rinv13);
1132             r13              = _mm256_andnot_pd(dummy_mask,r13);
1133
1134             /* EWALD ELECTROSTATICS */
1135
1136             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1137             ewrt             = _mm256_mul_pd(r13,ewtabscale);
1138             ewitab           = _mm256_cvttpd_epi32(ewrt);
1139             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1140             ewitab           = _mm_slli_epi32(ewitab,2);
1141             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1142             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1143             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1144             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1145             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1146             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1147             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1148             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_sub_pd(rinv13,sh_ewald),velec));
1149             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1150
1151             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1152
1153             /* Update potential sum for this i atom from the interaction with this j atom. */
1154             velec            = _mm256_and_pd(velec,cutoff_mask);
1155             velec            = _mm256_andnot_pd(dummy_mask,velec);
1156             velecsum         = _mm256_add_pd(velecsum,velec);
1157
1158             fscal            = felec;
1159
1160             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1161
1162             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1163
1164             /* Calculate temporary vectorial force */
1165             tx               = _mm256_mul_pd(fscal,dx13);
1166             ty               = _mm256_mul_pd(fscal,dy13);
1167             tz               = _mm256_mul_pd(fscal,dz13);
1168
1169             /* Update vectorial force */
1170             fix1             = _mm256_add_pd(fix1,tx);
1171             fiy1             = _mm256_add_pd(fiy1,ty);
1172             fiz1             = _mm256_add_pd(fiz1,tz);
1173
1174             fjx3             = _mm256_add_pd(fjx3,tx);
1175             fjy3             = _mm256_add_pd(fjy3,ty);
1176             fjz3             = _mm256_add_pd(fjz3,tz);
1177
1178             }
1179
1180             /**************************
1181              * CALCULATE INTERACTIONS *
1182              **************************/
1183
1184             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1185             {
1186
1187             r21              = _mm256_mul_pd(rsq21,rinv21);
1188             r21              = _mm256_andnot_pd(dummy_mask,r21);
1189
1190             /* EWALD ELECTROSTATICS */
1191
1192             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1193             ewrt             = _mm256_mul_pd(r21,ewtabscale);
1194             ewitab           = _mm256_cvttpd_epi32(ewrt);
1195             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1196             ewitab           = _mm_slli_epi32(ewitab,2);
1197             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1198             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1199             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1200             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1201             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1202             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1203             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1204             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
1205             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1206
1207             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1208
1209             /* Update potential sum for this i atom from the interaction with this j atom. */
1210             velec            = _mm256_and_pd(velec,cutoff_mask);
1211             velec            = _mm256_andnot_pd(dummy_mask,velec);
1212             velecsum         = _mm256_add_pd(velecsum,velec);
1213
1214             fscal            = felec;
1215
1216             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1217
1218             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1219
1220             /* Calculate temporary vectorial force */
1221             tx               = _mm256_mul_pd(fscal,dx21);
1222             ty               = _mm256_mul_pd(fscal,dy21);
1223             tz               = _mm256_mul_pd(fscal,dz21);
1224
1225             /* Update vectorial force */
1226             fix2             = _mm256_add_pd(fix2,tx);
1227             fiy2             = _mm256_add_pd(fiy2,ty);
1228             fiz2             = _mm256_add_pd(fiz2,tz);
1229
1230             fjx1             = _mm256_add_pd(fjx1,tx);
1231             fjy1             = _mm256_add_pd(fjy1,ty);
1232             fjz1             = _mm256_add_pd(fjz1,tz);
1233
1234             }
1235
1236             /**************************
1237              * CALCULATE INTERACTIONS *
1238              **************************/
1239
1240             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1241             {
1242
1243             r22              = _mm256_mul_pd(rsq22,rinv22);
1244             r22              = _mm256_andnot_pd(dummy_mask,r22);
1245
1246             /* EWALD ELECTROSTATICS */
1247
1248             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1249             ewrt             = _mm256_mul_pd(r22,ewtabscale);
1250             ewitab           = _mm256_cvttpd_epi32(ewrt);
1251             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1252             ewitab           = _mm_slli_epi32(ewitab,2);
1253             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1254             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1255             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1256             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1257             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1258             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1259             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1260             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
1261             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1262
1263             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1264
1265             /* Update potential sum for this i atom from the interaction with this j atom. */
1266             velec            = _mm256_and_pd(velec,cutoff_mask);
1267             velec            = _mm256_andnot_pd(dummy_mask,velec);
1268             velecsum         = _mm256_add_pd(velecsum,velec);
1269
1270             fscal            = felec;
1271
1272             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1273
1274             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1275
1276             /* Calculate temporary vectorial force */
1277             tx               = _mm256_mul_pd(fscal,dx22);
1278             ty               = _mm256_mul_pd(fscal,dy22);
1279             tz               = _mm256_mul_pd(fscal,dz22);
1280
1281             /* Update vectorial force */
1282             fix2             = _mm256_add_pd(fix2,tx);
1283             fiy2             = _mm256_add_pd(fiy2,ty);
1284             fiz2             = _mm256_add_pd(fiz2,tz);
1285
1286             fjx2             = _mm256_add_pd(fjx2,tx);
1287             fjy2             = _mm256_add_pd(fjy2,ty);
1288             fjz2             = _mm256_add_pd(fjz2,tz);
1289
1290             }
1291
1292             /**************************
1293              * CALCULATE INTERACTIONS *
1294              **************************/
1295
1296             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1297             {
1298
1299             r23              = _mm256_mul_pd(rsq23,rinv23);
1300             r23              = _mm256_andnot_pd(dummy_mask,r23);
1301
1302             /* EWALD ELECTROSTATICS */
1303
1304             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1305             ewrt             = _mm256_mul_pd(r23,ewtabscale);
1306             ewitab           = _mm256_cvttpd_epi32(ewrt);
1307             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1308             ewitab           = _mm_slli_epi32(ewitab,2);
1309             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1310             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1311             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1312             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1313             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1314             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1315             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1316             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_sub_pd(rinv23,sh_ewald),velec));
1317             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1318
1319             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1320
1321             /* Update potential sum for this i atom from the interaction with this j atom. */
1322             velec            = _mm256_and_pd(velec,cutoff_mask);
1323             velec            = _mm256_andnot_pd(dummy_mask,velec);
1324             velecsum         = _mm256_add_pd(velecsum,velec);
1325
1326             fscal            = felec;
1327
1328             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1329
1330             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1331
1332             /* Calculate temporary vectorial force */
1333             tx               = _mm256_mul_pd(fscal,dx23);
1334             ty               = _mm256_mul_pd(fscal,dy23);
1335             tz               = _mm256_mul_pd(fscal,dz23);
1336
1337             /* Update vectorial force */
1338             fix2             = _mm256_add_pd(fix2,tx);
1339             fiy2             = _mm256_add_pd(fiy2,ty);
1340             fiz2             = _mm256_add_pd(fiz2,tz);
1341
1342             fjx3             = _mm256_add_pd(fjx3,tx);
1343             fjy3             = _mm256_add_pd(fjy3,ty);
1344             fjz3             = _mm256_add_pd(fjz3,tz);
1345
1346             }
1347
1348             /**************************
1349              * CALCULATE INTERACTIONS *
1350              **************************/
1351
1352             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1353             {
1354
1355             r31              = _mm256_mul_pd(rsq31,rinv31);
1356             r31              = _mm256_andnot_pd(dummy_mask,r31);
1357
1358             /* EWALD ELECTROSTATICS */
1359
1360             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1361             ewrt             = _mm256_mul_pd(r31,ewtabscale);
1362             ewitab           = _mm256_cvttpd_epi32(ewrt);
1363             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1364             ewitab           = _mm_slli_epi32(ewitab,2);
1365             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1366             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1367             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1368             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1369             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1370             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1371             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1372             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_sub_pd(rinv31,sh_ewald),velec));
1373             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1374
1375             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1376
1377             /* Update potential sum for this i atom from the interaction with this j atom. */
1378             velec            = _mm256_and_pd(velec,cutoff_mask);
1379             velec            = _mm256_andnot_pd(dummy_mask,velec);
1380             velecsum         = _mm256_add_pd(velecsum,velec);
1381
1382             fscal            = felec;
1383
1384             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1385
1386             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1387
1388             /* Calculate temporary vectorial force */
1389             tx               = _mm256_mul_pd(fscal,dx31);
1390             ty               = _mm256_mul_pd(fscal,dy31);
1391             tz               = _mm256_mul_pd(fscal,dz31);
1392
1393             /* Update vectorial force */
1394             fix3             = _mm256_add_pd(fix3,tx);
1395             fiy3             = _mm256_add_pd(fiy3,ty);
1396             fiz3             = _mm256_add_pd(fiz3,tz);
1397
1398             fjx1             = _mm256_add_pd(fjx1,tx);
1399             fjy1             = _mm256_add_pd(fjy1,ty);
1400             fjz1             = _mm256_add_pd(fjz1,tz);
1401
1402             }
1403
1404             /**************************
1405              * CALCULATE INTERACTIONS *
1406              **************************/
1407
1408             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1409             {
1410
1411             r32              = _mm256_mul_pd(rsq32,rinv32);
1412             r32              = _mm256_andnot_pd(dummy_mask,r32);
1413
1414             /* EWALD ELECTROSTATICS */
1415
1416             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1417             ewrt             = _mm256_mul_pd(r32,ewtabscale);
1418             ewitab           = _mm256_cvttpd_epi32(ewrt);
1419             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1420             ewitab           = _mm_slli_epi32(ewitab,2);
1421             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1422             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1423             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1424             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1425             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1426             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1427             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1428             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_sub_pd(rinv32,sh_ewald),velec));
1429             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1430
1431             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1432
1433             /* Update potential sum for this i atom from the interaction with this j atom. */
1434             velec            = _mm256_and_pd(velec,cutoff_mask);
1435             velec            = _mm256_andnot_pd(dummy_mask,velec);
1436             velecsum         = _mm256_add_pd(velecsum,velec);
1437
1438             fscal            = felec;
1439
1440             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1441
1442             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1443
1444             /* Calculate temporary vectorial force */
1445             tx               = _mm256_mul_pd(fscal,dx32);
1446             ty               = _mm256_mul_pd(fscal,dy32);
1447             tz               = _mm256_mul_pd(fscal,dz32);
1448
1449             /* Update vectorial force */
1450             fix3             = _mm256_add_pd(fix3,tx);
1451             fiy3             = _mm256_add_pd(fiy3,ty);
1452             fiz3             = _mm256_add_pd(fiz3,tz);
1453
1454             fjx2             = _mm256_add_pd(fjx2,tx);
1455             fjy2             = _mm256_add_pd(fjy2,ty);
1456             fjz2             = _mm256_add_pd(fjz2,tz);
1457
1458             }
1459
1460             /**************************
1461              * CALCULATE INTERACTIONS *
1462              **************************/
1463
1464             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1465             {
1466
1467             r33              = _mm256_mul_pd(rsq33,rinv33);
1468             r33              = _mm256_andnot_pd(dummy_mask,r33);
1469
1470             /* EWALD ELECTROSTATICS */
1471
1472             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1473             ewrt             = _mm256_mul_pd(r33,ewtabscale);
1474             ewitab           = _mm256_cvttpd_epi32(ewrt);
1475             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1476             ewitab           = _mm_slli_epi32(ewitab,2);
1477             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1478             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1479             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1480             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1481             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1482             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1483             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1484             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_sub_pd(rinv33,sh_ewald),velec));
1485             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1486
1487             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1488
1489             /* Update potential sum for this i atom from the interaction with this j atom. */
1490             velec            = _mm256_and_pd(velec,cutoff_mask);
1491             velec            = _mm256_andnot_pd(dummy_mask,velec);
1492             velecsum         = _mm256_add_pd(velecsum,velec);
1493
1494             fscal            = felec;
1495
1496             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1497
1498             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1499
1500             /* Calculate temporary vectorial force */
1501             tx               = _mm256_mul_pd(fscal,dx33);
1502             ty               = _mm256_mul_pd(fscal,dy33);
1503             tz               = _mm256_mul_pd(fscal,dz33);
1504
1505             /* Update vectorial force */
1506             fix3             = _mm256_add_pd(fix3,tx);
1507             fiy3             = _mm256_add_pd(fiy3,ty);
1508             fiz3             = _mm256_add_pd(fiz3,tz);
1509
1510             fjx3             = _mm256_add_pd(fjx3,tx);
1511             fjy3             = _mm256_add_pd(fjy3,ty);
1512             fjz3             = _mm256_add_pd(fjz3,tz);
1513
1514             }
1515
1516             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1517             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1518             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1519             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1520
1521             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1522                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1523                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1524
1525             /* Inner loop uses 467 flops */
1526         }
1527
1528         /* End of innermost loop */
1529
1530         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1531                                                  f+i_coord_offset,fshift+i_shift_offset);
1532
1533         ggid                        = gid[iidx];
1534         /* Update potential energies */
1535         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1536         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1537
1538         /* Increment number of inner iterations */
1539         inneriter                  += j_index_end - j_index_start;
1540
1541         /* Outer loop uses 26 flops */
1542     }
1543
1544     /* Increment number of outer iterations */
1545     outeriter        += nri;
1546
1547     /* Update outer/inner flops */
1548
1549     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*467);
1550 }
1551 /*
1552  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_double
1553  * Electrostatics interaction: Ewald
1554  * VdW interaction:            LennardJones
1555  * Geometry:                   Water4-Water4
1556  * Calculate force/pot:        Force
1557  */
1558 void
1559 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_double
1560                     (t_nblist                    * gmx_restrict       nlist,
1561                      rvec                        * gmx_restrict          xx,
1562                      rvec                        * gmx_restrict          ff,
1563                      t_forcerec                  * gmx_restrict          fr,
1564                      t_mdatoms                   * gmx_restrict     mdatoms,
1565                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1566                      t_nrnb                      * gmx_restrict        nrnb)
1567 {
1568     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1569      * just 0 for non-waters.
1570      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1571      * jnr indices corresponding to data put in the four positions in the SIMD register.
1572      */
1573     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1574     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1575     int              jnrA,jnrB,jnrC,jnrD;
1576     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1577     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1578     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1579     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1580     real             rcutoff_scalar;
1581     real             *shiftvec,*fshift,*x,*f;
1582     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1583     real             scratch[4*DIM];
1584     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1585     real *           vdwioffsetptr0;
1586     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1587     real *           vdwioffsetptr1;
1588     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1589     real *           vdwioffsetptr2;
1590     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1591     real *           vdwioffsetptr3;
1592     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1593     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1594     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1595     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1596     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1597     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1598     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1599     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1600     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1601     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1602     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1603     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1604     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1605     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1606     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1607     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1608     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1609     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1610     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1611     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1612     real             *charge;
1613     int              nvdwtype;
1614     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1615     int              *vdwtype;
1616     real             *vdwparam;
1617     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1618     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1619     __m128i          ewitab;
1620     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1621     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1622     real             *ewtab;
1623     __m256d          dummy_mask,cutoff_mask;
1624     __m128           tmpmask0,tmpmask1;
1625     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1626     __m256d          one     = _mm256_set1_pd(1.0);
1627     __m256d          two     = _mm256_set1_pd(2.0);
1628     x                = xx[0];
1629     f                = ff[0];
1630
1631     nri              = nlist->nri;
1632     iinr             = nlist->iinr;
1633     jindex           = nlist->jindex;
1634     jjnr             = nlist->jjnr;
1635     shiftidx         = nlist->shift;
1636     gid              = nlist->gid;
1637     shiftvec         = fr->shift_vec[0];
1638     fshift           = fr->fshift[0];
1639     facel            = _mm256_set1_pd(fr->epsfac);
1640     charge           = mdatoms->chargeA;
1641     nvdwtype         = fr->ntype;
1642     vdwparam         = fr->nbfp;
1643     vdwtype          = mdatoms->typeA;
1644
1645     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
1646     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1647     beta2            = _mm256_mul_pd(beta,beta);
1648     beta3            = _mm256_mul_pd(beta,beta2);
1649
1650     ewtab            = fr->ic->tabq_coul_F;
1651     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
1652     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1653
1654     /* Setup water-specific parameters */
1655     inr              = nlist->iinr[0];
1656     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1657     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1658     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1659     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1660
1661     jq1              = _mm256_set1_pd(charge[inr+1]);
1662     jq2              = _mm256_set1_pd(charge[inr+2]);
1663     jq3              = _mm256_set1_pd(charge[inr+3]);
1664     vdwjidx0A        = 2*vdwtype[inr+0];
1665     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1666     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1667     qq11             = _mm256_mul_pd(iq1,jq1);
1668     qq12             = _mm256_mul_pd(iq1,jq2);
1669     qq13             = _mm256_mul_pd(iq1,jq3);
1670     qq21             = _mm256_mul_pd(iq2,jq1);
1671     qq22             = _mm256_mul_pd(iq2,jq2);
1672     qq23             = _mm256_mul_pd(iq2,jq3);
1673     qq31             = _mm256_mul_pd(iq3,jq1);
1674     qq32             = _mm256_mul_pd(iq3,jq2);
1675     qq33             = _mm256_mul_pd(iq3,jq3);
1676
1677     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1678     rcutoff_scalar   = fr->rcoulomb;
1679     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1680     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1681
1682     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
1683     rvdw             = _mm256_set1_pd(fr->rvdw);
1684
1685     /* Avoid stupid compiler warnings */
1686     jnrA = jnrB = jnrC = jnrD = 0;
1687     j_coord_offsetA = 0;
1688     j_coord_offsetB = 0;
1689     j_coord_offsetC = 0;
1690     j_coord_offsetD = 0;
1691
1692     outeriter        = 0;
1693     inneriter        = 0;
1694
1695     for(iidx=0;iidx<4*DIM;iidx++)
1696     {
1697         scratch[iidx] = 0.0;
1698     }
1699
1700     /* Start outer loop over neighborlists */
1701     for(iidx=0; iidx<nri; iidx++)
1702     {
1703         /* Load shift vector for this list */
1704         i_shift_offset   = DIM*shiftidx[iidx];
1705
1706         /* Load limits for loop over neighbors */
1707         j_index_start    = jindex[iidx];
1708         j_index_end      = jindex[iidx+1];
1709
1710         /* Get outer coordinate index */
1711         inr              = iinr[iidx];
1712         i_coord_offset   = DIM*inr;
1713
1714         /* Load i particle coords and add shift vector */
1715         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1716                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1717
1718         fix0             = _mm256_setzero_pd();
1719         fiy0             = _mm256_setzero_pd();
1720         fiz0             = _mm256_setzero_pd();
1721         fix1             = _mm256_setzero_pd();
1722         fiy1             = _mm256_setzero_pd();
1723         fiz1             = _mm256_setzero_pd();
1724         fix2             = _mm256_setzero_pd();
1725         fiy2             = _mm256_setzero_pd();
1726         fiz2             = _mm256_setzero_pd();
1727         fix3             = _mm256_setzero_pd();
1728         fiy3             = _mm256_setzero_pd();
1729         fiz3             = _mm256_setzero_pd();
1730
1731         /* Start inner kernel loop */
1732         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1733         {
1734
1735             /* Get j neighbor index, and coordinate index */
1736             jnrA             = jjnr[jidx];
1737             jnrB             = jjnr[jidx+1];
1738             jnrC             = jjnr[jidx+2];
1739             jnrD             = jjnr[jidx+3];
1740             j_coord_offsetA  = DIM*jnrA;
1741             j_coord_offsetB  = DIM*jnrB;
1742             j_coord_offsetC  = DIM*jnrC;
1743             j_coord_offsetD  = DIM*jnrD;
1744
1745             /* load j atom coordinates */
1746             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1747                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1748                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1749                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1750
1751             /* Calculate displacement vector */
1752             dx00             = _mm256_sub_pd(ix0,jx0);
1753             dy00             = _mm256_sub_pd(iy0,jy0);
1754             dz00             = _mm256_sub_pd(iz0,jz0);
1755             dx11             = _mm256_sub_pd(ix1,jx1);
1756             dy11             = _mm256_sub_pd(iy1,jy1);
1757             dz11             = _mm256_sub_pd(iz1,jz1);
1758             dx12             = _mm256_sub_pd(ix1,jx2);
1759             dy12             = _mm256_sub_pd(iy1,jy2);
1760             dz12             = _mm256_sub_pd(iz1,jz2);
1761             dx13             = _mm256_sub_pd(ix1,jx3);
1762             dy13             = _mm256_sub_pd(iy1,jy3);
1763             dz13             = _mm256_sub_pd(iz1,jz3);
1764             dx21             = _mm256_sub_pd(ix2,jx1);
1765             dy21             = _mm256_sub_pd(iy2,jy1);
1766             dz21             = _mm256_sub_pd(iz2,jz1);
1767             dx22             = _mm256_sub_pd(ix2,jx2);
1768             dy22             = _mm256_sub_pd(iy2,jy2);
1769             dz22             = _mm256_sub_pd(iz2,jz2);
1770             dx23             = _mm256_sub_pd(ix2,jx3);
1771             dy23             = _mm256_sub_pd(iy2,jy3);
1772             dz23             = _mm256_sub_pd(iz2,jz3);
1773             dx31             = _mm256_sub_pd(ix3,jx1);
1774             dy31             = _mm256_sub_pd(iy3,jy1);
1775             dz31             = _mm256_sub_pd(iz3,jz1);
1776             dx32             = _mm256_sub_pd(ix3,jx2);
1777             dy32             = _mm256_sub_pd(iy3,jy2);
1778             dz32             = _mm256_sub_pd(iz3,jz2);
1779             dx33             = _mm256_sub_pd(ix3,jx3);
1780             dy33             = _mm256_sub_pd(iy3,jy3);
1781             dz33             = _mm256_sub_pd(iz3,jz3);
1782
1783             /* Calculate squared distance and things based on it */
1784             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1785             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1786             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1787             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1788             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1789             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1790             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1791             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1792             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1793             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1794
1795             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1796             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1797             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
1798             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1799             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1800             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
1801             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
1802             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
1803             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
1804
1805             rinvsq00         = gmx_mm256_inv_pd(rsq00);
1806             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1807             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1808             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1809             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1810             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1811             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1812             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1813             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1814             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
1815
1816             fjx0             = _mm256_setzero_pd();
1817             fjy0             = _mm256_setzero_pd();
1818             fjz0             = _mm256_setzero_pd();
1819             fjx1             = _mm256_setzero_pd();
1820             fjy1             = _mm256_setzero_pd();
1821             fjz1             = _mm256_setzero_pd();
1822             fjx2             = _mm256_setzero_pd();
1823             fjy2             = _mm256_setzero_pd();
1824             fjz2             = _mm256_setzero_pd();
1825             fjx3             = _mm256_setzero_pd();
1826             fjy3             = _mm256_setzero_pd();
1827             fjz3             = _mm256_setzero_pd();
1828
1829             /**************************
1830              * CALCULATE INTERACTIONS *
1831              **************************/
1832
1833             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1834             {
1835
1836             /* LENNARD-JONES DISPERSION/REPULSION */
1837
1838             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1839             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1840
1841             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1842
1843             fscal            = fvdw;
1844
1845             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1846
1847             /* Calculate temporary vectorial force */
1848             tx               = _mm256_mul_pd(fscal,dx00);
1849             ty               = _mm256_mul_pd(fscal,dy00);
1850             tz               = _mm256_mul_pd(fscal,dz00);
1851
1852             /* Update vectorial force */
1853             fix0             = _mm256_add_pd(fix0,tx);
1854             fiy0             = _mm256_add_pd(fiy0,ty);
1855             fiz0             = _mm256_add_pd(fiz0,tz);
1856
1857             fjx0             = _mm256_add_pd(fjx0,tx);
1858             fjy0             = _mm256_add_pd(fjy0,ty);
1859             fjz0             = _mm256_add_pd(fjz0,tz);
1860
1861             }
1862
1863             /**************************
1864              * CALCULATE INTERACTIONS *
1865              **************************/
1866
1867             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1868             {
1869
1870             r11              = _mm256_mul_pd(rsq11,rinv11);
1871
1872             /* EWALD ELECTROSTATICS */
1873
1874             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1875             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1876             ewitab           = _mm256_cvttpd_epi32(ewrt);
1877             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1878             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1879                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1880                                             &ewtabF,&ewtabFn);
1881             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1882             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1883
1884             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1885
1886             fscal            = felec;
1887
1888             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1889
1890             /* Calculate temporary vectorial force */
1891             tx               = _mm256_mul_pd(fscal,dx11);
1892             ty               = _mm256_mul_pd(fscal,dy11);
1893             tz               = _mm256_mul_pd(fscal,dz11);
1894
1895             /* Update vectorial force */
1896             fix1             = _mm256_add_pd(fix1,tx);
1897             fiy1             = _mm256_add_pd(fiy1,ty);
1898             fiz1             = _mm256_add_pd(fiz1,tz);
1899
1900             fjx1             = _mm256_add_pd(fjx1,tx);
1901             fjy1             = _mm256_add_pd(fjy1,ty);
1902             fjz1             = _mm256_add_pd(fjz1,tz);
1903
1904             }
1905
1906             /**************************
1907              * CALCULATE INTERACTIONS *
1908              **************************/
1909
1910             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1911             {
1912
1913             r12              = _mm256_mul_pd(rsq12,rinv12);
1914
1915             /* EWALD ELECTROSTATICS */
1916
1917             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1918             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1919             ewitab           = _mm256_cvttpd_epi32(ewrt);
1920             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1921             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1922                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1923                                             &ewtabF,&ewtabFn);
1924             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1925             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1926
1927             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1928
1929             fscal            = felec;
1930
1931             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1932
1933             /* Calculate temporary vectorial force */
1934             tx               = _mm256_mul_pd(fscal,dx12);
1935             ty               = _mm256_mul_pd(fscal,dy12);
1936             tz               = _mm256_mul_pd(fscal,dz12);
1937
1938             /* Update vectorial force */
1939             fix1             = _mm256_add_pd(fix1,tx);
1940             fiy1             = _mm256_add_pd(fiy1,ty);
1941             fiz1             = _mm256_add_pd(fiz1,tz);
1942
1943             fjx2             = _mm256_add_pd(fjx2,tx);
1944             fjy2             = _mm256_add_pd(fjy2,ty);
1945             fjz2             = _mm256_add_pd(fjz2,tz);
1946
1947             }
1948
1949             /**************************
1950              * CALCULATE INTERACTIONS *
1951              **************************/
1952
1953             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1954             {
1955
1956             r13              = _mm256_mul_pd(rsq13,rinv13);
1957
1958             /* EWALD ELECTROSTATICS */
1959
1960             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1961             ewrt             = _mm256_mul_pd(r13,ewtabscale);
1962             ewitab           = _mm256_cvttpd_epi32(ewrt);
1963             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1964             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1965                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1966                                             &ewtabF,&ewtabFn);
1967             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1968             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1969
1970             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1971
1972             fscal            = felec;
1973
1974             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1975
1976             /* Calculate temporary vectorial force */
1977             tx               = _mm256_mul_pd(fscal,dx13);
1978             ty               = _mm256_mul_pd(fscal,dy13);
1979             tz               = _mm256_mul_pd(fscal,dz13);
1980
1981             /* Update vectorial force */
1982             fix1             = _mm256_add_pd(fix1,tx);
1983             fiy1             = _mm256_add_pd(fiy1,ty);
1984             fiz1             = _mm256_add_pd(fiz1,tz);
1985
1986             fjx3             = _mm256_add_pd(fjx3,tx);
1987             fjy3             = _mm256_add_pd(fjy3,ty);
1988             fjz3             = _mm256_add_pd(fjz3,tz);
1989
1990             }
1991
1992             /**************************
1993              * CALCULATE INTERACTIONS *
1994              **************************/
1995
1996             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1997             {
1998
1999             r21              = _mm256_mul_pd(rsq21,rinv21);
2000
2001             /* EWALD ELECTROSTATICS */
2002
2003             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2004             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2005             ewitab           = _mm256_cvttpd_epi32(ewrt);
2006             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2007             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2008                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2009                                             &ewtabF,&ewtabFn);
2010             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2011             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2012
2013             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2014
2015             fscal            = felec;
2016
2017             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2018
2019             /* Calculate temporary vectorial force */
2020             tx               = _mm256_mul_pd(fscal,dx21);
2021             ty               = _mm256_mul_pd(fscal,dy21);
2022             tz               = _mm256_mul_pd(fscal,dz21);
2023
2024             /* Update vectorial force */
2025             fix2             = _mm256_add_pd(fix2,tx);
2026             fiy2             = _mm256_add_pd(fiy2,ty);
2027             fiz2             = _mm256_add_pd(fiz2,tz);
2028
2029             fjx1             = _mm256_add_pd(fjx1,tx);
2030             fjy1             = _mm256_add_pd(fjy1,ty);
2031             fjz1             = _mm256_add_pd(fjz1,tz);
2032
2033             }
2034
2035             /**************************
2036              * CALCULATE INTERACTIONS *
2037              **************************/
2038
2039             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2040             {
2041
2042             r22              = _mm256_mul_pd(rsq22,rinv22);
2043
2044             /* EWALD ELECTROSTATICS */
2045
2046             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2047             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2048             ewitab           = _mm256_cvttpd_epi32(ewrt);
2049             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2050             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2051                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2052                                             &ewtabF,&ewtabFn);
2053             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2054             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2055
2056             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2057
2058             fscal            = felec;
2059
2060             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2061
2062             /* Calculate temporary vectorial force */
2063             tx               = _mm256_mul_pd(fscal,dx22);
2064             ty               = _mm256_mul_pd(fscal,dy22);
2065             tz               = _mm256_mul_pd(fscal,dz22);
2066
2067             /* Update vectorial force */
2068             fix2             = _mm256_add_pd(fix2,tx);
2069             fiy2             = _mm256_add_pd(fiy2,ty);
2070             fiz2             = _mm256_add_pd(fiz2,tz);
2071
2072             fjx2             = _mm256_add_pd(fjx2,tx);
2073             fjy2             = _mm256_add_pd(fjy2,ty);
2074             fjz2             = _mm256_add_pd(fjz2,tz);
2075
2076             }
2077
2078             /**************************
2079              * CALCULATE INTERACTIONS *
2080              **************************/
2081
2082             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2083             {
2084
2085             r23              = _mm256_mul_pd(rsq23,rinv23);
2086
2087             /* EWALD ELECTROSTATICS */
2088
2089             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2090             ewrt             = _mm256_mul_pd(r23,ewtabscale);
2091             ewitab           = _mm256_cvttpd_epi32(ewrt);
2092             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2093             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2094                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2095                                             &ewtabF,&ewtabFn);
2096             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2097             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2098
2099             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2100
2101             fscal            = felec;
2102
2103             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2104
2105             /* Calculate temporary vectorial force */
2106             tx               = _mm256_mul_pd(fscal,dx23);
2107             ty               = _mm256_mul_pd(fscal,dy23);
2108             tz               = _mm256_mul_pd(fscal,dz23);
2109
2110             /* Update vectorial force */
2111             fix2             = _mm256_add_pd(fix2,tx);
2112             fiy2             = _mm256_add_pd(fiy2,ty);
2113             fiz2             = _mm256_add_pd(fiz2,tz);
2114
2115             fjx3             = _mm256_add_pd(fjx3,tx);
2116             fjy3             = _mm256_add_pd(fjy3,ty);
2117             fjz3             = _mm256_add_pd(fjz3,tz);
2118
2119             }
2120
2121             /**************************
2122              * CALCULATE INTERACTIONS *
2123              **************************/
2124
2125             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2126             {
2127
2128             r31              = _mm256_mul_pd(rsq31,rinv31);
2129
2130             /* EWALD ELECTROSTATICS */
2131
2132             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2133             ewrt             = _mm256_mul_pd(r31,ewtabscale);
2134             ewitab           = _mm256_cvttpd_epi32(ewrt);
2135             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2136             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2137                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2138                                             &ewtabF,&ewtabFn);
2139             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2140             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2141
2142             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2143
2144             fscal            = felec;
2145
2146             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2147
2148             /* Calculate temporary vectorial force */
2149             tx               = _mm256_mul_pd(fscal,dx31);
2150             ty               = _mm256_mul_pd(fscal,dy31);
2151             tz               = _mm256_mul_pd(fscal,dz31);
2152
2153             /* Update vectorial force */
2154             fix3             = _mm256_add_pd(fix3,tx);
2155             fiy3             = _mm256_add_pd(fiy3,ty);
2156             fiz3             = _mm256_add_pd(fiz3,tz);
2157
2158             fjx1             = _mm256_add_pd(fjx1,tx);
2159             fjy1             = _mm256_add_pd(fjy1,ty);
2160             fjz1             = _mm256_add_pd(fjz1,tz);
2161
2162             }
2163
2164             /**************************
2165              * CALCULATE INTERACTIONS *
2166              **************************/
2167
2168             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2169             {
2170
2171             r32              = _mm256_mul_pd(rsq32,rinv32);
2172
2173             /* EWALD ELECTROSTATICS */
2174
2175             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2176             ewrt             = _mm256_mul_pd(r32,ewtabscale);
2177             ewitab           = _mm256_cvttpd_epi32(ewrt);
2178             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2179             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2180                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2181                                             &ewtabF,&ewtabFn);
2182             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2183             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2184
2185             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2186
2187             fscal            = felec;
2188
2189             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2190
2191             /* Calculate temporary vectorial force */
2192             tx               = _mm256_mul_pd(fscal,dx32);
2193             ty               = _mm256_mul_pd(fscal,dy32);
2194             tz               = _mm256_mul_pd(fscal,dz32);
2195
2196             /* Update vectorial force */
2197             fix3             = _mm256_add_pd(fix3,tx);
2198             fiy3             = _mm256_add_pd(fiy3,ty);
2199             fiz3             = _mm256_add_pd(fiz3,tz);
2200
2201             fjx2             = _mm256_add_pd(fjx2,tx);
2202             fjy2             = _mm256_add_pd(fjy2,ty);
2203             fjz2             = _mm256_add_pd(fjz2,tz);
2204
2205             }
2206
2207             /**************************
2208              * CALCULATE INTERACTIONS *
2209              **************************/
2210
2211             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2212             {
2213
2214             r33              = _mm256_mul_pd(rsq33,rinv33);
2215
2216             /* EWALD ELECTROSTATICS */
2217
2218             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2219             ewrt             = _mm256_mul_pd(r33,ewtabscale);
2220             ewitab           = _mm256_cvttpd_epi32(ewrt);
2221             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2222             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2223                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2224                                             &ewtabF,&ewtabFn);
2225             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2226             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2227
2228             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2229
2230             fscal            = felec;
2231
2232             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2233
2234             /* Calculate temporary vectorial force */
2235             tx               = _mm256_mul_pd(fscal,dx33);
2236             ty               = _mm256_mul_pd(fscal,dy33);
2237             tz               = _mm256_mul_pd(fscal,dz33);
2238
2239             /* Update vectorial force */
2240             fix3             = _mm256_add_pd(fix3,tx);
2241             fiy3             = _mm256_add_pd(fiy3,ty);
2242             fiz3             = _mm256_add_pd(fiz3,tz);
2243
2244             fjx3             = _mm256_add_pd(fjx3,tx);
2245             fjy3             = _mm256_add_pd(fjy3,ty);
2246             fjz3             = _mm256_add_pd(fjz3,tz);
2247
2248             }
2249
2250             fjptrA             = f+j_coord_offsetA;
2251             fjptrB             = f+j_coord_offsetB;
2252             fjptrC             = f+j_coord_offsetC;
2253             fjptrD             = f+j_coord_offsetD;
2254
2255             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2256                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2257                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2258
2259             /* Inner loop uses 384 flops */
2260         }
2261
2262         if(jidx<j_index_end)
2263         {
2264
2265             /* Get j neighbor index, and coordinate index */
2266             jnrlistA         = jjnr[jidx];
2267             jnrlistB         = jjnr[jidx+1];
2268             jnrlistC         = jjnr[jidx+2];
2269             jnrlistD         = jjnr[jidx+3];
2270             /* Sign of each element will be negative for non-real atoms.
2271              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2272              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2273              */
2274             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2275
2276             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2277             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2278             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2279
2280             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2281             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2282             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2283             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2284             j_coord_offsetA  = DIM*jnrA;
2285             j_coord_offsetB  = DIM*jnrB;
2286             j_coord_offsetC  = DIM*jnrC;
2287             j_coord_offsetD  = DIM*jnrD;
2288
2289             /* load j atom coordinates */
2290             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2291                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2292                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2293                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
2294
2295             /* Calculate displacement vector */
2296             dx00             = _mm256_sub_pd(ix0,jx0);
2297             dy00             = _mm256_sub_pd(iy0,jy0);
2298             dz00             = _mm256_sub_pd(iz0,jz0);
2299             dx11             = _mm256_sub_pd(ix1,jx1);
2300             dy11             = _mm256_sub_pd(iy1,jy1);
2301             dz11             = _mm256_sub_pd(iz1,jz1);
2302             dx12             = _mm256_sub_pd(ix1,jx2);
2303             dy12             = _mm256_sub_pd(iy1,jy2);
2304             dz12             = _mm256_sub_pd(iz1,jz2);
2305             dx13             = _mm256_sub_pd(ix1,jx3);
2306             dy13             = _mm256_sub_pd(iy1,jy3);
2307             dz13             = _mm256_sub_pd(iz1,jz3);
2308             dx21             = _mm256_sub_pd(ix2,jx1);
2309             dy21             = _mm256_sub_pd(iy2,jy1);
2310             dz21             = _mm256_sub_pd(iz2,jz1);
2311             dx22             = _mm256_sub_pd(ix2,jx2);
2312             dy22             = _mm256_sub_pd(iy2,jy2);
2313             dz22             = _mm256_sub_pd(iz2,jz2);
2314             dx23             = _mm256_sub_pd(ix2,jx3);
2315             dy23             = _mm256_sub_pd(iy2,jy3);
2316             dz23             = _mm256_sub_pd(iz2,jz3);
2317             dx31             = _mm256_sub_pd(ix3,jx1);
2318             dy31             = _mm256_sub_pd(iy3,jy1);
2319             dz31             = _mm256_sub_pd(iz3,jz1);
2320             dx32             = _mm256_sub_pd(ix3,jx2);
2321             dy32             = _mm256_sub_pd(iy3,jy2);
2322             dz32             = _mm256_sub_pd(iz3,jz2);
2323             dx33             = _mm256_sub_pd(ix3,jx3);
2324             dy33             = _mm256_sub_pd(iy3,jy3);
2325             dz33             = _mm256_sub_pd(iz3,jz3);
2326
2327             /* Calculate squared distance and things based on it */
2328             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2329             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2330             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2331             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2332             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2333             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2334             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2335             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2336             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2337             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2338
2339             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
2340             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
2341             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
2342             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
2343             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
2344             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
2345             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
2346             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
2347             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
2348
2349             rinvsq00         = gmx_mm256_inv_pd(rsq00);
2350             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
2351             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
2352             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
2353             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
2354             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
2355             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
2356             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
2357             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
2358             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
2359
2360             fjx0             = _mm256_setzero_pd();
2361             fjy0             = _mm256_setzero_pd();
2362             fjz0             = _mm256_setzero_pd();
2363             fjx1             = _mm256_setzero_pd();
2364             fjy1             = _mm256_setzero_pd();
2365             fjz1             = _mm256_setzero_pd();
2366             fjx2             = _mm256_setzero_pd();
2367             fjy2             = _mm256_setzero_pd();
2368             fjz2             = _mm256_setzero_pd();
2369             fjx3             = _mm256_setzero_pd();
2370             fjy3             = _mm256_setzero_pd();
2371             fjz3             = _mm256_setzero_pd();
2372
2373             /**************************
2374              * CALCULATE INTERACTIONS *
2375              **************************/
2376
2377             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2378             {
2379
2380             /* LENNARD-JONES DISPERSION/REPULSION */
2381
2382             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2383             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
2384
2385             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2386
2387             fscal            = fvdw;
2388
2389             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2390
2391             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2392
2393             /* Calculate temporary vectorial force */
2394             tx               = _mm256_mul_pd(fscal,dx00);
2395             ty               = _mm256_mul_pd(fscal,dy00);
2396             tz               = _mm256_mul_pd(fscal,dz00);
2397
2398             /* Update vectorial force */
2399             fix0             = _mm256_add_pd(fix0,tx);
2400             fiy0             = _mm256_add_pd(fiy0,ty);
2401             fiz0             = _mm256_add_pd(fiz0,tz);
2402
2403             fjx0             = _mm256_add_pd(fjx0,tx);
2404             fjy0             = _mm256_add_pd(fjy0,ty);
2405             fjz0             = _mm256_add_pd(fjz0,tz);
2406
2407             }
2408
2409             /**************************
2410              * CALCULATE INTERACTIONS *
2411              **************************/
2412
2413             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2414             {
2415
2416             r11              = _mm256_mul_pd(rsq11,rinv11);
2417             r11              = _mm256_andnot_pd(dummy_mask,r11);
2418
2419             /* EWALD ELECTROSTATICS */
2420
2421             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2422             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2423             ewitab           = _mm256_cvttpd_epi32(ewrt);
2424             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2425             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2426                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2427                                             &ewtabF,&ewtabFn);
2428             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2429             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2430
2431             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2432
2433             fscal            = felec;
2434
2435             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2436
2437             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2438
2439             /* Calculate temporary vectorial force */
2440             tx               = _mm256_mul_pd(fscal,dx11);
2441             ty               = _mm256_mul_pd(fscal,dy11);
2442             tz               = _mm256_mul_pd(fscal,dz11);
2443
2444             /* Update vectorial force */
2445             fix1             = _mm256_add_pd(fix1,tx);
2446             fiy1             = _mm256_add_pd(fiy1,ty);
2447             fiz1             = _mm256_add_pd(fiz1,tz);
2448
2449             fjx1             = _mm256_add_pd(fjx1,tx);
2450             fjy1             = _mm256_add_pd(fjy1,ty);
2451             fjz1             = _mm256_add_pd(fjz1,tz);
2452
2453             }
2454
2455             /**************************
2456              * CALCULATE INTERACTIONS *
2457              **************************/
2458
2459             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2460             {
2461
2462             r12              = _mm256_mul_pd(rsq12,rinv12);
2463             r12              = _mm256_andnot_pd(dummy_mask,r12);
2464
2465             /* EWALD ELECTROSTATICS */
2466
2467             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2468             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2469             ewitab           = _mm256_cvttpd_epi32(ewrt);
2470             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2471             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2472                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2473                                             &ewtabF,&ewtabFn);
2474             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2475             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2476
2477             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2478
2479             fscal            = felec;
2480
2481             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2482
2483             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2484
2485             /* Calculate temporary vectorial force */
2486             tx               = _mm256_mul_pd(fscal,dx12);
2487             ty               = _mm256_mul_pd(fscal,dy12);
2488             tz               = _mm256_mul_pd(fscal,dz12);
2489
2490             /* Update vectorial force */
2491             fix1             = _mm256_add_pd(fix1,tx);
2492             fiy1             = _mm256_add_pd(fiy1,ty);
2493             fiz1             = _mm256_add_pd(fiz1,tz);
2494
2495             fjx2             = _mm256_add_pd(fjx2,tx);
2496             fjy2             = _mm256_add_pd(fjy2,ty);
2497             fjz2             = _mm256_add_pd(fjz2,tz);
2498
2499             }
2500
2501             /**************************
2502              * CALCULATE INTERACTIONS *
2503              **************************/
2504
2505             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2506             {
2507
2508             r13              = _mm256_mul_pd(rsq13,rinv13);
2509             r13              = _mm256_andnot_pd(dummy_mask,r13);
2510
2511             /* EWALD ELECTROSTATICS */
2512
2513             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2514             ewrt             = _mm256_mul_pd(r13,ewtabscale);
2515             ewitab           = _mm256_cvttpd_epi32(ewrt);
2516             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2517             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2518                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2519                                             &ewtabF,&ewtabFn);
2520             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2521             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2522
2523             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2524
2525             fscal            = felec;
2526
2527             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2528
2529             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2530
2531             /* Calculate temporary vectorial force */
2532             tx               = _mm256_mul_pd(fscal,dx13);
2533             ty               = _mm256_mul_pd(fscal,dy13);
2534             tz               = _mm256_mul_pd(fscal,dz13);
2535
2536             /* Update vectorial force */
2537             fix1             = _mm256_add_pd(fix1,tx);
2538             fiy1             = _mm256_add_pd(fiy1,ty);
2539             fiz1             = _mm256_add_pd(fiz1,tz);
2540
2541             fjx3             = _mm256_add_pd(fjx3,tx);
2542             fjy3             = _mm256_add_pd(fjy3,ty);
2543             fjz3             = _mm256_add_pd(fjz3,tz);
2544
2545             }
2546
2547             /**************************
2548              * CALCULATE INTERACTIONS *
2549              **************************/
2550
2551             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2552             {
2553
2554             r21              = _mm256_mul_pd(rsq21,rinv21);
2555             r21              = _mm256_andnot_pd(dummy_mask,r21);
2556
2557             /* EWALD ELECTROSTATICS */
2558
2559             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2560             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2561             ewitab           = _mm256_cvttpd_epi32(ewrt);
2562             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2563             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2564                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2565                                             &ewtabF,&ewtabFn);
2566             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2567             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2568
2569             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2570
2571             fscal            = felec;
2572
2573             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2574
2575             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2576
2577             /* Calculate temporary vectorial force */
2578             tx               = _mm256_mul_pd(fscal,dx21);
2579             ty               = _mm256_mul_pd(fscal,dy21);
2580             tz               = _mm256_mul_pd(fscal,dz21);
2581
2582             /* Update vectorial force */
2583             fix2             = _mm256_add_pd(fix2,tx);
2584             fiy2             = _mm256_add_pd(fiy2,ty);
2585             fiz2             = _mm256_add_pd(fiz2,tz);
2586
2587             fjx1             = _mm256_add_pd(fjx1,tx);
2588             fjy1             = _mm256_add_pd(fjy1,ty);
2589             fjz1             = _mm256_add_pd(fjz1,tz);
2590
2591             }
2592
2593             /**************************
2594              * CALCULATE INTERACTIONS *
2595              **************************/
2596
2597             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2598             {
2599
2600             r22              = _mm256_mul_pd(rsq22,rinv22);
2601             r22              = _mm256_andnot_pd(dummy_mask,r22);
2602
2603             /* EWALD ELECTROSTATICS */
2604
2605             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2606             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2607             ewitab           = _mm256_cvttpd_epi32(ewrt);
2608             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2609             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2610                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2611                                             &ewtabF,&ewtabFn);
2612             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2613             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2614
2615             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2616
2617             fscal            = felec;
2618
2619             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2620
2621             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2622
2623             /* Calculate temporary vectorial force */
2624             tx               = _mm256_mul_pd(fscal,dx22);
2625             ty               = _mm256_mul_pd(fscal,dy22);
2626             tz               = _mm256_mul_pd(fscal,dz22);
2627
2628             /* Update vectorial force */
2629             fix2             = _mm256_add_pd(fix2,tx);
2630             fiy2             = _mm256_add_pd(fiy2,ty);
2631             fiz2             = _mm256_add_pd(fiz2,tz);
2632
2633             fjx2             = _mm256_add_pd(fjx2,tx);
2634             fjy2             = _mm256_add_pd(fjy2,ty);
2635             fjz2             = _mm256_add_pd(fjz2,tz);
2636
2637             }
2638
2639             /**************************
2640              * CALCULATE INTERACTIONS *
2641              **************************/
2642
2643             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2644             {
2645
2646             r23              = _mm256_mul_pd(rsq23,rinv23);
2647             r23              = _mm256_andnot_pd(dummy_mask,r23);
2648
2649             /* EWALD ELECTROSTATICS */
2650
2651             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2652             ewrt             = _mm256_mul_pd(r23,ewtabscale);
2653             ewitab           = _mm256_cvttpd_epi32(ewrt);
2654             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2655             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2656                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2657                                             &ewtabF,&ewtabFn);
2658             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2659             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2660
2661             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2662
2663             fscal            = felec;
2664
2665             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2666
2667             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2668
2669             /* Calculate temporary vectorial force */
2670             tx               = _mm256_mul_pd(fscal,dx23);
2671             ty               = _mm256_mul_pd(fscal,dy23);
2672             tz               = _mm256_mul_pd(fscal,dz23);
2673
2674             /* Update vectorial force */
2675             fix2             = _mm256_add_pd(fix2,tx);
2676             fiy2             = _mm256_add_pd(fiy2,ty);
2677             fiz2             = _mm256_add_pd(fiz2,tz);
2678
2679             fjx3             = _mm256_add_pd(fjx3,tx);
2680             fjy3             = _mm256_add_pd(fjy3,ty);
2681             fjz3             = _mm256_add_pd(fjz3,tz);
2682
2683             }
2684
2685             /**************************
2686              * CALCULATE INTERACTIONS *
2687              **************************/
2688
2689             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2690             {
2691
2692             r31              = _mm256_mul_pd(rsq31,rinv31);
2693             r31              = _mm256_andnot_pd(dummy_mask,r31);
2694
2695             /* EWALD ELECTROSTATICS */
2696
2697             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2698             ewrt             = _mm256_mul_pd(r31,ewtabscale);
2699             ewitab           = _mm256_cvttpd_epi32(ewrt);
2700             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2701             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2702                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2703                                             &ewtabF,&ewtabFn);
2704             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2705             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2706
2707             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2708
2709             fscal            = felec;
2710
2711             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2712
2713             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2714
2715             /* Calculate temporary vectorial force */
2716             tx               = _mm256_mul_pd(fscal,dx31);
2717             ty               = _mm256_mul_pd(fscal,dy31);
2718             tz               = _mm256_mul_pd(fscal,dz31);
2719
2720             /* Update vectorial force */
2721             fix3             = _mm256_add_pd(fix3,tx);
2722             fiy3             = _mm256_add_pd(fiy3,ty);
2723             fiz3             = _mm256_add_pd(fiz3,tz);
2724
2725             fjx1             = _mm256_add_pd(fjx1,tx);
2726             fjy1             = _mm256_add_pd(fjy1,ty);
2727             fjz1             = _mm256_add_pd(fjz1,tz);
2728
2729             }
2730
2731             /**************************
2732              * CALCULATE INTERACTIONS *
2733              **************************/
2734
2735             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2736             {
2737
2738             r32              = _mm256_mul_pd(rsq32,rinv32);
2739             r32              = _mm256_andnot_pd(dummy_mask,r32);
2740
2741             /* EWALD ELECTROSTATICS */
2742
2743             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2744             ewrt             = _mm256_mul_pd(r32,ewtabscale);
2745             ewitab           = _mm256_cvttpd_epi32(ewrt);
2746             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2747             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2748                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2749                                             &ewtabF,&ewtabFn);
2750             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2751             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2752
2753             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2754
2755             fscal            = felec;
2756
2757             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2758
2759             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2760
2761             /* Calculate temporary vectorial force */
2762             tx               = _mm256_mul_pd(fscal,dx32);
2763             ty               = _mm256_mul_pd(fscal,dy32);
2764             tz               = _mm256_mul_pd(fscal,dz32);
2765
2766             /* Update vectorial force */
2767             fix3             = _mm256_add_pd(fix3,tx);
2768             fiy3             = _mm256_add_pd(fiy3,ty);
2769             fiz3             = _mm256_add_pd(fiz3,tz);
2770
2771             fjx2             = _mm256_add_pd(fjx2,tx);
2772             fjy2             = _mm256_add_pd(fjy2,ty);
2773             fjz2             = _mm256_add_pd(fjz2,tz);
2774
2775             }
2776
2777             /**************************
2778              * CALCULATE INTERACTIONS *
2779              **************************/
2780
2781             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2782             {
2783
2784             r33              = _mm256_mul_pd(rsq33,rinv33);
2785             r33              = _mm256_andnot_pd(dummy_mask,r33);
2786
2787             /* EWALD ELECTROSTATICS */
2788
2789             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2790             ewrt             = _mm256_mul_pd(r33,ewtabscale);
2791             ewitab           = _mm256_cvttpd_epi32(ewrt);
2792             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2793             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2794                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2795                                             &ewtabF,&ewtabFn);
2796             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2797             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2798
2799             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2800
2801             fscal            = felec;
2802
2803             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2804
2805             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2806
2807             /* Calculate temporary vectorial force */
2808             tx               = _mm256_mul_pd(fscal,dx33);
2809             ty               = _mm256_mul_pd(fscal,dy33);
2810             tz               = _mm256_mul_pd(fscal,dz33);
2811
2812             /* Update vectorial force */
2813             fix3             = _mm256_add_pd(fix3,tx);
2814             fiy3             = _mm256_add_pd(fiy3,ty);
2815             fiz3             = _mm256_add_pd(fiz3,tz);
2816
2817             fjx3             = _mm256_add_pd(fjx3,tx);
2818             fjy3             = _mm256_add_pd(fjy3,ty);
2819             fjz3             = _mm256_add_pd(fjz3,tz);
2820
2821             }
2822
2823             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2824             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2825             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2826             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2827
2828             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2829                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2830                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2831
2832             /* Inner loop uses 393 flops */
2833         }
2834
2835         /* End of innermost loop */
2836
2837         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2838                                                  f+i_coord_offset,fshift+i_shift_offset);
2839
2840         /* Increment number of inner iterations */
2841         inneriter                  += j_index_end - j_index_start;
2842
2843         /* Outer loop uses 24 flops */
2844     }
2845
2846     /* Increment number of outer iterations */
2847     outeriter        += nri;
2848
2849     /* Update outer/inner flops */
2850
2851     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*393);
2852 }