src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_avx_256_double.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_double
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
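/*
 * Naming: "EwSw" = Ewald electrostatics with a potential-switch modifier,
 * "VdwLJSw" = Lennard-Jones with a potential-switch modifier, "GeomW4W4" =
 * both i and j particles are 4-site waters, and "VF" = potential and force
 * are both computed. The kernel is unrolled four ways over j waters, one per
 * lane of a 256-bit AVX double-precision register.
 */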
56 void
57 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     real *           vdwioffsetptr0;
84     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     real *           vdwioffsetptr1;
86     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     real *           vdwioffsetptr2;
88     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     real *           vdwioffsetptr3;
90     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
110     real             *charge;
111     int              nvdwtype;
112     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113     int              *vdwtype;
114     real             *vdwparam;
115     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
116     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
117     __m128i          ewitab;
118     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
119     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
120     real             *ewtab;
121     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
122     real             rswitch_scalar,d_scalar;
123     __m256d          dummy_mask,cutoff_mask;
124     __m128           tmpmask0,tmpmask1;
125     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
126     __m256d          one     = _mm256_set1_pd(1.0);
127     __m256d          two     = _mm256_set1_pd(2.0);
128     x                = xx[0];
129     f                = ff[0];
130
131     nri              = nlist->nri;
132     iinr             = nlist->iinr;
133     jindex           = nlist->jindex;
134     jjnr             = nlist->jjnr;
135     shiftidx         = nlist->shift;
136     gid              = nlist->gid;
137     shiftvec         = fr->shift_vec[0];
138     fshift           = fr->fshift[0];
139     facel            = _mm256_set1_pd(fr->ic->epsfac);
140     charge           = mdatoms->chargeA;
141     nvdwtype         = fr->ntype;
142     vdwparam         = fr->nbfp;
143     vdwtype          = mdatoms->typeA;
144
145     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
146     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
147     beta2            = _mm256_mul_pd(beta,beta);
148     beta3            = _mm256_mul_pd(beta,beta2);
149
150     ewtab            = fr->ic->tabq_coul_FDV0;
151     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
152     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
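    /* The FDV0 table packs four doubles per grid point: the force value F,
     * the difference D to the next point's F, the potential V, and a fourth
     * filler element that this kernel does not use. The per-lane table index
     * computed in the inner loop is therefore shifted left by 2, and a 4x4
     * transpose turns the four loaded quadruplets into separate F/D/V
     * registers. Per lane this amounts to (scalar sketch, h = 1/tabq_scale):
     *
     *   idx = (int)(r*tabq_scale);
     *   eps = r*tabq_scale - idx;              // fractional offset in the bin
     *   F   = tab[4*idx] + eps*tab[4*idx+1];   // linear interpolation of F
     *   V   = tab[4*idx+2] - 0.5*h*eps*(tab[4*idx] + F);
     */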
153
154     /* Setup water-specific parameters */
155     inr              = nlist->iinr[0];
156     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
157     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
158     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
159     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
160
161     jq1              = _mm256_set1_pd(charge[inr+1]);
162     jq2              = _mm256_set1_pd(charge[inr+2]);
163     jq3              = _mm256_set1_pd(charge[inr+3]);
164     vdwjidx0A        = 2*vdwtype[inr+0];
165     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
166     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
167     qq11             = _mm256_mul_pd(iq1,jq1);
168     qq12             = _mm256_mul_pd(iq1,jq2);
169     qq13             = _mm256_mul_pd(iq1,jq3);
170     qq21             = _mm256_mul_pd(iq2,jq1);
171     qq22             = _mm256_mul_pd(iq2,jq2);
172     qq23             = _mm256_mul_pd(iq2,jq3);
173     qq31             = _mm256_mul_pd(iq3,jq1);
174     qq32             = _mm256_mul_pd(iq3,jq2);
175     qq33             = _mm256_mul_pd(iq3,jq3);
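    /* All waters share one topology, so the nine charge products for the
     * charged sites (1-3 on both molecules) and the single c6/c12 pair for
     * site 0 are precomputed once outside the loops. iq1-iq3 already include
     * facel (fr->ic->epsfac, the electrostatic conversion factor), so
     * qqXY*rinv directly gives an energy. Site 0 only enters through the
     * Lennard-Jones interaction in this kernel, so no qq00 is needed.
     */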
176
177     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
178     rcutoff_scalar   = fr->ic->rcoulomb;
179     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
180     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
181
182     rswitch_scalar   = fr->ic->rcoulomb_switch;
183     rswitch          = _mm256_set1_pd(rswitch_scalar);
184     /* Setup switch parameters */
185     d_scalar         = rcutoff_scalar-rswitch_scalar;
186     d                = _mm256_set1_pd(d_scalar);
187     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
188     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
189     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
190     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
191     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
192     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
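    /* With x = max(r - rswitch, 0) and D = rcutoff - rswitch, the inner
     * loops evaluate the potential-switch polynomial
     *
     *   sw(x)  = 1 - 10*(x/D)^3 + 15*(x/D)^4 - 6*(x/D)^5
     *   dsw(x) = d(sw)/dx = -30*x^2/D^3 + 60*x^3/D^4 - 30*x^4/D^5
     *
     * which takes sw smoothly from 1 at rswitch to 0 at rcutoff with zero
     * first and second derivatives at both ends. The same polynomial is
     * applied to both the Ewald and the Lennard-Jones interactions.
     */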
193
194     /* Avoid stupid compiler warnings */
195     jnrA = jnrB = jnrC = jnrD = 0;
196     j_coord_offsetA = 0;
197     j_coord_offsetB = 0;
198     j_coord_offsetC = 0;
199     j_coord_offsetD = 0;
200
201     outeriter        = 0;
202     inneriter        = 0;
203
204     for(iidx=0;iidx<4*DIM;iidx++)
205     {
206         scratch[iidx] = 0.0;
207     }
208
209     /* Start outer loop over neighborlists */
210     for(iidx=0; iidx<nri; iidx++)
211     {
212         /* Load shift vector for this list */
213         i_shift_offset   = DIM*shiftidx[iidx];
214
215         /* Load limits for loop over neighbors */
216         j_index_start    = jindex[iidx];
217         j_index_end      = jindex[iidx+1];
218
219         /* Get outer coordinate index */
220         inr              = iinr[iidx];
221         i_coord_offset   = DIM*inr;
222
223         /* Load i particle coords and add shift vector */
224         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
225                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
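        /* The same i water interacts with four different j waters each inner
         * iteration, so each of its four site coordinates (plus the periodic
         * shift vector for this list) is broadcast across all SIMD lanes.
         */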
226
227         fix0             = _mm256_setzero_pd();
228         fiy0             = _mm256_setzero_pd();
229         fiz0             = _mm256_setzero_pd();
230         fix1             = _mm256_setzero_pd();
231         fiy1             = _mm256_setzero_pd();
232         fiz1             = _mm256_setzero_pd();
233         fix2             = _mm256_setzero_pd();
234         fiy2             = _mm256_setzero_pd();
235         fiz2             = _mm256_setzero_pd();
236         fix3             = _mm256_setzero_pd();
237         fiy3             = _mm256_setzero_pd();
238         fiz3             = _mm256_setzero_pd();
239
240         /* Reset potential sums */
241         velecsum         = _mm256_setzero_pd();
242         vvdwsum          = _mm256_setzero_pd();
243
244         /* Start inner kernel loop */
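        /* This loop only handles full quads of real j entries; padding
         * entries in jjnr are negative, so iteration stops at the first
         * incomplete quad and the masked epilogue below (the
         * jidx<j_index_end block) handles the remainder.
         */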
245         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
246         {
247
248             /* Get j neighbor index, and coordinate index */
249             jnrA             = jjnr[jidx];
250             jnrB             = jjnr[jidx+1];
251             jnrC             = jjnr[jidx+2];
252             jnrD             = jjnr[jidx+3];
253             j_coord_offsetA  = DIM*jnrA;
254             j_coord_offsetB  = DIM*jnrB;
255             j_coord_offsetC  = DIM*jnrC;
256             j_coord_offsetD  = DIM*jnrD;
257
258             /* load j atom coordinates */
259             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
260                                                  x+j_coord_offsetC,x+j_coord_offsetD,
261                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
262                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
263
264             /* Calculate displacement vector */
265             dx00             = _mm256_sub_pd(ix0,jx0);
266             dy00             = _mm256_sub_pd(iy0,jy0);
267             dz00             = _mm256_sub_pd(iz0,jz0);
268             dx11             = _mm256_sub_pd(ix1,jx1);
269             dy11             = _mm256_sub_pd(iy1,jy1);
270             dz11             = _mm256_sub_pd(iz1,jz1);
271             dx12             = _mm256_sub_pd(ix1,jx2);
272             dy12             = _mm256_sub_pd(iy1,jy2);
273             dz12             = _mm256_sub_pd(iz1,jz2);
274             dx13             = _mm256_sub_pd(ix1,jx3);
275             dy13             = _mm256_sub_pd(iy1,jy3);
276             dz13             = _mm256_sub_pd(iz1,jz3);
277             dx21             = _mm256_sub_pd(ix2,jx1);
278             dy21             = _mm256_sub_pd(iy2,jy1);
279             dz21             = _mm256_sub_pd(iz2,jz1);
280             dx22             = _mm256_sub_pd(ix2,jx2);
281             dy22             = _mm256_sub_pd(iy2,jy2);
282             dz22             = _mm256_sub_pd(iz2,jz2);
283             dx23             = _mm256_sub_pd(ix2,jx3);
284             dy23             = _mm256_sub_pd(iy2,jy3);
285             dz23             = _mm256_sub_pd(iz2,jz3);
286             dx31             = _mm256_sub_pd(ix3,jx1);
287             dy31             = _mm256_sub_pd(iy3,jy1);
288             dz31             = _mm256_sub_pd(iz3,jz1);
289             dx32             = _mm256_sub_pd(ix3,jx2);
290             dy32             = _mm256_sub_pd(iy3,jy2);
291             dz32             = _mm256_sub_pd(iz3,jz2);
292             dx33             = _mm256_sub_pd(ix3,jx3);
293             dy33             = _mm256_sub_pd(iy3,jy3);
294             dz33             = _mm256_sub_pd(iz3,jz3);
295
296             /* Calculate squared distance and things based on it */
297             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
298             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
299             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
300             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
301             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
302             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
303             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
304             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
305             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
306             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
307
308             rinv00           = avx256_invsqrt_d(rsq00);
309             rinv11           = avx256_invsqrt_d(rsq11);
310             rinv12           = avx256_invsqrt_d(rsq12);
311             rinv13           = avx256_invsqrt_d(rsq13);
312             rinv21           = avx256_invsqrt_d(rsq21);
313             rinv22           = avx256_invsqrt_d(rsq22);
314             rinv23           = avx256_invsqrt_d(rsq23);
315             rinv31           = avx256_invsqrt_d(rsq31);
316             rinv32           = avx256_invsqrt_d(rsq32);
317             rinv33           = avx256_invsqrt_d(rsq33);
318
319             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
320             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
321             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
322             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
323             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
324             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
325             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
326             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
327             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
328             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
329
330             fjx0             = _mm256_setzero_pd();
331             fjy0             = _mm256_setzero_pd();
332             fjz0             = _mm256_setzero_pd();
333             fjx1             = _mm256_setzero_pd();
334             fjy1             = _mm256_setzero_pd();
335             fjz1             = _mm256_setzero_pd();
336             fjx2             = _mm256_setzero_pd();
337             fjy2             = _mm256_setzero_pd();
338             fjz2             = _mm256_setzero_pd();
339             fjx3             = _mm256_setzero_pd();
340             fjy3             = _mm256_setzero_pd();
341             fjz3             = _mm256_setzero_pd();
342
343             /**************************
344              * CALCULATE INTERACTIONS *
345              **************************/
346
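            /* Site 0 - site 0: Lennard-Jones with a potential switch. The
             * c6/c12 values taken from fr->nbfp appear to be stored
             * premultiplied by 6 and 12 (the GROMACS group-kernel
             * convention), which is why the potential below carries explicit
             * 1/6 and 1/12 factors while the force term uses vvdw12-vvdw6
             * directly:
             *
             *   vvdw = c12*rinv^12/12 - c6*rinv^6/6
             *   fvdw = (c12*rinv^12 - c6*rinv^6)*rinv^2   (scalar force / r)
             *
             * After switching, lanes beyond the cutoff are zeroed with
             * cutoff_mask before the contributions are accumulated.
             */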
347             if (gmx_mm256_any_lt(rsq00,rcutoff2))
348             {
349
350             r00              = _mm256_mul_pd(rsq00,rinv00);
351
352             /* LENNARD-JONES DISPERSION/REPULSION */
353
354             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
355             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
356             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
357             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
358             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
359
360             d                = _mm256_sub_pd(r00,rswitch);
361             d                = _mm256_max_pd(d,_mm256_setzero_pd());
362             d2               = _mm256_mul_pd(d,d);
363             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
364
365             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
366
367             /* Evaluate switch function */
368             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
369             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
370             vvdw             = _mm256_mul_pd(vvdw,sw);
371             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
372
373             /* Update potential sum for this i atom from the interaction with this j atom. */
374             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
375             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
376
377             fscal            = fvdw;
378
379             fscal            = _mm256_and_pd(fscal,cutoff_mask);
380
381             /* Calculate temporary vectorial force */
382             tx               = _mm256_mul_pd(fscal,dx00);
383             ty               = _mm256_mul_pd(fscal,dy00);
384             tz               = _mm256_mul_pd(fscal,dz00);
385
386             /* Update vectorial force */
387             fix0             = _mm256_add_pd(fix0,tx);
388             fiy0             = _mm256_add_pd(fiy0,ty);
389             fiz0             = _mm256_add_pd(fiz0,tz);
390
391             fjx0             = _mm256_add_pd(fjx0,tx);
392             fjy0             = _mm256_add_pd(fjy0,ty);
393             fjz0             = _mm256_add_pd(fjz0,tz);
394
395             }
396
397             /**************************
398              * CALCULATE INTERACTIONS *
399              **************************/
400
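            /* Site 1 - site 1: Ewald real-space electrostatics with a
             * potential switch. With vtab/ftab the interpolated table values,
             * each lane computes (sketch)
             *
             *   velec = qq*(1/r - vtab(r))        ~ qq*erfc(beta*r)/r
             *   felec = qq/r*(1/r^2 - ftab(r))
             *
             * before the switch and cutoff mask are applied. The same pattern
             * repeats for the remaining eight charged site pairs (1-2 through
             * 3-3) below.
             */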
401             if (gmx_mm256_any_lt(rsq11,rcutoff2))
402             {
403
404             r11              = _mm256_mul_pd(rsq11,rinv11);
405
406             /* EWALD ELECTROSTATICS */
407
408             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
409             ewrt             = _mm256_mul_pd(r11,ewtabscale);
410             ewitab           = _mm256_cvttpd_epi32(ewrt);
411             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
412             ewitab           = _mm_slli_epi32(ewitab,2);
413             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
414             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
415             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
416             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
417             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
418             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
419             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
420             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
421             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
422
423             d                = _mm256_sub_pd(r11,rswitch);
424             d                = _mm256_max_pd(d,_mm256_setzero_pd());
425             d2               = _mm256_mul_pd(d,d);
426             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
427
428             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
429
430             /* Evaluate switch function */
431             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
432             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
433             velec            = _mm256_mul_pd(velec,sw);
434             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
435
436             /* Update potential sum for this i atom from the interaction with this j atom. */
437             velec            = _mm256_and_pd(velec,cutoff_mask);
438             velecsum         = _mm256_add_pd(velecsum,velec);
439
440             fscal            = felec;
441
442             fscal            = _mm256_and_pd(fscal,cutoff_mask);
443
444             /* Calculate temporary vectorial force */
445             tx               = _mm256_mul_pd(fscal,dx11);
446             ty               = _mm256_mul_pd(fscal,dy11);
447             tz               = _mm256_mul_pd(fscal,dz11);
448
449             /* Update vectorial force */
450             fix1             = _mm256_add_pd(fix1,tx);
451             fiy1             = _mm256_add_pd(fiy1,ty);
452             fiz1             = _mm256_add_pd(fiz1,tz);
453
454             fjx1             = _mm256_add_pd(fjx1,tx);
455             fjy1             = _mm256_add_pd(fjy1,ty);
456             fjz1             = _mm256_add_pd(fjz1,tz);
457
458             }
459
460             /**************************
461              * CALCULATE INTERACTIONS *
462              **************************/
463
464             if (gmx_mm256_any_lt(rsq12,rcutoff2))
465             {
466
467             r12              = _mm256_mul_pd(rsq12,rinv12);
468
469             /* EWALD ELECTROSTATICS */
470
471             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
472             ewrt             = _mm256_mul_pd(r12,ewtabscale);
473             ewitab           = _mm256_cvttpd_epi32(ewrt);
474             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
475             ewitab           = _mm_slli_epi32(ewitab,2);
476             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
477             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
478             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
479             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
480             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
481             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
482             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
483             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
484             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
485
486             d                = _mm256_sub_pd(r12,rswitch);
487             d                = _mm256_max_pd(d,_mm256_setzero_pd());
488             d2               = _mm256_mul_pd(d,d);
489             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
490
491             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
492
493             /* Evaluate switch function */
494             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
495             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
496             velec            = _mm256_mul_pd(velec,sw);
497             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
498
499             /* Update potential sum for this i atom from the interaction with this j atom. */
500             velec            = _mm256_and_pd(velec,cutoff_mask);
501             velecsum         = _mm256_add_pd(velecsum,velec);
502
503             fscal            = felec;
504
505             fscal            = _mm256_and_pd(fscal,cutoff_mask);
506
507             /* Calculate temporary vectorial force */
508             tx               = _mm256_mul_pd(fscal,dx12);
509             ty               = _mm256_mul_pd(fscal,dy12);
510             tz               = _mm256_mul_pd(fscal,dz12);
511
512             /* Update vectorial force */
513             fix1             = _mm256_add_pd(fix1,tx);
514             fiy1             = _mm256_add_pd(fiy1,ty);
515             fiz1             = _mm256_add_pd(fiz1,tz);
516
517             fjx2             = _mm256_add_pd(fjx2,tx);
518             fjy2             = _mm256_add_pd(fjy2,ty);
519             fjz2             = _mm256_add_pd(fjz2,tz);
520
521             }
522
523             /**************************
524              * CALCULATE INTERACTIONS *
525              **************************/
526
527             if (gmx_mm256_any_lt(rsq13,rcutoff2))
528             {
529
530             r13              = _mm256_mul_pd(rsq13,rinv13);
531
532             /* EWALD ELECTROSTATICS */
533
534             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
535             ewrt             = _mm256_mul_pd(r13,ewtabscale);
536             ewitab           = _mm256_cvttpd_epi32(ewrt);
537             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
538             ewitab           = _mm_slli_epi32(ewitab,2);
539             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
540             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
541             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
542             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
543             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
544             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
545             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
546             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
547             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
548
549             d                = _mm256_sub_pd(r13,rswitch);
550             d                = _mm256_max_pd(d,_mm256_setzero_pd());
551             d2               = _mm256_mul_pd(d,d);
552             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
553
554             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
555
556             /* Evaluate switch function */
557             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
558             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
559             velec            = _mm256_mul_pd(velec,sw);
560             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
561
562             /* Update potential sum for this i atom from the interaction with this j atom. */
563             velec            = _mm256_and_pd(velec,cutoff_mask);
564             velecsum         = _mm256_add_pd(velecsum,velec);
565
566             fscal            = felec;
567
568             fscal            = _mm256_and_pd(fscal,cutoff_mask);
569
570             /* Calculate temporary vectorial force */
571             tx               = _mm256_mul_pd(fscal,dx13);
572             ty               = _mm256_mul_pd(fscal,dy13);
573             tz               = _mm256_mul_pd(fscal,dz13);
574
575             /* Update vectorial force */
576             fix1             = _mm256_add_pd(fix1,tx);
577             fiy1             = _mm256_add_pd(fiy1,ty);
578             fiz1             = _mm256_add_pd(fiz1,tz);
579
580             fjx3             = _mm256_add_pd(fjx3,tx);
581             fjy3             = _mm256_add_pd(fjy3,ty);
582             fjz3             = _mm256_add_pd(fjz3,tz);
583
584             }
585
586             /**************************
587              * CALCULATE INTERACTIONS *
588              **************************/
589
590             if (gmx_mm256_any_lt(rsq21,rcutoff2))
591             {
592
593             r21              = _mm256_mul_pd(rsq21,rinv21);
594
595             /* EWALD ELECTROSTATICS */
596
597             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
598             ewrt             = _mm256_mul_pd(r21,ewtabscale);
599             ewitab           = _mm256_cvttpd_epi32(ewrt);
600             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
601             ewitab           = _mm_slli_epi32(ewitab,2);
602             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
603             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
604             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
605             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
606             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
607             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
608             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
609             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
610             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
611
612             d                = _mm256_sub_pd(r21,rswitch);
613             d                = _mm256_max_pd(d,_mm256_setzero_pd());
614             d2               = _mm256_mul_pd(d,d);
615             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
616
617             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
618
619             /* Evaluate switch function */
620             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
621             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
622             velec            = _mm256_mul_pd(velec,sw);
623             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
624
625             /* Update potential sum for this i atom from the interaction with this j atom. */
626             velec            = _mm256_and_pd(velec,cutoff_mask);
627             velecsum         = _mm256_add_pd(velecsum,velec);
628
629             fscal            = felec;
630
631             fscal            = _mm256_and_pd(fscal,cutoff_mask);
632
633             /* Calculate temporary vectorial force */
634             tx               = _mm256_mul_pd(fscal,dx21);
635             ty               = _mm256_mul_pd(fscal,dy21);
636             tz               = _mm256_mul_pd(fscal,dz21);
637
638             /* Update vectorial force */
639             fix2             = _mm256_add_pd(fix2,tx);
640             fiy2             = _mm256_add_pd(fiy2,ty);
641             fiz2             = _mm256_add_pd(fiz2,tz);
642
643             fjx1             = _mm256_add_pd(fjx1,tx);
644             fjy1             = _mm256_add_pd(fjy1,ty);
645             fjz1             = _mm256_add_pd(fjz1,tz);
646
647             }
648
649             /**************************
650              * CALCULATE INTERACTIONS *
651              **************************/
652
653             if (gmx_mm256_any_lt(rsq22,rcutoff2))
654             {
655
656             r22              = _mm256_mul_pd(rsq22,rinv22);
657
658             /* EWALD ELECTROSTATICS */
659
660             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
661             ewrt             = _mm256_mul_pd(r22,ewtabscale);
662             ewitab           = _mm256_cvttpd_epi32(ewrt);
663             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
664             ewitab           = _mm_slli_epi32(ewitab,2);
665             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
666             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
667             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
668             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
669             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
670             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
671             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
672             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
673             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
674
675             d                = _mm256_sub_pd(r22,rswitch);
676             d                = _mm256_max_pd(d,_mm256_setzero_pd());
677             d2               = _mm256_mul_pd(d,d);
678             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
679
680             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
681
682             /* Evaluate switch function */
683             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
684             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
685             velec            = _mm256_mul_pd(velec,sw);
686             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
687
688             /* Update potential sum for this i atom from the interaction with this j atom. */
689             velec            = _mm256_and_pd(velec,cutoff_mask);
690             velecsum         = _mm256_add_pd(velecsum,velec);
691
692             fscal            = felec;
693
694             fscal            = _mm256_and_pd(fscal,cutoff_mask);
695
696             /* Calculate temporary vectorial force */
697             tx               = _mm256_mul_pd(fscal,dx22);
698             ty               = _mm256_mul_pd(fscal,dy22);
699             tz               = _mm256_mul_pd(fscal,dz22);
700
701             /* Update vectorial force */
702             fix2             = _mm256_add_pd(fix2,tx);
703             fiy2             = _mm256_add_pd(fiy2,ty);
704             fiz2             = _mm256_add_pd(fiz2,tz);
705
706             fjx2             = _mm256_add_pd(fjx2,tx);
707             fjy2             = _mm256_add_pd(fjy2,ty);
708             fjz2             = _mm256_add_pd(fjz2,tz);
709
710             }
711
712             /**************************
713              * CALCULATE INTERACTIONS *
714              **************************/
715
716             if (gmx_mm256_any_lt(rsq23,rcutoff2))
717             {
718
719             r23              = _mm256_mul_pd(rsq23,rinv23);
720
721             /* EWALD ELECTROSTATICS */
722
723             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
724             ewrt             = _mm256_mul_pd(r23,ewtabscale);
725             ewitab           = _mm256_cvttpd_epi32(ewrt);
726             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
727             ewitab           = _mm_slli_epi32(ewitab,2);
728             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
729             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
730             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
731             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
732             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
733             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
734             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
735             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
736             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
737
738             d                = _mm256_sub_pd(r23,rswitch);
739             d                = _mm256_max_pd(d,_mm256_setzero_pd());
740             d2               = _mm256_mul_pd(d,d);
741             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
742
743             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
744
745             /* Evaluate switch function */
746             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
747             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
748             velec            = _mm256_mul_pd(velec,sw);
749             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
750
751             /* Update potential sum for this i atom from the interaction with this j atom. */
752             velec            = _mm256_and_pd(velec,cutoff_mask);
753             velecsum         = _mm256_add_pd(velecsum,velec);
754
755             fscal            = felec;
756
757             fscal            = _mm256_and_pd(fscal,cutoff_mask);
758
759             /* Calculate temporary vectorial force */
760             tx               = _mm256_mul_pd(fscal,dx23);
761             ty               = _mm256_mul_pd(fscal,dy23);
762             tz               = _mm256_mul_pd(fscal,dz23);
763
764             /* Update vectorial force */
765             fix2             = _mm256_add_pd(fix2,tx);
766             fiy2             = _mm256_add_pd(fiy2,ty);
767             fiz2             = _mm256_add_pd(fiz2,tz);
768
769             fjx3             = _mm256_add_pd(fjx3,tx);
770             fjy3             = _mm256_add_pd(fjy3,ty);
771             fjz3             = _mm256_add_pd(fjz3,tz);
772
773             }
774
775             /**************************
776              * CALCULATE INTERACTIONS *
777              **************************/
778
779             if (gmx_mm256_any_lt(rsq31,rcutoff2))
780             {
781
782             r31              = _mm256_mul_pd(rsq31,rinv31);
783
784             /* EWALD ELECTROSTATICS */
785
786             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
787             ewrt             = _mm256_mul_pd(r31,ewtabscale);
788             ewitab           = _mm256_cvttpd_epi32(ewrt);
789             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
790             ewitab           = _mm_slli_epi32(ewitab,2);
791             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
792             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
793             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
794             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
795             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
796             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
797             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
798             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
799             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
800
801             d                = _mm256_sub_pd(r31,rswitch);
802             d                = _mm256_max_pd(d,_mm256_setzero_pd());
803             d2               = _mm256_mul_pd(d,d);
804             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
805
806             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
807
808             /* Evaluate switch function */
809             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
810             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
811             velec            = _mm256_mul_pd(velec,sw);
812             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
813
814             /* Update potential sum for this i atom from the interaction with this j atom. */
815             velec            = _mm256_and_pd(velec,cutoff_mask);
816             velecsum         = _mm256_add_pd(velecsum,velec);
817
818             fscal            = felec;
819
820             fscal            = _mm256_and_pd(fscal,cutoff_mask);
821
822             /* Calculate temporary vectorial force */
823             tx               = _mm256_mul_pd(fscal,dx31);
824             ty               = _mm256_mul_pd(fscal,dy31);
825             tz               = _mm256_mul_pd(fscal,dz31);
826
827             /* Update vectorial force */
828             fix3             = _mm256_add_pd(fix3,tx);
829             fiy3             = _mm256_add_pd(fiy3,ty);
830             fiz3             = _mm256_add_pd(fiz3,tz);
831
832             fjx1             = _mm256_add_pd(fjx1,tx);
833             fjy1             = _mm256_add_pd(fjy1,ty);
834             fjz1             = _mm256_add_pd(fjz1,tz);
835
836             }
837
838             /**************************
839              * CALCULATE INTERACTIONS *
840              **************************/
841
842             if (gmx_mm256_any_lt(rsq32,rcutoff2))
843             {
844
845             r32              = _mm256_mul_pd(rsq32,rinv32);
846
847             /* EWALD ELECTROSTATICS */
848
849             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
850             ewrt             = _mm256_mul_pd(r32,ewtabscale);
851             ewitab           = _mm256_cvttpd_epi32(ewrt);
852             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
853             ewitab           = _mm_slli_epi32(ewitab,2);
854             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
855             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
856             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
857             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
858             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
859             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
860             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
861             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
862             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
863
864             d                = _mm256_sub_pd(r32,rswitch);
865             d                = _mm256_max_pd(d,_mm256_setzero_pd());
866             d2               = _mm256_mul_pd(d,d);
867             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
868
869             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
870
871             /* Evaluate switch function */
872             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
873             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
874             velec            = _mm256_mul_pd(velec,sw);
875             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
876
877             /* Update potential sum for this i atom from the interaction with this j atom. */
878             velec            = _mm256_and_pd(velec,cutoff_mask);
879             velecsum         = _mm256_add_pd(velecsum,velec);
880
881             fscal            = felec;
882
883             fscal            = _mm256_and_pd(fscal,cutoff_mask);
884
885             /* Calculate temporary vectorial force */
886             tx               = _mm256_mul_pd(fscal,dx32);
887             ty               = _mm256_mul_pd(fscal,dy32);
888             tz               = _mm256_mul_pd(fscal,dz32);
889
890             /* Update vectorial force */
891             fix3             = _mm256_add_pd(fix3,tx);
892             fiy3             = _mm256_add_pd(fiy3,ty);
893             fiz3             = _mm256_add_pd(fiz3,tz);
894
895             fjx2             = _mm256_add_pd(fjx2,tx);
896             fjy2             = _mm256_add_pd(fjy2,ty);
897             fjz2             = _mm256_add_pd(fjz2,tz);
898
899             }
900
901             /**************************
902              * CALCULATE INTERACTIONS *
903              **************************/
904
905             if (gmx_mm256_any_lt(rsq33,rcutoff2))
906             {
907
908             r33              = _mm256_mul_pd(rsq33,rinv33);
909
910             /* EWALD ELECTROSTATICS */
911
912             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
913             ewrt             = _mm256_mul_pd(r33,ewtabscale);
914             ewitab           = _mm256_cvttpd_epi32(ewrt);
915             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
916             ewitab           = _mm_slli_epi32(ewitab,2);
917             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
918             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
919             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
920             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
921             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
922             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
923             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
924             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
925             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
926
927             d                = _mm256_sub_pd(r33,rswitch);
928             d                = _mm256_max_pd(d,_mm256_setzero_pd());
929             d2               = _mm256_mul_pd(d,d);
930             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
931
932             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
933
934             /* Evaluate switch function */
935             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
936             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
937             velec            = _mm256_mul_pd(velec,sw);
938             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
939
940             /* Update potential sum for this i atom from the interaction with this j atom. */
941             velec            = _mm256_and_pd(velec,cutoff_mask);
942             velecsum         = _mm256_add_pd(velecsum,velec);
943
944             fscal            = felec;
945
946             fscal            = _mm256_and_pd(fscal,cutoff_mask);
947
948             /* Calculate temporary vectorial force */
949             tx               = _mm256_mul_pd(fscal,dx33);
950             ty               = _mm256_mul_pd(fscal,dy33);
951             tz               = _mm256_mul_pd(fscal,dz33);
952
953             /* Update vectorial force */
954             fix3             = _mm256_add_pd(fix3,tx);
955             fiy3             = _mm256_add_pd(fiy3,ty);
956             fiz3             = _mm256_add_pd(fiz3,tz);
957
958             fjx3             = _mm256_add_pd(fjx3,tx);
959             fjy3             = _mm256_add_pd(fjy3,ty);
960             fjz3             = _mm256_add_pd(fjz3,tz);
961
962             }
963
964             fjptrA             = f+j_coord_offsetA;
965             fjptrB             = f+j_coord_offsetB;
966             fjptrC             = f+j_coord_offsetC;
967             fjptrD             = f+j_coord_offsetD;
968
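            /* Newton's third law: the j forces accumulated above carry the
             * sign of the force on the i sites, so the routine below
             * subtracts them from the j entries of f[] in one swizzled
             * update of all four j waters.
             */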
969             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
970                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
971                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
972
973             /* Inner loop uses 647 flops */
974         }
975
976         if(jidx<j_index_end)
977         {
978
979             /* Get j neighbor index, and coordinate index */
980             jnrlistA         = jjnr[jidx];
981             jnrlistB         = jjnr[jidx+1];
982             jnrlistC         = jjnr[jidx+2];
983             jnrlistD         = jjnr[jidx+3];
984             /* Sign of each element will be negative for non-real atoms.
985              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
986              * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
987              */
988             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
989
990             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
991             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
992             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
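            /* The four 32-bit j indices are compared against zero to give
             * per-entry 32-bit masks; the two permutes duplicate each mask
             * into both halves of a 64-bit lane, and the halves are merged
             * into a single 256-bit double-precision dummy_mask.
             */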
993
994             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
995             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
996             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
997             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
998             j_coord_offsetA  = DIM*jnrA;
999             j_coord_offsetB  = DIM*jnrB;
1000             j_coord_offsetC  = DIM*jnrC;
1001             j_coord_offsetD  = DIM*jnrD;
1002
1003             /* load j atom coordinates */
1004             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1005                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1006                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1007                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1008
1009             /* Calculate displacement vector */
1010             dx00             = _mm256_sub_pd(ix0,jx0);
1011             dy00             = _mm256_sub_pd(iy0,jy0);
1012             dz00             = _mm256_sub_pd(iz0,jz0);
1013             dx11             = _mm256_sub_pd(ix1,jx1);
1014             dy11             = _mm256_sub_pd(iy1,jy1);
1015             dz11             = _mm256_sub_pd(iz1,jz1);
1016             dx12             = _mm256_sub_pd(ix1,jx2);
1017             dy12             = _mm256_sub_pd(iy1,jy2);
1018             dz12             = _mm256_sub_pd(iz1,jz2);
1019             dx13             = _mm256_sub_pd(ix1,jx3);
1020             dy13             = _mm256_sub_pd(iy1,jy3);
1021             dz13             = _mm256_sub_pd(iz1,jz3);
1022             dx21             = _mm256_sub_pd(ix2,jx1);
1023             dy21             = _mm256_sub_pd(iy2,jy1);
1024             dz21             = _mm256_sub_pd(iz2,jz1);
1025             dx22             = _mm256_sub_pd(ix2,jx2);
1026             dy22             = _mm256_sub_pd(iy2,jy2);
1027             dz22             = _mm256_sub_pd(iz2,jz2);
1028             dx23             = _mm256_sub_pd(ix2,jx3);
1029             dy23             = _mm256_sub_pd(iy2,jy3);
1030             dz23             = _mm256_sub_pd(iz2,jz3);
1031             dx31             = _mm256_sub_pd(ix3,jx1);
1032             dy31             = _mm256_sub_pd(iy3,jy1);
1033             dz31             = _mm256_sub_pd(iz3,jz1);
1034             dx32             = _mm256_sub_pd(ix3,jx2);
1035             dy32             = _mm256_sub_pd(iy3,jy2);
1036             dz32             = _mm256_sub_pd(iz3,jz2);
1037             dx33             = _mm256_sub_pd(ix3,jx3);
1038             dy33             = _mm256_sub_pd(iy3,jy3);
1039             dz33             = _mm256_sub_pd(iz3,jz3);
1040
1041             /* Calculate squared distance and things based on it */
1042             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1043             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1044             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1045             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1046             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1047             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1048             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1049             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1050             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1051             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1052
1053             rinv00           = avx256_invsqrt_d(rsq00);
1054             rinv11           = avx256_invsqrt_d(rsq11);
1055             rinv12           = avx256_invsqrt_d(rsq12);
1056             rinv13           = avx256_invsqrt_d(rsq13);
1057             rinv21           = avx256_invsqrt_d(rsq21);
1058             rinv22           = avx256_invsqrt_d(rsq22);
1059             rinv23           = avx256_invsqrt_d(rsq23);
1060             rinv31           = avx256_invsqrt_d(rsq31);
1061             rinv32           = avx256_invsqrt_d(rsq32);
1062             rinv33           = avx256_invsqrt_d(rsq33);
1063
1064             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1065             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1066             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1067             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1068             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1069             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1070             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1071             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1072             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1073             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
1074
1075             fjx0             = _mm256_setzero_pd();
1076             fjy0             = _mm256_setzero_pd();
1077             fjz0             = _mm256_setzero_pd();
1078             fjx1             = _mm256_setzero_pd();
1079             fjy1             = _mm256_setzero_pd();
1080             fjz1             = _mm256_setzero_pd();
1081             fjx2             = _mm256_setzero_pd();
1082             fjy2             = _mm256_setzero_pd();
1083             fjz2             = _mm256_setzero_pd();
1084             fjx3             = _mm256_setzero_pd();
1085             fjy3             = _mm256_setzero_pd();
1086             fjz3             = _mm256_setzero_pd();
1087
1088             /**************************
1089              * CALCULATE INTERACTIONS *
1090              **************************/
1091
1092             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1093             {
1094
1095             r00              = _mm256_mul_pd(rsq00,rinv00);
1096             r00              = _mm256_andnot_pd(dummy_mask,r00);
1097
1098             /* LENNARD-JONES DISPERSION/REPULSION */
1099
1100             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1101             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
1102             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1103             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1104             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
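            /* c6_00 and c12_00 come from fr->nbfp, which stores the Lennard-Jones
             * parameters with the 6 and 12 prefactors already included (hence the
             * one_twelfth/one_sixth factors when forming the potential):
             *
             *     vvdw = C12/r^12 - C6/r^6
             *     fvdw = (12*C12/r^12 - 6*C6/r^6)/r^2 = F/r
             */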
1105
1106             d                = _mm256_sub_pd(r00,rswitch);
1107             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1108             d2               = _mm256_mul_pd(d,d);
1109             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1110
1111             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
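            /* Fifth-order potential switch: with d = max(r-rswitch,0) and D = rcutoff-rswitch,
             *
             *     sw  = S(r)  = 1 - 10*(d/D)^3 + 15*(d/D)^4 - 6*(d/D)^5
             *     dsw = dS/dr =   - 30*d^2/D^3 + 60*d^3/D^4 - 30*d^4/D^5
             *
             * S is 1 at rswitch and 0 at rcutoff, with vanishing first and second
             * derivatives at both ends, so V*S and the corresponding force go smoothly
             * to zero at the cut-off.
             */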
1112
1113             /* Evaluate switch function */
1114             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1115             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1116             vvdw             = _mm256_mul_pd(vvdw,sw);
1117             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1118
1119             /* Update potential sum for this i atom from the interaction with this j atom. */
1120             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
1121             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
1122             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
1123
1124             fscal            = fvdw;
1125
1126             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1127
1128             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
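            /* The gmx_mm256_any_lt() test above only guarantees that at least one of the
             * four pairs is inside the cut-off; cutoff_mask zeroes the lanes that are not,
             * and dummy_mask zeroes the padding lanes, so neither contributes to the
             * accumulated potential or forces.
             */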
1129
1130             /* Calculate temporary vectorial force */
1131             tx               = _mm256_mul_pd(fscal,dx00);
1132             ty               = _mm256_mul_pd(fscal,dy00);
1133             tz               = _mm256_mul_pd(fscal,dz00);
1134
1135             /* Update vectorial force */
1136             fix0             = _mm256_add_pd(fix0,tx);
1137             fiy0             = _mm256_add_pd(fiy0,ty);
1138             fiz0             = _mm256_add_pd(fiz0,tz);
1139
1140             fjx0             = _mm256_add_pd(fjx0,tx);
1141             fjy0             = _mm256_add_pd(fjy0,ty);
1142             fjz0             = _mm256_add_pd(fjz0,tz);
1143
1144             }
1145
1146             /**************************
1147              * CALCULATE INTERACTIONS *
1148              **************************/
1149
1150             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1151             {
1152
1153             r11              = _mm256_mul_pd(rsq11,rinv11);
1154             r11              = _mm256_andnot_pd(dummy_mask,r11);
1155
1156             /* EWALD ELECTROSTATICS */
1157
1158             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1159             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1160             ewitab           = _mm256_cvttpd_epi32(ewrt);
1161             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1162             ewitab           = _mm_slli_epi32(ewitab,2);
1163             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1164             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1165             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1166             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1167             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1168             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1169             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1170             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1171             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
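            /* Each lane loaded one entry of the FDV0-layout table (per point: F, the
             * difference to the next F, V, and a padding zero); the 4x4 transpose regroups
             * them into vectors of F, D and V. The interpolation above is
             *
             *     Ftab = F + eweps*D                            (linear in the force)
             *     Vtab = V - eweps/(2*tabscale)*(F + Ftab)      (integral of the interpolated force)
             *
             * and since the table effectively holds the difference between the plain
             * Coulomb and the Ewald real-space (erfc) interaction, qq*(1/r - Vtab) and
             * qq*(1/r)*(1/r^2 - Ftab) give the screened potential and the scalar force F/r.
             */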
1172
1173             d                = _mm256_sub_pd(r11,rswitch);
1174             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1175             d2               = _mm256_mul_pd(d,d);
1176             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1177
1178             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1179
1180             /* Evaluate switch function */
1181             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1182             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
1183             velec            = _mm256_mul_pd(velec,sw);
1184             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1185
1186             /* Update potential sum for this i atom from the interaction with this j atom. */
1187             velec            = _mm256_and_pd(velec,cutoff_mask);
1188             velec            = _mm256_andnot_pd(dummy_mask,velec);
1189             velecsum         = _mm256_add_pd(velecsum,velec);
1190
1191             fscal            = felec;
1192
1193             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1194
1195             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1196
1197             /* Calculate temporary vectorial force */
1198             tx               = _mm256_mul_pd(fscal,dx11);
1199             ty               = _mm256_mul_pd(fscal,dy11);
1200             tz               = _mm256_mul_pd(fscal,dz11);
1201
1202             /* Update vectorial force */
1203             fix1             = _mm256_add_pd(fix1,tx);
1204             fiy1             = _mm256_add_pd(fiy1,ty);
1205             fiz1             = _mm256_add_pd(fiz1,tz);
1206
1207             fjx1             = _mm256_add_pd(fjx1,tx);
1208             fjy1             = _mm256_add_pd(fjy1,ty);
1209             fjz1             = _mm256_add_pd(fjz1,tz);
1210
1211             }
1212
1213             /**************************
1214              * CALCULATE INTERACTIONS *
1215              **************************/
1216
1217             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1218             {
1219
1220             r12              = _mm256_mul_pd(rsq12,rinv12);
1221             r12              = _mm256_andnot_pd(dummy_mask,r12);
1222
1223             /* EWALD ELECTROSTATICS */
1224
1225             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1226             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1227             ewitab           = _mm256_cvttpd_epi32(ewrt);
1228             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1229             ewitab           = _mm_slli_epi32(ewitab,2);
1230             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1231             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1232             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1233             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1234             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1235             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1236             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1237             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1238             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1239
1240             d                = _mm256_sub_pd(r12,rswitch);
1241             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1242             d2               = _mm256_mul_pd(d,d);
1243             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1244
1245             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1246
1247             /* Evaluate switch function */
1248             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1249             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
1250             velec            = _mm256_mul_pd(velec,sw);
1251             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1252
1253             /* Update potential sum for this i atom from the interaction with this j atom. */
1254             velec            = _mm256_and_pd(velec,cutoff_mask);
1255             velec            = _mm256_andnot_pd(dummy_mask,velec);
1256             velecsum         = _mm256_add_pd(velecsum,velec);
1257
1258             fscal            = felec;
1259
1260             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1261
1262             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1263
1264             /* Calculate temporary vectorial force */
1265             tx               = _mm256_mul_pd(fscal,dx12);
1266             ty               = _mm256_mul_pd(fscal,dy12);
1267             tz               = _mm256_mul_pd(fscal,dz12);
1268
1269             /* Update vectorial force */
1270             fix1             = _mm256_add_pd(fix1,tx);
1271             fiy1             = _mm256_add_pd(fiy1,ty);
1272             fiz1             = _mm256_add_pd(fiz1,tz);
1273
1274             fjx2             = _mm256_add_pd(fjx2,tx);
1275             fjy2             = _mm256_add_pd(fjy2,ty);
1276             fjz2             = _mm256_add_pd(fjz2,tz);
1277
1278             }
1279
1280             /**************************
1281              * CALCULATE INTERACTIONS *
1282              **************************/
1283
1284             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1285             {
1286
1287             r13              = _mm256_mul_pd(rsq13,rinv13);
1288             r13              = _mm256_andnot_pd(dummy_mask,r13);
1289
1290             /* EWALD ELECTROSTATICS */
1291
1292             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1293             ewrt             = _mm256_mul_pd(r13,ewtabscale);
1294             ewitab           = _mm256_cvttpd_epi32(ewrt);
1295             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1296             ewitab           = _mm_slli_epi32(ewitab,2);
1297             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1298             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1299             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1300             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1301             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1302             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1303             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1304             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
1305             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1306
1307             d                = _mm256_sub_pd(r13,rswitch);
1308             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1309             d2               = _mm256_mul_pd(d,d);
1310             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1311
1312             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1313
1314             /* Evaluate switch function */
1315             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1316             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
1317             velec            = _mm256_mul_pd(velec,sw);
1318             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1319
1320             /* Update potential sum for this i atom from the interaction with this j atom. */
1321             velec            = _mm256_and_pd(velec,cutoff_mask);
1322             velec            = _mm256_andnot_pd(dummy_mask,velec);
1323             velecsum         = _mm256_add_pd(velecsum,velec);
1324
1325             fscal            = felec;
1326
1327             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1328
1329             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1330
1331             /* Calculate temporary vectorial force */
1332             tx               = _mm256_mul_pd(fscal,dx13);
1333             ty               = _mm256_mul_pd(fscal,dy13);
1334             tz               = _mm256_mul_pd(fscal,dz13);
1335
1336             /* Update vectorial force */
1337             fix1             = _mm256_add_pd(fix1,tx);
1338             fiy1             = _mm256_add_pd(fiy1,ty);
1339             fiz1             = _mm256_add_pd(fiz1,tz);
1340
1341             fjx3             = _mm256_add_pd(fjx3,tx);
1342             fjy3             = _mm256_add_pd(fjy3,ty);
1343             fjz3             = _mm256_add_pd(fjz3,tz);
1344
1345             }
1346
1347             /**************************
1348              * CALCULATE INTERACTIONS *
1349              **************************/
1350
1351             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1352             {
1353
1354             r21              = _mm256_mul_pd(rsq21,rinv21);
1355             r21              = _mm256_andnot_pd(dummy_mask,r21);
1356
1357             /* EWALD ELECTROSTATICS */
1358
1359             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1360             ewrt             = _mm256_mul_pd(r21,ewtabscale);
1361             ewitab           = _mm256_cvttpd_epi32(ewrt);
1362             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1363             ewitab           = _mm_slli_epi32(ewitab,2);
1364             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1365             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1366             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1367             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1368             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1369             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1370             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1371             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1372             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1373
1374             d                = _mm256_sub_pd(r21,rswitch);
1375             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1376             d2               = _mm256_mul_pd(d,d);
1377             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1378
1379             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1380
1381             /* Evaluate switch function */
1382             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1383             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
1384             velec            = _mm256_mul_pd(velec,sw);
1385             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1386
1387             /* Update potential sum for this i atom from the interaction with this j atom. */
1388             velec            = _mm256_and_pd(velec,cutoff_mask);
1389             velec            = _mm256_andnot_pd(dummy_mask,velec);
1390             velecsum         = _mm256_add_pd(velecsum,velec);
1391
1392             fscal            = felec;
1393
1394             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1395
1396             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1397
1398             /* Calculate temporary vectorial force */
1399             tx               = _mm256_mul_pd(fscal,dx21);
1400             ty               = _mm256_mul_pd(fscal,dy21);
1401             tz               = _mm256_mul_pd(fscal,dz21);
1402
1403             /* Update vectorial force */
1404             fix2             = _mm256_add_pd(fix2,tx);
1405             fiy2             = _mm256_add_pd(fiy2,ty);
1406             fiz2             = _mm256_add_pd(fiz2,tz);
1407
1408             fjx1             = _mm256_add_pd(fjx1,tx);
1409             fjy1             = _mm256_add_pd(fjy1,ty);
1410             fjz1             = _mm256_add_pd(fjz1,tz);
1411
1412             }
1413
1414             /**************************
1415              * CALCULATE INTERACTIONS *
1416              **************************/
1417
1418             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1419             {
1420
1421             r22              = _mm256_mul_pd(rsq22,rinv22);
1422             r22              = _mm256_andnot_pd(dummy_mask,r22);
1423
1424             /* EWALD ELECTROSTATICS */
1425
1426             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1427             ewrt             = _mm256_mul_pd(r22,ewtabscale);
1428             ewitab           = _mm256_cvttpd_epi32(ewrt);
1429             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1430             ewitab           = _mm_slli_epi32(ewitab,2);
1431             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1432             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1433             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1434             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1435             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1436             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1437             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1438             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1439             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1440
1441             d                = _mm256_sub_pd(r22,rswitch);
1442             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1443             d2               = _mm256_mul_pd(d,d);
1444             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1445
1446             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1447
1448             /* Evaluate switch function */
1449             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1450             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
1451             velec            = _mm256_mul_pd(velec,sw);
1452             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1453
1454             /* Update potential sum for this i atom from the interaction with this j atom. */
1455             velec            = _mm256_and_pd(velec,cutoff_mask);
1456             velec            = _mm256_andnot_pd(dummy_mask,velec);
1457             velecsum         = _mm256_add_pd(velecsum,velec);
1458
1459             fscal            = felec;
1460
1461             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1462
1463             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1464
1465             /* Calculate temporary vectorial force */
1466             tx               = _mm256_mul_pd(fscal,dx22);
1467             ty               = _mm256_mul_pd(fscal,dy22);
1468             tz               = _mm256_mul_pd(fscal,dz22);
1469
1470             /* Update vectorial force */
1471             fix2             = _mm256_add_pd(fix2,tx);
1472             fiy2             = _mm256_add_pd(fiy2,ty);
1473             fiz2             = _mm256_add_pd(fiz2,tz);
1474
1475             fjx2             = _mm256_add_pd(fjx2,tx);
1476             fjy2             = _mm256_add_pd(fjy2,ty);
1477             fjz2             = _mm256_add_pd(fjz2,tz);
1478
1479             }
1480
1481             /**************************
1482              * CALCULATE INTERACTIONS *
1483              **************************/
1484
1485             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1486             {
1487
1488             r23              = _mm256_mul_pd(rsq23,rinv23);
1489             r23              = _mm256_andnot_pd(dummy_mask,r23);
1490
1491             /* EWALD ELECTROSTATICS */
1492
1493             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1494             ewrt             = _mm256_mul_pd(r23,ewtabscale);
1495             ewitab           = _mm256_cvttpd_epi32(ewrt);
1496             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1497             ewitab           = _mm_slli_epi32(ewitab,2);
1498             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1499             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1500             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1501             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1502             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1503             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1504             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1505             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1506             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1507
1508             d                = _mm256_sub_pd(r23,rswitch);
1509             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1510             d2               = _mm256_mul_pd(d,d);
1511             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1512
1513             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1514
1515             /* Evaluate switch function */
1516             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1517             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
1518             velec            = _mm256_mul_pd(velec,sw);
1519             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1520
1521             /* Update potential sum for this i atom from the interaction with this j atom. */
1522             velec            = _mm256_and_pd(velec,cutoff_mask);
1523             velec            = _mm256_andnot_pd(dummy_mask,velec);
1524             velecsum         = _mm256_add_pd(velecsum,velec);
1525
1526             fscal            = felec;
1527
1528             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1529
1530             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1531
1532             /* Calculate temporary vectorial force */
1533             tx               = _mm256_mul_pd(fscal,dx23);
1534             ty               = _mm256_mul_pd(fscal,dy23);
1535             tz               = _mm256_mul_pd(fscal,dz23);
1536
1537             /* Update vectorial force */
1538             fix2             = _mm256_add_pd(fix2,tx);
1539             fiy2             = _mm256_add_pd(fiy2,ty);
1540             fiz2             = _mm256_add_pd(fiz2,tz);
1541
1542             fjx3             = _mm256_add_pd(fjx3,tx);
1543             fjy3             = _mm256_add_pd(fjy3,ty);
1544             fjz3             = _mm256_add_pd(fjz3,tz);
1545
1546             }
1547
1548             /**************************
1549              * CALCULATE INTERACTIONS *
1550              **************************/
1551
1552             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1553             {
1554
1555             r31              = _mm256_mul_pd(rsq31,rinv31);
1556             r31              = _mm256_andnot_pd(dummy_mask,r31);
1557
1558             /* EWALD ELECTROSTATICS */
1559
1560             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1561             ewrt             = _mm256_mul_pd(r31,ewtabscale);
1562             ewitab           = _mm256_cvttpd_epi32(ewrt);
1563             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1564             ewitab           = _mm_slli_epi32(ewitab,2);
1565             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1566             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1567             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1568             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1569             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1570             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1571             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1572             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1573             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1574
1575             d                = _mm256_sub_pd(r31,rswitch);
1576             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1577             d2               = _mm256_mul_pd(d,d);
1578             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1579
1580             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1581
1582             /* Evaluate switch function */
1583             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1584             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
1585             velec            = _mm256_mul_pd(velec,sw);
1586             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1587
1588             /* Update potential sum for this i atom from the interaction with this j atom. */
1589             velec            = _mm256_and_pd(velec,cutoff_mask);
1590             velec            = _mm256_andnot_pd(dummy_mask,velec);
1591             velecsum         = _mm256_add_pd(velecsum,velec);
1592
1593             fscal            = felec;
1594
1595             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1596
1597             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1598
1599             /* Calculate temporary vectorial force */
1600             tx               = _mm256_mul_pd(fscal,dx31);
1601             ty               = _mm256_mul_pd(fscal,dy31);
1602             tz               = _mm256_mul_pd(fscal,dz31);
1603
1604             /* Update vectorial force */
1605             fix3             = _mm256_add_pd(fix3,tx);
1606             fiy3             = _mm256_add_pd(fiy3,ty);
1607             fiz3             = _mm256_add_pd(fiz3,tz);
1608
1609             fjx1             = _mm256_add_pd(fjx1,tx);
1610             fjy1             = _mm256_add_pd(fjy1,ty);
1611             fjz1             = _mm256_add_pd(fjz1,tz);
1612
1613             }
1614
1615             /**************************
1616              * CALCULATE INTERACTIONS *
1617              **************************/
1618
1619             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1620             {
1621
1622             r32              = _mm256_mul_pd(rsq32,rinv32);
1623             r32              = _mm256_andnot_pd(dummy_mask,r32);
1624
1625             /* EWALD ELECTROSTATICS */
1626
1627             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1628             ewrt             = _mm256_mul_pd(r32,ewtabscale);
1629             ewitab           = _mm256_cvttpd_epi32(ewrt);
1630             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1631             ewitab           = _mm_slli_epi32(ewitab,2);
1632             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1633             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1634             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1635             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1636             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1637             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1638             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1639             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1640             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1641
1642             d                = _mm256_sub_pd(r32,rswitch);
1643             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1644             d2               = _mm256_mul_pd(d,d);
1645             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1646
1647             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1648
1649             /* Evaluate switch function */
1650             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1651             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
1652             velec            = _mm256_mul_pd(velec,sw);
1653             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1654
1655             /* Update potential sum for this i atom from the interaction with this j atom. */
1656             velec            = _mm256_and_pd(velec,cutoff_mask);
1657             velec            = _mm256_andnot_pd(dummy_mask,velec);
1658             velecsum         = _mm256_add_pd(velecsum,velec);
1659
1660             fscal            = felec;
1661
1662             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1663
1664             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1665
1666             /* Calculate temporary vectorial force */
1667             tx               = _mm256_mul_pd(fscal,dx32);
1668             ty               = _mm256_mul_pd(fscal,dy32);
1669             tz               = _mm256_mul_pd(fscal,dz32);
1670
1671             /* Update vectorial force */
1672             fix3             = _mm256_add_pd(fix3,tx);
1673             fiy3             = _mm256_add_pd(fiy3,ty);
1674             fiz3             = _mm256_add_pd(fiz3,tz);
1675
1676             fjx2             = _mm256_add_pd(fjx2,tx);
1677             fjy2             = _mm256_add_pd(fjy2,ty);
1678             fjz2             = _mm256_add_pd(fjz2,tz);
1679
1680             }
1681
1682             /**************************
1683              * CALCULATE INTERACTIONS *
1684              **************************/
1685
1686             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1687             {
1688
1689             r33              = _mm256_mul_pd(rsq33,rinv33);
1690             r33              = _mm256_andnot_pd(dummy_mask,r33);
1691
1692             /* EWALD ELECTROSTATICS */
1693
1694             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1695             ewrt             = _mm256_mul_pd(r33,ewtabscale);
1696             ewitab           = _mm256_cvttpd_epi32(ewrt);
1697             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1698             ewitab           = _mm_slli_epi32(ewitab,2);
1699             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1700             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1701             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1702             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1703             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1704             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1705             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1706             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1707             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1708
1709             d                = _mm256_sub_pd(r33,rswitch);
1710             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1711             d2               = _mm256_mul_pd(d,d);
1712             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1713
1714             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1715
1716             /* Evaluate switch function */
1717             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1718             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
1719             velec            = _mm256_mul_pd(velec,sw);
1720             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1721
1722             /* Update potential sum for this i atom from the interaction with this j atom. */
1723             velec            = _mm256_and_pd(velec,cutoff_mask);
1724             velec            = _mm256_andnot_pd(dummy_mask,velec);
1725             velecsum         = _mm256_add_pd(velecsum,velec);
1726
1727             fscal            = felec;
1728
1729             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1730
1731             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1732
1733             /* Calculate temporary vectorial force */
1734             tx               = _mm256_mul_pd(fscal,dx33);
1735             ty               = _mm256_mul_pd(fscal,dy33);
1736             tz               = _mm256_mul_pd(fscal,dz33);
1737
1738             /* Update vectorial force */
1739             fix3             = _mm256_add_pd(fix3,tx);
1740             fiy3             = _mm256_add_pd(fiy3,ty);
1741             fiz3             = _mm256_add_pd(fiz3,tz);
1742
1743             fjx3             = _mm256_add_pd(fjx3,tx);
1744             fjy3             = _mm256_add_pd(fjy3,ty);
1745             fjz3             = _mm256_add_pd(fjz3,tz);
1746
1747             }
1748
1749             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1750             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1751             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1752             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1753
1754             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1755                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1756                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
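            /* Newton's third law: the accumulated j forces are subtracted from f here.
             * Lanes that correspond to padding (negative jnr) were redirected to the local
             * scratch buffer above, so their already masked-out contributions never touch
             * real atoms.
             */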
1757
1758             /* Inner loop uses 657 flops */
1759         }
1760
1761         /* End of innermost loop */
1762
1763         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1764                                                  f+i_coord_offset,fshift+i_shift_offset);
1765
1766         ggid                        = gid[iidx];
1767         /* Update potential energies */
1768         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1769         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1770
1771         /* Increment number of inner iterations */
1772         inneriter                  += j_index_end - j_index_start;
1773
1774         /* Outer loop uses 26 flops */
1775     }
1776
1777     /* Increment number of outer iterations */
1778     outeriter        += nri;
1779
1780     /* Update outer/inner flops */
1781
1782     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*657);
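    /* The 26/657 flop figures are estimates emitted by the kernel generator; they are
     * only used for mdrun's performance accounting via inc_nrnb() and do not affect the
     * computed energies or forces.
     */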
1783 }
1784 /*
1785  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_double
1786  * Electrostatics interaction: Ewald
1787  * VdW interaction:            LennardJones
1788  * Geometry:                   Water4-Water4
1789  * Calculate force/pot:        Force
1790  */
1791 void
1792 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_double
1793                     (t_nblist                    * gmx_restrict       nlist,
1794                      rvec                        * gmx_restrict          xx,
1795                      rvec                        * gmx_restrict          ff,
1796                      struct t_forcerec           * gmx_restrict          fr,
1797                      t_mdatoms                   * gmx_restrict     mdatoms,
1798                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1799                      t_nrnb                      * gmx_restrict        nrnb)
1800 {
1801     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1802      * just 0 for non-waters.
1803      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1804      * jnr indices corresponding to data put in the four positions in the SIMD register.
1805      */
1806     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1807     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1808     int              jnrA,jnrB,jnrC,jnrD;
1809     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1810     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1811     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1812     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1813     real             rcutoff_scalar;
1814     real             *shiftvec,*fshift,*x,*f;
1815     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1816     real             scratch[4*DIM];
1817     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1818     real *           vdwioffsetptr0;
1819     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1820     real *           vdwioffsetptr1;
1821     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1822     real *           vdwioffsetptr2;
1823     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1824     real *           vdwioffsetptr3;
1825     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1826     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1827     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1828     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1829     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1830     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1831     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1832     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1833     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1834     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1835     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1836     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1837     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1838     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1839     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1840     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1841     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1842     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1843     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1844     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1845     real             *charge;
1846     int              nvdwtype;
1847     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1848     int              *vdwtype;
1849     real             *vdwparam;
1850     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1851     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1852     __m128i          ewitab;
1853     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1854     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1855     real             *ewtab;
1856     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1857     real             rswitch_scalar,d_scalar;
1858     __m256d          dummy_mask,cutoff_mask;
1859     __m128           tmpmask0,tmpmask1;
1860     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1861     __m256d          one     = _mm256_set1_pd(1.0);
1862     __m256d          two     = _mm256_set1_pd(2.0);
1863     x                = xx[0];
1864     f                = ff[0];
1865
1866     nri              = nlist->nri;
1867     iinr             = nlist->iinr;
1868     jindex           = nlist->jindex;
1869     jjnr             = nlist->jjnr;
1870     shiftidx         = nlist->shift;
1871     gid              = nlist->gid;
1872     shiftvec         = fr->shift_vec[0];
1873     fshift           = fr->fshift[0];
1874     facel            = _mm256_set1_pd(fr->ic->epsfac);
1875     charge           = mdatoms->chargeA;
1876     nvdwtype         = fr->ntype;
1877     vdwparam         = fr->nbfp;
1878     vdwtype          = mdatoms->typeA;
1879
1880     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
1881     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1882     beta2            = _mm256_mul_pd(beta,beta);
1883     beta3            = _mm256_mul_pd(beta,beta2);
1884
1885     ewtab            = fr->ic->tabq_coul_FDV0;
1886     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
1887     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1888
1889     /* Setup water-specific parameters */
1890     inr              = nlist->iinr[0];
1891     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1892     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1893     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1894     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1895
1896     jq1              = _mm256_set1_pd(charge[inr+1]);
1897     jq2              = _mm256_set1_pd(charge[inr+2]);
1898     jq3              = _mm256_set1_pd(charge[inr+3]);
1899     vdwjidx0A        = 2*vdwtype[inr+0];
1900     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1901     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1902     qq11             = _mm256_mul_pd(iq1,jq1);
1903     qq12             = _mm256_mul_pd(iq1,jq2);
1904     qq13             = _mm256_mul_pd(iq1,jq3);
1905     qq21             = _mm256_mul_pd(iq2,jq1);
1906     qq22             = _mm256_mul_pd(iq2,jq2);
1907     qq23             = _mm256_mul_pd(iq2,jq3);
1908     qq31             = _mm256_mul_pd(iq3,jq1);
1909     qq32             = _mm256_mul_pd(iq3,jq2);
1910     qq33             = _mm256_mul_pd(iq3,jq3);
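    /* With Water4-Water4 geometry every i and every j molecule is the same four-site
     * water (e.g. a TIP4P-like model), so the charges, their nine pair products and the
     * single 0-0 Lennard-Jones parameter pair can be broadcast once here instead of
     * being reloaded in the inner loop; site 0 carries only LJ and sites 1-3 carry only
     * charge, which is why the loops below evaluate LJ solely for the 0-0 pair.
     */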
1911
1912     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1913     rcutoff_scalar   = fr->ic->rcoulomb;
1914     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1915     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1916
1917     rswitch_scalar   = fr->ic->rcoulomb_switch;
1918     rswitch          = _mm256_set1_pd(rswitch_scalar);
1919     /* Setup switch parameters */
1920     d_scalar         = rcutoff_scalar-rswitch_scalar;
1921     d                = _mm256_set1_pd(d_scalar);
1922     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1923     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1924     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1925     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1926     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1927     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
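    /* These are the coefficients of the fifth-order switch used in the interaction
     * blocks below: with d = max(r - rswitch, 0) and D = rcutoff - rswitch,
     * S(r) = 1 + swV3*d^3 + swV4*d^4 + swV5*d^5 and dS/dr = swF2*d^2 + swF3*d^3 + swF4*d^4.
     */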
1928
1929     /* Initialize to avoid spurious compiler warnings about possibly uninitialized variables */
1930     jnrA = jnrB = jnrC = jnrD = 0;
1931     j_coord_offsetA = 0;
1932     j_coord_offsetB = 0;
1933     j_coord_offsetC = 0;
1934     j_coord_offsetD = 0;
1935
1936     outeriter        = 0;
1937     inneriter        = 0;
1938
1939     for(iidx=0;iidx<4*DIM;iidx++)
1940     {
1941         scratch[iidx] = 0.0;
1942     }
1943
1944     /* Start outer loop over neighborlists */
1945     for(iidx=0; iidx<nri; iidx++)
1946     {
1947         /* Load shift vector for this list */
1948         i_shift_offset   = DIM*shiftidx[iidx];
1949
1950         /* Load limits for loop over neighbors */
1951         j_index_start    = jindex[iidx];
1952         j_index_end      = jindex[iidx+1];
1953
1954         /* Get outer coordinate index */
1955         inr              = iinr[iidx];
1956         i_coord_offset   = DIM*inr;
1957
1958         /* Load i particle coords and add shift vector */
1959         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1960                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1961
1962         fix0             = _mm256_setzero_pd();
1963         fiy0             = _mm256_setzero_pd();
1964         fiz0             = _mm256_setzero_pd();
1965         fix1             = _mm256_setzero_pd();
1966         fiy1             = _mm256_setzero_pd();
1967         fiz1             = _mm256_setzero_pd();
1968         fix2             = _mm256_setzero_pd();
1969         fiy2             = _mm256_setzero_pd();
1970         fiz2             = _mm256_setzero_pd();
1971         fix3             = _mm256_setzero_pd();
1972         fiy3             = _mm256_setzero_pd();
1973         fiz3             = _mm256_setzero_pd();
1974
1975         /* Start inner kernel loop */
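        /* This loop requires a full quartet of valid j entries (jjnr[jidx+3]>=0); the
         * trailing, partially filled quartet - padded with negative indices - is handled
         * afterwards by a masked loop of the kind used in the kernel above.
         */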
1976         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1977         {
1978
1979             /* Get j neighbor index, and coordinate index */
1980             jnrA             = jjnr[jidx];
1981             jnrB             = jjnr[jidx+1];
1982             jnrC             = jjnr[jidx+2];
1983             jnrD             = jjnr[jidx+3];
1984             j_coord_offsetA  = DIM*jnrA;
1985             j_coord_offsetB  = DIM*jnrB;
1986             j_coord_offsetC  = DIM*jnrC;
1987             j_coord_offsetD  = DIM*jnrD;
1988
1989             /* load j atom coordinates */
1990             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1991                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1992                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1993                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1994
1995             /* Calculate displacement vector */
1996             dx00             = _mm256_sub_pd(ix0,jx0);
1997             dy00             = _mm256_sub_pd(iy0,jy0);
1998             dz00             = _mm256_sub_pd(iz0,jz0);
1999             dx11             = _mm256_sub_pd(ix1,jx1);
2000             dy11             = _mm256_sub_pd(iy1,jy1);
2001             dz11             = _mm256_sub_pd(iz1,jz1);
2002             dx12             = _mm256_sub_pd(ix1,jx2);
2003             dy12             = _mm256_sub_pd(iy1,jy2);
2004             dz12             = _mm256_sub_pd(iz1,jz2);
2005             dx13             = _mm256_sub_pd(ix1,jx3);
2006             dy13             = _mm256_sub_pd(iy1,jy3);
2007             dz13             = _mm256_sub_pd(iz1,jz3);
2008             dx21             = _mm256_sub_pd(ix2,jx1);
2009             dy21             = _mm256_sub_pd(iy2,jy1);
2010             dz21             = _mm256_sub_pd(iz2,jz1);
2011             dx22             = _mm256_sub_pd(ix2,jx2);
2012             dy22             = _mm256_sub_pd(iy2,jy2);
2013             dz22             = _mm256_sub_pd(iz2,jz2);
2014             dx23             = _mm256_sub_pd(ix2,jx3);
2015             dy23             = _mm256_sub_pd(iy2,jy3);
2016             dz23             = _mm256_sub_pd(iz2,jz3);
2017             dx31             = _mm256_sub_pd(ix3,jx1);
2018             dy31             = _mm256_sub_pd(iy3,jy1);
2019             dz31             = _mm256_sub_pd(iz3,jz1);
2020             dx32             = _mm256_sub_pd(ix3,jx2);
2021             dy32             = _mm256_sub_pd(iy3,jy2);
2022             dz32             = _mm256_sub_pd(iz3,jz2);
2023             dx33             = _mm256_sub_pd(ix3,jx3);
2024             dy33             = _mm256_sub_pd(iy3,jy3);
2025             dz33             = _mm256_sub_pd(iz3,jz3);
2026
2027             /* Calculate squared distance and things based on it */
2028             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2029             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2030             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2031             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2032             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2033             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2034             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2035             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2036             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2037             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2038
2039             rinv00           = avx256_invsqrt_d(rsq00);
2040             rinv11           = avx256_invsqrt_d(rsq11);
2041             rinv12           = avx256_invsqrt_d(rsq12);
2042             rinv13           = avx256_invsqrt_d(rsq13);
2043             rinv21           = avx256_invsqrt_d(rsq21);
2044             rinv22           = avx256_invsqrt_d(rsq22);
2045             rinv23           = avx256_invsqrt_d(rsq23);
2046             rinv31           = avx256_invsqrt_d(rsq31);
2047             rinv32           = avx256_invsqrt_d(rsq32);
2048             rinv33           = avx256_invsqrt_d(rsq33);
2049
2050             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
2051             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
2052             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
2053             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
2054             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
2055             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
2056             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
2057             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
2058             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
2059             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
2060
2061             fjx0             = _mm256_setzero_pd();
2062             fjy0             = _mm256_setzero_pd();
2063             fjz0             = _mm256_setzero_pd();
2064             fjx1             = _mm256_setzero_pd();
2065             fjy1             = _mm256_setzero_pd();
2066             fjz1             = _mm256_setzero_pd();
2067             fjx2             = _mm256_setzero_pd();
2068             fjy2             = _mm256_setzero_pd();
2069             fjz2             = _mm256_setzero_pd();
2070             fjx3             = _mm256_setzero_pd();
2071             fjy3             = _mm256_setzero_pd();
2072             fjz3             = _mm256_setzero_pd();
2073
2074             /**************************
2075              * CALCULATE INTERACTIONS *
2076              **************************/
2077
2078             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2079             {
2080
2081             r00              = _mm256_mul_pd(rsq00,rinv00);
2082
2083             /* LENNARD-JONES DISPERSION/REPULSION */
2084
2085             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2086             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
2087             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2088             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2089             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
2090
2091             d                = _mm256_sub_pd(r00,rswitch);
2092             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2093             d2               = _mm256_mul_pd(d,d);
2094             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2095
2096             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2097
2098             /* Evaluate switch function */
2099             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2100             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
2101             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2102
2103             fscal            = fvdw;
2104
2105             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2106
2107             /* Calculate temporary vectorial force */
2108             tx               = _mm256_mul_pd(fscal,dx00);
2109             ty               = _mm256_mul_pd(fscal,dy00);
2110             tz               = _mm256_mul_pd(fscal,dz00);
2111
2112             /* Update vectorial force */
2113             fix0             = _mm256_add_pd(fix0,tx);
2114             fiy0             = _mm256_add_pd(fiy0,ty);
2115             fiz0             = _mm256_add_pd(fiz0,tz);
2116
2117             fjx0             = _mm256_add_pd(fjx0,tx);
2118             fjy0             = _mm256_add_pd(fjy0,ty);
2119             fjz0             = _mm256_add_pd(fjz0,tz);
2120
2121             }
2122
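                 /*
                  * Per-lane scalar transcription of the switched Lennard-Jones block above
                  * (same variable names as the vector code; only the SIMD packaging differs):
                  *
                  *   rinvsix = rinvsq00*rinvsq00*rinvsq00;
                  *   vvdw6   = c6_00*rinvsix;
                  *   vvdw12  = c12_00*rinvsix*rinvsix;
                  *   vvdw    = vvdw12*(1.0/12.0) - vvdw6*(1.0/6.0);
                  *   fvdw    = (vvdw12 - vvdw6)*rinvsq00;
                  *   d       = fmax(r00 - rswitch, 0.0);
                  *   d2      = d*d;
                  *   sw      = 1.0 + d2*d*(swV3 + d*(swV4 + d*swV5));
                  *   dsw     = d2*(swF2 + d*(swF3 + d*swF4));
                  *   fvdw    = fvdw*sw - rinv00*vvdw*dsw;          // see the fscal' comment
                  *   fscal   = (rsq00 < rcutoff2) ? fvdw : 0.0;    // cutoff_mask
                  */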
2123             /**************************
2124              * CALCULATE INTERACTIONS *
2125              **************************/
2126
2127             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2128             {
2129
2130             r11              = _mm256_mul_pd(rsq11,rinv11);
2131
2132             /* EWALD ELECTROSTATICS */
2133
2134             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2135             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2136             ewitab           = _mm256_cvttpd_epi32(ewrt);
2137             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2138             ewitab           = _mm_slli_epi32(ewitab,2);
2139             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2140             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2141             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2142             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2143             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2144             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2145             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2146             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2147             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2148
2149             d                = _mm256_sub_pd(r11,rswitch);
2150             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2151             d2               = _mm256_mul_pd(d,d);
2152             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2153
2154             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2155
2156             /* Evaluate switch function */
2157             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2158             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2159             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2160
2161             fscal            = felec;
2162
2163             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2164
2165             /* Calculate temporary vectorial force */
2166             tx               = _mm256_mul_pd(fscal,dx11);
2167             ty               = _mm256_mul_pd(fscal,dy11);
2168             tz               = _mm256_mul_pd(fscal,dz11);
2169
2170             /* Update vectorial force */
2171             fix1             = _mm256_add_pd(fix1,tx);
2172             fiy1             = _mm256_add_pd(fiy1,ty);
2173             fiz1             = _mm256_add_pd(fiz1,tz);
2174
2175             fjx1             = _mm256_add_pd(fjx1,tx);
2176             fjy1             = _mm256_add_pd(fjy1,ty);
2177             fjz1             = _mm256_add_pd(fjz1,tz);
2178
2179             }
2180
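                 /*
                  * Per-lane scalar transcription of the tabulated Ewald block above. Each table
                  * point stores four consecutive doubles; the four gathered loads are transposed
                  * so that ewtabF/ewtabD/ewtabV hold one component per lane:
                  *
                  *   ewrt  = r11*ewtabscale;
                  *   idx   = 4*(int)ewrt;                 // truncation, then the stride of 4
                  *   eweps = ewrt - floor(ewrt);          // fraction for linear interpolation
                  *   F = ewtab[idx]; D = ewtab[idx+1]; V = ewtab[idx+2];
                  *   felec = F + eweps*D;
                  *   velec = qq11*(rinv11 - (V - ewtabhalfspace*eweps*(F + felec)));
                  *   felec = qq11*rinv11*(rinvsq11 - felec);
                  *   // followed by the same switch-function scaling as in the LJ block
                  */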
2181             /**************************
2182              * CALCULATE INTERACTIONS *
2183              **************************/
2184
2185             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2186             {
2187
2188             r12              = _mm256_mul_pd(rsq12,rinv12);
2189
2190             /* EWALD ELECTROSTATICS */
2191
2192             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2193             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2194             ewitab           = _mm256_cvttpd_epi32(ewrt);
2195             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2196             ewitab           = _mm_slli_epi32(ewitab,2);
2197             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2198             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2199             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2200             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2201             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2202             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2203             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2204             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2205             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2206
2207             d                = _mm256_sub_pd(r12,rswitch);
2208             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2209             d2               = _mm256_mul_pd(d,d);
2210             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2211
2212             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2213
2214             /* Evaluate switch function */
2215             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2216             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2217             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2218
2219             fscal            = felec;
2220
2221             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2222
2223             /* Calculate temporary vectorial force */
2224             tx               = _mm256_mul_pd(fscal,dx12);
2225             ty               = _mm256_mul_pd(fscal,dy12);
2226             tz               = _mm256_mul_pd(fscal,dz12);
2227
2228             /* Update vectorial force */
2229             fix1             = _mm256_add_pd(fix1,tx);
2230             fiy1             = _mm256_add_pd(fiy1,ty);
2231             fiz1             = _mm256_add_pd(fiz1,tz);
2232
2233             fjx2             = _mm256_add_pd(fjx2,tx);
2234             fjy2             = _mm256_add_pd(fjy2,ty);
2235             fjz2             = _mm256_add_pd(fjz2,tz);
2236
2237             }
2238
2239             /**************************
2240              * CALCULATE INTERACTIONS *
2241              **************************/
2242
2243             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2244             {
2245
2246             r13              = _mm256_mul_pd(rsq13,rinv13);
2247
2248             /* EWALD ELECTROSTATICS */
2249
2250             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2251             ewrt             = _mm256_mul_pd(r13,ewtabscale);
2252             ewitab           = _mm256_cvttpd_epi32(ewrt);
2253             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2254             ewitab           = _mm_slli_epi32(ewitab,2);
2255             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2256             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2257             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2258             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2259             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2260             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2261             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2262             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
2263             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2264
2265             d                = _mm256_sub_pd(r13,rswitch);
2266             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2267             d2               = _mm256_mul_pd(d,d);
2268             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2269
2270             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2271
2272             /* Evaluate switch function */
2273             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2274             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
2275             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2276
2277             fscal            = felec;
2278
2279             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2280
2281             /* Calculate temporary vectorial force */
2282             tx               = _mm256_mul_pd(fscal,dx13);
2283             ty               = _mm256_mul_pd(fscal,dy13);
2284             tz               = _mm256_mul_pd(fscal,dz13);
2285
2286             /* Update vectorial force */
2287             fix1             = _mm256_add_pd(fix1,tx);
2288             fiy1             = _mm256_add_pd(fiy1,ty);
2289             fiz1             = _mm256_add_pd(fiz1,tz);
2290
2291             fjx3             = _mm256_add_pd(fjx3,tx);
2292             fjy3             = _mm256_add_pd(fjy3,ty);
2293             fjz3             = _mm256_add_pd(fjz3,tz);
2294
2295             }
2296
2297             /**************************
2298              * CALCULATE INTERACTIONS *
2299              **************************/
2300
2301             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2302             {
2303
2304             r21              = _mm256_mul_pd(rsq21,rinv21);
2305
2306             /* EWALD ELECTROSTATICS */
2307
2308             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2309             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2310             ewitab           = _mm256_cvttpd_epi32(ewrt);
2311             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2312             ewitab           = _mm_slli_epi32(ewitab,2);
2313             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2314             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2315             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2316             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2317             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2318             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2319             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2320             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
2321             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2322
2323             d                = _mm256_sub_pd(r21,rswitch);
2324             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2325             d2               = _mm256_mul_pd(d,d);
2326             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2327
2328             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2329
2330             /* Evaluate switch function */
2331             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2332             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
2333             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2334
2335             fscal            = felec;
2336
2337             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2338
2339             /* Calculate temporary vectorial force */
2340             tx               = _mm256_mul_pd(fscal,dx21);
2341             ty               = _mm256_mul_pd(fscal,dy21);
2342             tz               = _mm256_mul_pd(fscal,dz21);
2343
2344             /* Update vectorial force */
2345             fix2             = _mm256_add_pd(fix2,tx);
2346             fiy2             = _mm256_add_pd(fiy2,ty);
2347             fiz2             = _mm256_add_pd(fiz2,tz);
2348
2349             fjx1             = _mm256_add_pd(fjx1,tx);
2350             fjy1             = _mm256_add_pd(fjy1,ty);
2351             fjz1             = _mm256_add_pd(fjz1,tz);
2352
2353             }
2354
2355             /**************************
2356              * CALCULATE INTERACTIONS *
2357              **************************/
2358
2359             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2360             {
2361
2362             r22              = _mm256_mul_pd(rsq22,rinv22);
2363
2364             /* EWALD ELECTROSTATICS */
2365
2366             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2367             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2368             ewitab           = _mm256_cvttpd_epi32(ewrt);
2369             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2370             ewitab           = _mm_slli_epi32(ewitab,2);
2371             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2372             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2373             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2374             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2375             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2376             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2377             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2378             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
2379             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2380
2381             d                = _mm256_sub_pd(r22,rswitch);
2382             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2383             d2               = _mm256_mul_pd(d,d);
2384             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2385
2386             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2387
2388             /* Evaluate switch function */
2389             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2390             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
2391             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2392
2393             fscal            = felec;
2394
2395             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2396
2397             /* Calculate temporary vectorial force */
2398             tx               = _mm256_mul_pd(fscal,dx22);
2399             ty               = _mm256_mul_pd(fscal,dy22);
2400             tz               = _mm256_mul_pd(fscal,dz22);
2401
2402             /* Update vectorial force */
2403             fix2             = _mm256_add_pd(fix2,tx);
2404             fiy2             = _mm256_add_pd(fiy2,ty);
2405             fiz2             = _mm256_add_pd(fiz2,tz);
2406
2407             fjx2             = _mm256_add_pd(fjx2,tx);
2408             fjy2             = _mm256_add_pd(fjy2,ty);
2409             fjz2             = _mm256_add_pd(fjz2,tz);
2410
2411             }
2412
2413             /**************************
2414              * CALCULATE INTERACTIONS *
2415              **************************/
2416
2417             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2418             {
2419
2420             r23              = _mm256_mul_pd(rsq23,rinv23);
2421
2422             /* EWALD ELECTROSTATICS */
2423
2424             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2425             ewrt             = _mm256_mul_pd(r23,ewtabscale);
2426             ewitab           = _mm256_cvttpd_epi32(ewrt);
2427             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2428             ewitab           = _mm_slli_epi32(ewitab,2);
2429             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2430             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2431             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2432             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2433             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2434             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2435             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2436             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
2437             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2438
2439             d                = _mm256_sub_pd(r23,rswitch);
2440             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2441             d2               = _mm256_mul_pd(d,d);
2442             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2443
2444             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2445
2446             /* Evaluate switch function */
2447             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2448             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
2449             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2450
2451             fscal            = felec;
2452
2453             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2454
2455             /* Calculate temporary vectorial force */
2456             tx               = _mm256_mul_pd(fscal,dx23);
2457             ty               = _mm256_mul_pd(fscal,dy23);
2458             tz               = _mm256_mul_pd(fscal,dz23);
2459
2460             /* Update vectorial force */
2461             fix2             = _mm256_add_pd(fix2,tx);
2462             fiy2             = _mm256_add_pd(fiy2,ty);
2463             fiz2             = _mm256_add_pd(fiz2,tz);
2464
2465             fjx3             = _mm256_add_pd(fjx3,tx);
2466             fjy3             = _mm256_add_pd(fjy3,ty);
2467             fjz3             = _mm256_add_pd(fjz3,tz);
2468
2469             }
2470
2471             /**************************
2472              * CALCULATE INTERACTIONS *
2473              **************************/
2474
2475             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2476             {
2477
2478             r31              = _mm256_mul_pd(rsq31,rinv31);
2479
2480             /* EWALD ELECTROSTATICS */
2481
2482             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2483             ewrt             = _mm256_mul_pd(r31,ewtabscale);
2484             ewitab           = _mm256_cvttpd_epi32(ewrt);
2485             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2486             ewitab           = _mm_slli_epi32(ewitab,2);
2487             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2488             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2489             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2490             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2491             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2492             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2493             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2494             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
2495             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2496
2497             d                = _mm256_sub_pd(r31,rswitch);
2498             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2499             d2               = _mm256_mul_pd(d,d);
2500             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2501
2502             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2503
2504             /* Evaluate switch function */
2505             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2506             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
2507             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2508
2509             fscal            = felec;
2510
2511             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2512
2513             /* Calculate temporary vectorial force */
2514             tx               = _mm256_mul_pd(fscal,dx31);
2515             ty               = _mm256_mul_pd(fscal,dy31);
2516             tz               = _mm256_mul_pd(fscal,dz31);
2517
2518             /* Update vectorial force */
2519             fix3             = _mm256_add_pd(fix3,tx);
2520             fiy3             = _mm256_add_pd(fiy3,ty);
2521             fiz3             = _mm256_add_pd(fiz3,tz);
2522
2523             fjx1             = _mm256_add_pd(fjx1,tx);
2524             fjy1             = _mm256_add_pd(fjy1,ty);
2525             fjz1             = _mm256_add_pd(fjz1,tz);
2526
2527             }
2528
2529             /**************************
2530              * CALCULATE INTERACTIONS *
2531              **************************/
2532
2533             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2534             {
2535
2536             r32              = _mm256_mul_pd(rsq32,rinv32);
2537
2538             /* EWALD ELECTROSTATICS */
2539
2540             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2541             ewrt             = _mm256_mul_pd(r32,ewtabscale);
2542             ewitab           = _mm256_cvttpd_epi32(ewrt);
2543             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2544             ewitab           = _mm_slli_epi32(ewitab,2);
2545             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2546             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2547             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2548             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2549             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2550             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2551             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2552             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
2553             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2554
2555             d                = _mm256_sub_pd(r32,rswitch);
2556             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2557             d2               = _mm256_mul_pd(d,d);
2558             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2559
2560             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2561
2562             /* Evaluate switch function */
2563             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2564             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
2565             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2566
2567             fscal            = felec;
2568
2569             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2570
2571             /* Calculate temporary vectorial force */
2572             tx               = _mm256_mul_pd(fscal,dx32);
2573             ty               = _mm256_mul_pd(fscal,dy32);
2574             tz               = _mm256_mul_pd(fscal,dz32);
2575
2576             /* Update vectorial force */
2577             fix3             = _mm256_add_pd(fix3,tx);
2578             fiy3             = _mm256_add_pd(fiy3,ty);
2579             fiz3             = _mm256_add_pd(fiz3,tz);
2580
2581             fjx2             = _mm256_add_pd(fjx2,tx);
2582             fjy2             = _mm256_add_pd(fjy2,ty);
2583             fjz2             = _mm256_add_pd(fjz2,tz);
2584
2585             }
2586
2587             /**************************
2588              * CALCULATE INTERACTIONS *
2589              **************************/
2590
2591             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2592             {
2593
2594             r33              = _mm256_mul_pd(rsq33,rinv33);
2595
2596             /* EWALD ELECTROSTATICS */
2597
2598             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2599             ewrt             = _mm256_mul_pd(r33,ewtabscale);
2600             ewitab           = _mm256_cvttpd_epi32(ewrt);
2601             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2602             ewitab           = _mm_slli_epi32(ewitab,2);
2603             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2604             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2605             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2606             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2607             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2608             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2609             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2610             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
2611             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2612
2613             d                = _mm256_sub_pd(r33,rswitch);
2614             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2615             d2               = _mm256_mul_pd(d,d);
2616             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2617
2618             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2619
2620             /* Evaluate switch function */
2621             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2622             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
2623             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2624
2625             fscal            = felec;
2626
2627             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2628
2629             /* Calculate temporary vectorial force */
2630             tx               = _mm256_mul_pd(fscal,dx33);
2631             ty               = _mm256_mul_pd(fscal,dy33);
2632             tz               = _mm256_mul_pd(fscal,dz33);
2633
2634             /* Update vectorial force */
2635             fix3             = _mm256_add_pd(fix3,tx);
2636             fiy3             = _mm256_add_pd(fiy3,ty);
2637             fiz3             = _mm256_add_pd(fiz3,tz);
2638
2639             fjx3             = _mm256_add_pd(fjx3,tx);
2640             fjy3             = _mm256_add_pd(fjy3,ty);
2641             fjz3             = _mm256_add_pd(fjz3,tz);
2642
2643             }
2644
2645             fjptrA             = f+j_coord_offsetA;
2646             fjptrB             = f+j_coord_offsetB;
2647             fjptrC             = f+j_coord_offsetC;
2648             fjptrD             = f+j_coord_offsetD;
2649
2650             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2651                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2652                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
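                 /*
                  * The fj* accumulators collect the same tx/ty/tz terms that were added to the
                  * i forces; since dx = xi - xj, the force on the j atom is the negative of that
                  * sum, which is why the swizzle helper transposes the accumulators back to rvec
                  * layout and decrements. Per j atom and lane this amounts to:
                  *
                  *   f[jnr][XX] -= fjx;  f[jnr][YY] -= fjy;  f[jnr][ZZ] -= fjz;
                  */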
2653
2654             /* Inner loop uses 617 flops */
2655         }
2656
2657         if(jidx<j_index_end)
2658         {
2659
2660             /* Get the j neighbor indices and coordinate offsets */
2661             jnrlistA         = jjnr[jidx];
2662             jnrlistB         = jjnr[jidx+1];
2663             jnrlistC         = jjnr[jidx+2];
2664             jnrlistD         = jjnr[jidx+3];
2665             /* Sign of each element will be negative for non-real atoms.
2666              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2667              * so it can be used as val = _mm256_andnot_pd(dummy_mask,val) to clear dummy entries.
2668              */
2669             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2670
2671             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2672             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2673             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2674
2675             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2676             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2677             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2678             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2679             j_coord_offsetA  = DIM*jnrA;
2680             j_coord_offsetB  = DIM*jnrB;
2681             j_coord_offsetC  = DIM*jnrC;
2682             j_coord_offsetD  = DIM*jnrD;
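                 /*
                  * Per-lane sketch of the padding logic above: a negative jjnr entry marks an
                  * unused lane in this last, partially filled chunk, so
                  *
                  *   dummy = (jnrlist < 0);           // expanded to an all-ones double-lane mask
                  *   jnr   = dummy ? 0 : jnrlist;     // load some valid coordinate instead
                  *   // ... later: fscal = dummy ? 0.0 : fscal   (_mm256_andnot_pd)
                  */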
2683
2684             /* load j atom coordinates */
2685             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2686                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2687                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2688                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
2689
2690             /* Calculate displacement vector */
2691             dx00             = _mm256_sub_pd(ix0,jx0);
2692             dy00             = _mm256_sub_pd(iy0,jy0);
2693             dz00             = _mm256_sub_pd(iz0,jz0);
2694             dx11             = _mm256_sub_pd(ix1,jx1);
2695             dy11             = _mm256_sub_pd(iy1,jy1);
2696             dz11             = _mm256_sub_pd(iz1,jz1);
2697             dx12             = _mm256_sub_pd(ix1,jx2);
2698             dy12             = _mm256_sub_pd(iy1,jy2);
2699             dz12             = _mm256_sub_pd(iz1,jz2);
2700             dx13             = _mm256_sub_pd(ix1,jx3);
2701             dy13             = _mm256_sub_pd(iy1,jy3);
2702             dz13             = _mm256_sub_pd(iz1,jz3);
2703             dx21             = _mm256_sub_pd(ix2,jx1);
2704             dy21             = _mm256_sub_pd(iy2,jy1);
2705             dz21             = _mm256_sub_pd(iz2,jz1);
2706             dx22             = _mm256_sub_pd(ix2,jx2);
2707             dy22             = _mm256_sub_pd(iy2,jy2);
2708             dz22             = _mm256_sub_pd(iz2,jz2);
2709             dx23             = _mm256_sub_pd(ix2,jx3);
2710             dy23             = _mm256_sub_pd(iy2,jy3);
2711             dz23             = _mm256_sub_pd(iz2,jz3);
2712             dx31             = _mm256_sub_pd(ix3,jx1);
2713             dy31             = _mm256_sub_pd(iy3,jy1);
2714             dz31             = _mm256_sub_pd(iz3,jz1);
2715             dx32             = _mm256_sub_pd(ix3,jx2);
2716             dy32             = _mm256_sub_pd(iy3,jy2);
2717             dz32             = _mm256_sub_pd(iz3,jz2);
2718             dx33             = _mm256_sub_pd(ix3,jx3);
2719             dy33             = _mm256_sub_pd(iy3,jy3);
2720             dz33             = _mm256_sub_pd(iz3,jz3);
2721
2722             /* Calculate squared distances and the quantities derived from them */
2723             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2724             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2725             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2726             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2727             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2728             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2729             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2730             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2731             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2732             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2733
2734             rinv00           = avx256_invsqrt_d(rsq00);
2735             rinv11           = avx256_invsqrt_d(rsq11);
2736             rinv12           = avx256_invsqrt_d(rsq12);
2737             rinv13           = avx256_invsqrt_d(rsq13);
2738             rinv21           = avx256_invsqrt_d(rsq21);
2739             rinv22           = avx256_invsqrt_d(rsq22);
2740             rinv23           = avx256_invsqrt_d(rsq23);
2741             rinv31           = avx256_invsqrt_d(rsq31);
2742             rinv32           = avx256_invsqrt_d(rsq32);
2743             rinv33           = avx256_invsqrt_d(rsq33);
2744
2745             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
2746             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
2747             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
2748             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
2749             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
2750             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
2751             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
2752             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
2753             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
2754             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
2755
2756             fjx0             = _mm256_setzero_pd();
2757             fjy0             = _mm256_setzero_pd();
2758             fjz0             = _mm256_setzero_pd();
2759             fjx1             = _mm256_setzero_pd();
2760             fjy1             = _mm256_setzero_pd();
2761             fjz1             = _mm256_setzero_pd();
2762             fjx2             = _mm256_setzero_pd();
2763             fjy2             = _mm256_setzero_pd();
2764             fjz2             = _mm256_setzero_pd();
2765             fjx3             = _mm256_setzero_pd();
2766             fjy3             = _mm256_setzero_pd();
2767             fjz3             = _mm256_setzero_pd();
2768
2769             /**************************
2770              * CALCULATE INTERACTIONS *
2771              **************************/
2772
2773             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2774             {
2775
2776             r00              = _mm256_mul_pd(rsq00,rinv00);
2777             r00              = _mm256_andnot_pd(dummy_mask,r00);
2778
2779             /* LENNARD-JONES DISPERSION/REPULSION */
2780
2781             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2782             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
2783             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2784             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2785             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
2786
2787             d                = _mm256_sub_pd(r00,rswitch);
2788             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2789             d2               = _mm256_mul_pd(d,d);
2790             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2791
2792             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2793
2794             /* Evaluate switch function */
2795             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2796             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
2797             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2798
2799             fscal            = fvdw;
2800
2801             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2802
2803             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2804
2805             /* Calculate temporary vectorial force */
2806             tx               = _mm256_mul_pd(fscal,dx00);
2807             ty               = _mm256_mul_pd(fscal,dy00);
2808             tz               = _mm256_mul_pd(fscal,dz00);
2809
2810             /* Update vectorial force */
2811             fix0             = _mm256_add_pd(fix0,tx);
2812             fiy0             = _mm256_add_pd(fiy0,ty);
2813             fiz0             = _mm256_add_pd(fiz0,tz);
2814
2815             fjx0             = _mm256_add_pd(fjx0,tx);
2816             fjy0             = _mm256_add_pd(fjy0,ty);
2817             fjz0             = _mm256_add_pd(fjz0,tz);
2818
2819             }
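                 /* Note that in this masked epilogue the pair distance itself is also cleared for
                  * padded lanes (r00 = _mm256_andnot_pd(dummy_mask,r00) above), so the switch
                  * distance, and in the Ewald blocks below the table index, are computed from a
                  * harmless zero rather than from whatever coordinates atom 0 happens to contain.
                  */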
2820
2821             /**************************
2822              * CALCULATE INTERACTIONS *
2823              **************************/
2824
2825             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2826             {
2827
2828             r11              = _mm256_mul_pd(rsq11,rinv11);
2829             r11              = _mm256_andnot_pd(dummy_mask,r11);
2830
2831             /* EWALD ELECTROSTATICS */
2832
2833             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2834             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2835             ewitab           = _mm256_cvttpd_epi32(ewrt);
2836             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2837             ewitab           = _mm_slli_epi32(ewitab,2);
2838             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2839             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2840             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2841             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2842             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2843             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2844             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2845             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2846             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2847
2848             d                = _mm256_sub_pd(r11,rswitch);
2849             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2850             d2               = _mm256_mul_pd(d,d);
2851             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2852
2853             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2854
2855             /* Evaluate switch function */
2856             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2857             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2858             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2859
2860             fscal            = felec;
2861
2862             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2863
2864             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2865
2866             /* Calculate temporary vectorial force */
2867             tx               = _mm256_mul_pd(fscal,dx11);
2868             ty               = _mm256_mul_pd(fscal,dy11);
2869             tz               = _mm256_mul_pd(fscal,dz11);
2870
2871             /* Update vectorial force */
2872             fix1             = _mm256_add_pd(fix1,tx);
2873             fiy1             = _mm256_add_pd(fiy1,ty);
2874             fiz1             = _mm256_add_pd(fiz1,tz);
2875
2876             fjx1             = _mm256_add_pd(fjx1,tx);
2877             fjy1             = _mm256_add_pd(fjy1,ty);
2878             fjz1             = _mm256_add_pd(fjz1,tz);
2879
2880             }
2881
2882             /**************************
2883              * CALCULATE INTERACTIONS *
2884              **************************/
2885
2886             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2887             {
2888
2889             r12              = _mm256_mul_pd(rsq12,rinv12);
2890             r12              = _mm256_andnot_pd(dummy_mask,r12);
2891
2892             /* EWALD ELECTROSTATICS */
2893
2894             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2895             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2896             ewitab           = _mm256_cvttpd_epi32(ewrt);
2897             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2898             ewitab           = _mm_slli_epi32(ewitab,2);
2899             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2900             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2901             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2902             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2903             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2904             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2905             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2906             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2907             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2908
2909             d                = _mm256_sub_pd(r12,rswitch);
2910             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2911             d2               = _mm256_mul_pd(d,d);
2912             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2913
2914             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2915
2916             /* Evaluate switch function */
2917             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2918             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2919             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2920
2921             fscal            = felec;
2922
2923             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2924
2925             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2926
2927             /* Calculate temporary vectorial force */
2928             tx               = _mm256_mul_pd(fscal,dx12);
2929             ty               = _mm256_mul_pd(fscal,dy12);
2930             tz               = _mm256_mul_pd(fscal,dz12);
2931
2932             /* Update vectorial force */
2933             fix1             = _mm256_add_pd(fix1,tx);
2934             fiy1             = _mm256_add_pd(fiy1,ty);
2935             fiz1             = _mm256_add_pd(fiz1,tz);
2936
2937             fjx2             = _mm256_add_pd(fjx2,tx);
2938             fjy2             = _mm256_add_pd(fjy2,ty);
2939             fjz2             = _mm256_add_pd(fjz2,tz);
2940
2941             }
2942
2943             /**************************
2944              * CALCULATE INTERACTIONS *
2945              **************************/
2946
2947             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2948             {
2949
2950             r13              = _mm256_mul_pd(rsq13,rinv13);
2951             r13              = _mm256_andnot_pd(dummy_mask,r13);
2952
2953             /* EWALD ELECTROSTATICS */
2954
2955             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2956             ewrt             = _mm256_mul_pd(r13,ewtabscale);
2957             ewitab           = _mm256_cvttpd_epi32(ewrt);
2958             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2959             ewitab           = _mm_slli_epi32(ewitab,2);
2960             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2961             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2962             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2963             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2964             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2965             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2966             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2967             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
2968             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2969
2970             d                = _mm256_sub_pd(r13,rswitch);
2971             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2972             d2               = _mm256_mul_pd(d,d);
2973             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2974
2975             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2976
2977             /* Evaluate switch function */
2978             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2979             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
2980             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2981
2982             fscal            = felec;
2983
2984             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2985
2986             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2987
2988             /* Calculate temporary vectorial force */
2989             tx               = _mm256_mul_pd(fscal,dx13);
2990             ty               = _mm256_mul_pd(fscal,dy13);
2991             tz               = _mm256_mul_pd(fscal,dz13);
2992
2993             /* Update vectorial force */
2994             fix1             = _mm256_add_pd(fix1,tx);
2995             fiy1             = _mm256_add_pd(fiy1,ty);
2996             fiz1             = _mm256_add_pd(fiz1,tz);
2997
2998             fjx3             = _mm256_add_pd(fjx3,tx);
2999             fjy3             = _mm256_add_pd(fjy3,ty);
3000             fjz3             = _mm256_add_pd(fjz3,tz);
3001
3002             }
3003
3004             /**************************
3005              * CALCULATE INTERACTIONS *
3006              **************************/
3007
3008             if (gmx_mm256_any_lt(rsq21,rcutoff2))
3009             {
3010
3011             r21              = _mm256_mul_pd(rsq21,rinv21);
3012             r21              = _mm256_andnot_pd(dummy_mask,r21);
3013
3014             /* EWALD ELECTROSTATICS */
3015
3016             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
3017             ewrt             = _mm256_mul_pd(r21,ewtabscale);
3018             ewitab           = _mm256_cvttpd_epi32(ewrt);
3019             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3020             ewitab           = _mm_slli_epi32(ewitab,2);
3021             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3022             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3023             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3024             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3025             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3026             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3027             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3028             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
3029             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
3030
3031             d                = _mm256_sub_pd(r21,rswitch);
3032             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3033             d2               = _mm256_mul_pd(d,d);
3034             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3035
3036             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3037
3038             /* Evaluate switch function */
3039             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3040             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
3041             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
3042
3043             fscal            = felec;
3044
3045             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3046
3047             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3048
3049             /* Calculate temporary vectorial force */
3050             tx               = _mm256_mul_pd(fscal,dx21);
3051             ty               = _mm256_mul_pd(fscal,dy21);
3052             tz               = _mm256_mul_pd(fscal,dz21);
3053
3054             /* Update vectorial force */
3055             fix2             = _mm256_add_pd(fix2,tx);
3056             fiy2             = _mm256_add_pd(fiy2,ty);
3057             fiz2             = _mm256_add_pd(fiz2,tz);
3058
3059             fjx1             = _mm256_add_pd(fjx1,tx);
3060             fjy1             = _mm256_add_pd(fjy1,ty);
3061             fjz1             = _mm256_add_pd(fjz1,tz);
3062
3063             }
3064
3065             /**************************
3066              * CALCULATE INTERACTIONS *
3067              **************************/
3068
3069             if (gmx_mm256_any_lt(rsq22,rcutoff2))
3070             {
3071
3072             r22              = _mm256_mul_pd(rsq22,rinv22);
3073             r22              = _mm256_andnot_pd(dummy_mask,r22);
3074
3075             /* EWALD ELECTROSTATICS */
3076
3077             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
3078             ewrt             = _mm256_mul_pd(r22,ewtabscale);
3079             ewitab           = _mm256_cvttpd_epi32(ewrt);
3080             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3081             ewitab           = _mm_slli_epi32(ewitab,2);
3082             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3083             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3084             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3085             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3086             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3087             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3088             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3089             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
3090             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
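                 /* Per lane this is the usual table interpolation of the Ewald
                  * correction: eweps is the fractional position within the table bin,
                  *   F(r) = ewtabF + eweps*ewtabD                         (interpolated force)
                  *   V(r) = ewtabV - ewtabhalfspace*eweps*(ewtabF + F(r))
                  * where ewtabhalfspace is presumably half the table spacing, making
                  * the second line a trapezoid-rule integral of the force over the
                  * fractional bin. The pair contributions then become
                  *   velec = qq22*(1/r - V(r)),   felec = qq22*(1/r)*(1/r^2 - F(r)). */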
3091
3092             d                = _mm256_sub_pd(r22,rswitch);
3093             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3094             d2               = _mm256_mul_pd(d,d);
3095             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3096
3097             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
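                 /* Potential-switch polynomial in d = r - rswitch (clamped at zero):
                  *   sw(d)  = 1 + swV3*d^3 + swV4*d^4 + swV5*d^5
                  *   dsw(d) = swF2*d^2 + swF3*d^3 + swF4*d^4 = d(sw)/dr
                  * sw multiplies the potential and dsw enters the force correction
                  * applied just below. */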
3098
3099             /* Evaluate switch function */
3100             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3101             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
3102             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
3103
3104             fscal            = felec;
3105
3106             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3107
3108             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3109
3110             /* Calculate temporary vectorial force */
3111             tx               = _mm256_mul_pd(fscal,dx22);
3112             ty               = _mm256_mul_pd(fscal,dy22);
3113             tz               = _mm256_mul_pd(fscal,dz22);
3114
3115             /* Update vectorial force */
3116             fix2             = _mm256_add_pd(fix2,tx);
3117             fiy2             = _mm256_add_pd(fiy2,ty);
3118             fiz2             = _mm256_add_pd(fiz2,tz);
3119
3120             fjx2             = _mm256_add_pd(fjx2,tx);
3121             fjy2             = _mm256_add_pd(fjy2,ty);
3122             fjz2             = _mm256_add_pd(fjz2,tz);
3123
3124             }
3125
3126             /**************************
3127              * CALCULATE INTERACTIONS *
3128              **************************/
3129
3130             if (gmx_mm256_any_lt(rsq23,rcutoff2))
3131             {
3132
3133             r23              = _mm256_mul_pd(rsq23,rinv23);
3134             r23              = _mm256_andnot_pd(dummy_mask,r23);
3135
3136             /* EWALD ELECTROSTATICS */
3137
3138             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3139             ewrt             = _mm256_mul_pd(r23,ewtabscale);
3140             ewitab           = _mm256_cvttpd_epi32(ewrt);
3141             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3142             ewitab           = _mm_slli_epi32(ewitab,2);
3143             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3144             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3145             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3146             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3147             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3148             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3149             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3150             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
3151             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
3152
3153             d                = _mm256_sub_pd(r23,rswitch);
3154             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3155             d2               = _mm256_mul_pd(d,d);
3156             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3157
3158             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3159
3160             /* Evaluate switch function */
3161             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3162             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
3163             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
3164
3165             fscal            = felec;
3166
3167             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3168
3169             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3170
3171             /* Calculate temporary vectorial force */
3172             tx               = _mm256_mul_pd(fscal,dx23);
3173             ty               = _mm256_mul_pd(fscal,dy23);
3174             tz               = _mm256_mul_pd(fscal,dz23);
3175
3176             /* Update vectorial force */
3177             fix2             = _mm256_add_pd(fix2,tx);
3178             fiy2             = _mm256_add_pd(fiy2,ty);
3179             fiz2             = _mm256_add_pd(fiz2,tz);
3180
3181             fjx3             = _mm256_add_pd(fjx3,tx);
3182             fjy3             = _mm256_add_pd(fjy3,ty);
3183             fjz3             = _mm256_add_pd(fjz3,tz);
3184
3185             }
3186
3187             /**************************
3188              * CALCULATE INTERACTIONS *
3189              **************************/
3190
3191             if (gmx_mm256_any_lt(rsq31,rcutoff2))
3192             {
3193
3194             r31              = _mm256_mul_pd(rsq31,rinv31);
3195             r31              = _mm256_andnot_pd(dummy_mask,r31);
3196
3197             /* EWALD ELECTROSTATICS */
3198
3199             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3200             ewrt             = _mm256_mul_pd(r31,ewtabscale);
3201             ewitab           = _mm256_cvttpd_epi32(ewrt);
3202             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3203             ewitab           = _mm_slli_epi32(ewitab,2);
3204             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3205             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3206             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3207             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3208             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3209             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3210             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3211             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
3212             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
3213
3214             d                = _mm256_sub_pd(r31,rswitch);
3215             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3216             d2               = _mm256_mul_pd(d,d);
3217             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3218
3219             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3220
3221             /* Evaluate switch function */
3222             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3223             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
3224             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
3225
3226             fscal            = felec;
3227
3228             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3229
3230             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3231
3232             /* Calculate temporary vectorial force */
3233             tx               = _mm256_mul_pd(fscal,dx31);
3234             ty               = _mm256_mul_pd(fscal,dy31);
3235             tz               = _mm256_mul_pd(fscal,dz31);
3236
3237             /* Update vectorial force */
3238             fix3             = _mm256_add_pd(fix3,tx);
3239             fiy3             = _mm256_add_pd(fiy3,ty);
3240             fiz3             = _mm256_add_pd(fiz3,tz);
3241
3242             fjx1             = _mm256_add_pd(fjx1,tx);
3243             fjy1             = _mm256_add_pd(fjy1,ty);
3244             fjz1             = _mm256_add_pd(fjz1,tz);
3245
3246             }
3247
3248             /**************************
3249              * CALCULATE INTERACTIONS *
3250              **************************/
3251
3252             if (gmx_mm256_any_lt(rsq32,rcutoff2))
3253             {
3254
3255             r32              = _mm256_mul_pd(rsq32,rinv32);
3256             r32              = _mm256_andnot_pd(dummy_mask,r32);
3257
3258             /* EWALD ELECTROSTATICS */
3259
3260             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3261             ewrt             = _mm256_mul_pd(r32,ewtabscale);
3262             ewitab           = _mm256_cvttpd_epi32(ewrt);
3263             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3264             ewitab           = _mm_slli_epi32(ewitab,2);
3265             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3266             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3267             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3268             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3269             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3270             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3271             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3272             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
3273             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
3274
3275             d                = _mm256_sub_pd(r32,rswitch);
3276             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3277             d2               = _mm256_mul_pd(d,d);
3278             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3279
3280             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3281
3282             /* Evaluate switch function */
3283             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3284             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
3285             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
3286
3287             fscal            = felec;
3288
3289             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3290
3291             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3292
3293             /* Calculate temporary vectorial force */
3294             tx               = _mm256_mul_pd(fscal,dx32);
3295             ty               = _mm256_mul_pd(fscal,dy32);
3296             tz               = _mm256_mul_pd(fscal,dz32);
3297
3298             /* Update vectorial force */
3299             fix3             = _mm256_add_pd(fix3,tx);
3300             fiy3             = _mm256_add_pd(fiy3,ty);
3301             fiz3             = _mm256_add_pd(fiz3,tz);
3302
3303             fjx2             = _mm256_add_pd(fjx2,tx);
3304             fjy2             = _mm256_add_pd(fjy2,ty);
3305             fjz2             = _mm256_add_pd(fjz2,tz);
3306
3307             }
3308
3309             /**************************
3310              * CALCULATE INTERACTIONS *
3311              **************************/
3312
3313             if (gmx_mm256_any_lt(rsq33,rcutoff2))
3314             {
3315
3316             r33              = _mm256_mul_pd(rsq33,rinv33);
3317             r33              = _mm256_andnot_pd(dummy_mask,r33);
3318
3319             /* EWALD ELECTROSTATICS */
3320
3321             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3322             ewrt             = _mm256_mul_pd(r33,ewtabscale);
3323             ewitab           = _mm256_cvttpd_epi32(ewrt);
3324             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3325             ewitab           = _mm_slli_epi32(ewitab,2);
3326             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3327             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3328             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3329             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3330             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3331             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3332             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3333             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
3334             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
3335
3336             d                = _mm256_sub_pd(r33,rswitch);
3337             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3338             d2               = _mm256_mul_pd(d,d);
3339             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3340
3341             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3342
3343             /* Evaluate switch function */
3344             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3345             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
3346             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
3347
3348             fscal            = felec;
3349
3350             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3351
3352             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3353
3354             /* Calculate temporary vectorial force */
3355             tx               = _mm256_mul_pd(fscal,dx33);
3356             ty               = _mm256_mul_pd(fscal,dy33);
3357             tz               = _mm256_mul_pd(fscal,dz33);
3358
3359             /* Update vectorial force */
3360             fix3             = _mm256_add_pd(fix3,tx);
3361             fiy3             = _mm256_add_pd(fiy3,ty);
3362             fiz3             = _mm256_add_pd(fiz3,tz);
3363
3364             fjx3             = _mm256_add_pd(fjx3,tx);
3365             fjy3             = _mm256_add_pd(fjy3,ty);
3366             fjz3             = _mm256_add_pd(fjz3,tz);
3367
3368             }
3369
3370             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3371             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3372             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3373             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
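             /* Padding entries in the neighbor list carry a negative jnr; their force
              * output is redirected to a local scratch buffer so the masked lanes
              * never touch real atom data. */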
3374
3375             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
3376                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3377                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
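             /* Transpose the accumulated j-water forces from SIMD-lane layout back to
              * per-atom rvec layout and subtract them from the four j force pointers
              * in a single call. */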
3378
3379             /* Inner loop uses 627 flops */
3380         }
3381
3382         /* End of innermost loop */
3383
3384         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3385                                                  f+i_coord_offset,fshift+i_shift_offset);
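         /* Reduce the i-water force accumulators across the four SIMD lanes, add them
          * to the force array at i_coord_offset, and accumulate the matching shift
          * force (used later for the virial). */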
3386
3387         /* Increment number of inner iterations */
3388         inneriter                  += j_index_end - j_index_start;
3389
3390         /* Outer loop uses 24 flops */
3391     }
3392
3393     /* Increment number of outer iterations */
3394     outeriter        += nri;
3395
3396     /* Update outer/inner flops */
3397
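     /* Hand the operation counts to the accounting module: the per-iteration flop
      * estimates (24 per outer and 627 per inner iteration) match the comments above
      * and only feed the performance summary, not the physics. */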
3398     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*627);
3399 }