Remove no-inline-max-size and suppress remark
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_double
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_double
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
81     real             rcutoff_scalar;
82     real             *shiftvec,*fshift,*x,*f;
83     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
84     real             scratch[4*DIM];
85     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86     real *           vdwioffsetptr0;
87     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88     real *           vdwioffsetptr1;
89     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92     real *           vdwioffsetptr3;
93     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
95     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
97     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
99     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
101     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
102     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
103     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
104     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
105     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
106     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
107     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
108     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
109     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
110     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
111     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
112     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
113     real             *charge;
114     int              nvdwtype;
115     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
116     int              *vdwtype;
117     real             *vdwparam;
118     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
119     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
120     __m128i          ewitab;
121     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
122     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
123     real             *ewtab;
124     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
125     real             rswitch_scalar,d_scalar;
126     __m256d          dummy_mask,cutoff_mask;
127     __m128           tmpmask0,tmpmask1;
128     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
129     __m256d          one     = _mm256_set1_pd(1.0);
130     __m256d          two     = _mm256_set1_pd(2.0);
131     x                = xx[0];
132     f                = ff[0];
133
134     nri              = nlist->nri;
135     iinr             = nlist->iinr;
136     jindex           = nlist->jindex;
137     jjnr             = nlist->jjnr;
138     shiftidx         = nlist->shift;
139     gid              = nlist->gid;
140     shiftvec         = fr->shift_vec[0];
141     fshift           = fr->fshift[0];
142     facel            = _mm256_set1_pd(fr->epsfac);
143     charge           = mdatoms->chargeA;
144     nvdwtype         = fr->ntype;
145     vdwparam         = fr->nbfp;
146     vdwtype          = mdatoms->typeA;
147
148     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
149     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
150     beta2            = _mm256_mul_pd(beta,beta);
151     beta3            = _mm256_mul_pd(beta,beta2);
152
153     ewtab            = fr->ic->tabq_coul_FDV0;
154     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
155     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
156
157     /* Setup water-specific parameters */
158     inr              = nlist->iinr[0];
159     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
160     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
161     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
162     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
163
164     jq1              = _mm256_set1_pd(charge[inr+1]);
165     jq2              = _mm256_set1_pd(charge[inr+2]);
166     jq3              = _mm256_set1_pd(charge[inr+3]);
167     vdwjidx0A        = 2*vdwtype[inr+0];
168     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
169     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
170     qq11             = _mm256_mul_pd(iq1,jq1);
171     qq12             = _mm256_mul_pd(iq1,jq2);
172     qq13             = _mm256_mul_pd(iq1,jq3);
173     qq21             = _mm256_mul_pd(iq2,jq1);
174     qq22             = _mm256_mul_pd(iq2,jq2);
175     qq23             = _mm256_mul_pd(iq2,jq3);
176     qq31             = _mm256_mul_pd(iq3,jq1);
177     qq32             = _mm256_mul_pd(iq3,jq2);
178     qq33             = _mm256_mul_pd(iq3,jq3);
179
180     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
181     rcutoff_scalar   = fr->rcoulomb;
182     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
183     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
184
185     rswitch_scalar   = fr->rcoulomb_switch;
186     rswitch          = _mm256_set1_pd(rswitch_scalar);
187     /* Setup switch parameters */
188     d_scalar         = rcutoff_scalar-rswitch_scalar;
189     d                = _mm256_set1_pd(d_scalar);
190     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
191     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
192     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
193     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
194     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
195     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
196
197     /* Avoid stupid compiler warnings */
198     jnrA = jnrB = jnrC = jnrD = 0;
199     j_coord_offsetA = 0;
200     j_coord_offsetB = 0;
201     j_coord_offsetC = 0;
202     j_coord_offsetD = 0;
203
204     outeriter        = 0;
205     inneriter        = 0;
206
207     for(iidx=0;iidx<4*DIM;iidx++)
208     {
209         scratch[iidx] = 0.0;
210     }
211
212     /* Start outer loop over neighborlists */
213     for(iidx=0; iidx<nri; iidx++)
214     {
215         /* Load shift vector for this list */
216         i_shift_offset   = DIM*shiftidx[iidx];
217
218         /* Load limits for loop over neighbors */
219         j_index_start    = jindex[iidx];
220         j_index_end      = jindex[iidx+1];
221
222         /* Get outer coordinate index */
223         inr              = iinr[iidx];
224         i_coord_offset   = DIM*inr;
225
226         /* Load i particle coords and add shift vector */
227         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
228                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
229
230         fix0             = _mm256_setzero_pd();
231         fiy0             = _mm256_setzero_pd();
232         fiz0             = _mm256_setzero_pd();
233         fix1             = _mm256_setzero_pd();
234         fiy1             = _mm256_setzero_pd();
235         fiz1             = _mm256_setzero_pd();
236         fix2             = _mm256_setzero_pd();
237         fiy2             = _mm256_setzero_pd();
238         fiz2             = _mm256_setzero_pd();
239         fix3             = _mm256_setzero_pd();
240         fiy3             = _mm256_setzero_pd();
241         fiz3             = _mm256_setzero_pd();
242
243         /* Reset potential sums */
244         velecsum         = _mm256_setzero_pd();
245         vvdwsum          = _mm256_setzero_pd();
246
247         /* Start inner kernel loop */
248         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
249         {
250
251             /* Get j neighbor index, and coordinate index */
252             jnrA             = jjnr[jidx];
253             jnrB             = jjnr[jidx+1];
254             jnrC             = jjnr[jidx+2];
255             jnrD             = jjnr[jidx+3];
256             j_coord_offsetA  = DIM*jnrA;
257             j_coord_offsetB  = DIM*jnrB;
258             j_coord_offsetC  = DIM*jnrC;
259             j_coord_offsetD  = DIM*jnrD;
260
261             /* load j atom coordinates */
262             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
263                                                  x+j_coord_offsetC,x+j_coord_offsetD,
264                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
265                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
266
267             /* Calculate displacement vector */
268             dx00             = _mm256_sub_pd(ix0,jx0);
269             dy00             = _mm256_sub_pd(iy0,jy0);
270             dz00             = _mm256_sub_pd(iz0,jz0);
271             dx11             = _mm256_sub_pd(ix1,jx1);
272             dy11             = _mm256_sub_pd(iy1,jy1);
273             dz11             = _mm256_sub_pd(iz1,jz1);
274             dx12             = _mm256_sub_pd(ix1,jx2);
275             dy12             = _mm256_sub_pd(iy1,jy2);
276             dz12             = _mm256_sub_pd(iz1,jz2);
277             dx13             = _mm256_sub_pd(ix1,jx3);
278             dy13             = _mm256_sub_pd(iy1,jy3);
279             dz13             = _mm256_sub_pd(iz1,jz3);
280             dx21             = _mm256_sub_pd(ix2,jx1);
281             dy21             = _mm256_sub_pd(iy2,jy1);
282             dz21             = _mm256_sub_pd(iz2,jz1);
283             dx22             = _mm256_sub_pd(ix2,jx2);
284             dy22             = _mm256_sub_pd(iy2,jy2);
285             dz22             = _mm256_sub_pd(iz2,jz2);
286             dx23             = _mm256_sub_pd(ix2,jx3);
287             dy23             = _mm256_sub_pd(iy2,jy3);
288             dz23             = _mm256_sub_pd(iz2,jz3);
289             dx31             = _mm256_sub_pd(ix3,jx1);
290             dy31             = _mm256_sub_pd(iy3,jy1);
291             dz31             = _mm256_sub_pd(iz3,jz1);
292             dx32             = _mm256_sub_pd(ix3,jx2);
293             dy32             = _mm256_sub_pd(iy3,jy2);
294             dz32             = _mm256_sub_pd(iz3,jz2);
295             dx33             = _mm256_sub_pd(ix3,jx3);
296             dy33             = _mm256_sub_pd(iy3,jy3);
297             dz33             = _mm256_sub_pd(iz3,jz3);
298
299             /* Calculate squared distance and things based on it */
300             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
301             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
302             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
303             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
304             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
305             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
306             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
307             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
308             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
309             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
310
311             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
312             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
313             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
314             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
315             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
316             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
317             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
318             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
319             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
320             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
321
322             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
323             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
324             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
325             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
326             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
327             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
328             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
329             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
330             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
331             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
332
333             fjx0             = _mm256_setzero_pd();
334             fjy0             = _mm256_setzero_pd();
335             fjz0             = _mm256_setzero_pd();
336             fjx1             = _mm256_setzero_pd();
337             fjy1             = _mm256_setzero_pd();
338             fjz1             = _mm256_setzero_pd();
339             fjx2             = _mm256_setzero_pd();
340             fjy2             = _mm256_setzero_pd();
341             fjz2             = _mm256_setzero_pd();
342             fjx3             = _mm256_setzero_pd();
343             fjy3             = _mm256_setzero_pd();
344             fjz3             = _mm256_setzero_pd();
345
346             /**************************
347              * CALCULATE INTERACTIONS *
348              **************************/
349
350             if (gmx_mm256_any_lt(rsq00,rcutoff2))
351             {
352
353             r00              = _mm256_mul_pd(rsq00,rinv00);
354
355             /* LENNARD-JONES DISPERSION/REPULSION */
356
357             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
358             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
359             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
360             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
361             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
362
363             d                = _mm256_sub_pd(r00,rswitch);
364             d                = _mm256_max_pd(d,_mm256_setzero_pd());
365             d2               = _mm256_mul_pd(d,d);
366             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
367
368             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
369
370             /* Evaluate switch function */
371             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
372             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
373             vvdw             = _mm256_mul_pd(vvdw,sw);
374             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
375
376             /* Update potential sum for this i atom from the interaction with this j atom. */
377             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
378             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
379
380             fscal            = fvdw;
381
382             fscal            = _mm256_and_pd(fscal,cutoff_mask);
383
384             /* Calculate temporary vectorial force */
385             tx               = _mm256_mul_pd(fscal,dx00);
386             ty               = _mm256_mul_pd(fscal,dy00);
387             tz               = _mm256_mul_pd(fscal,dz00);
388
389             /* Update vectorial force */
390             fix0             = _mm256_add_pd(fix0,tx);
391             fiy0             = _mm256_add_pd(fiy0,ty);
392             fiz0             = _mm256_add_pd(fiz0,tz);
393
394             fjx0             = _mm256_add_pd(fjx0,tx);
395             fjy0             = _mm256_add_pd(fjy0,ty);
396             fjz0             = _mm256_add_pd(fjz0,tz);
397
398             }
399
400             /**************************
401              * CALCULATE INTERACTIONS *
402              **************************/
403
404             if (gmx_mm256_any_lt(rsq11,rcutoff2))
405             {
406
407             r11              = _mm256_mul_pd(rsq11,rinv11);
408
409             /* EWALD ELECTROSTATICS */
410
411             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
412             ewrt             = _mm256_mul_pd(r11,ewtabscale);
413             ewitab           = _mm256_cvttpd_epi32(ewrt);
414             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
415             ewitab           = _mm_slli_epi32(ewitab,2);
416             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
417             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
418             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
419             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
420             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
421             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
422             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
423             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
424             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
425
426             d                = _mm256_sub_pd(r11,rswitch);
427             d                = _mm256_max_pd(d,_mm256_setzero_pd());
428             d2               = _mm256_mul_pd(d,d);
429             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
430
431             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
432
433             /* Evaluate switch function */
434             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
435             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
436             velec            = _mm256_mul_pd(velec,sw);
437             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
438
439             /* Update potential sum for this i atom from the interaction with this j atom. */
440             velec            = _mm256_and_pd(velec,cutoff_mask);
441             velecsum         = _mm256_add_pd(velecsum,velec);
442
443             fscal            = felec;
444
445             fscal            = _mm256_and_pd(fscal,cutoff_mask);
446
447             /* Calculate temporary vectorial force */
448             tx               = _mm256_mul_pd(fscal,dx11);
449             ty               = _mm256_mul_pd(fscal,dy11);
450             tz               = _mm256_mul_pd(fscal,dz11);
451
452             /* Update vectorial force */
453             fix1             = _mm256_add_pd(fix1,tx);
454             fiy1             = _mm256_add_pd(fiy1,ty);
455             fiz1             = _mm256_add_pd(fiz1,tz);
456
457             fjx1             = _mm256_add_pd(fjx1,tx);
458             fjy1             = _mm256_add_pd(fjy1,ty);
459             fjz1             = _mm256_add_pd(fjz1,tz);
460
461             }
462
463             /**************************
464              * CALCULATE INTERACTIONS *
465              **************************/
466
467             if (gmx_mm256_any_lt(rsq12,rcutoff2))
468             {
469
470             r12              = _mm256_mul_pd(rsq12,rinv12);
471
472             /* EWALD ELECTROSTATICS */
473
474             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
475             ewrt             = _mm256_mul_pd(r12,ewtabscale);
476             ewitab           = _mm256_cvttpd_epi32(ewrt);
477             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
478             ewitab           = _mm_slli_epi32(ewitab,2);
479             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
480             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
481             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
482             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
483             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
484             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
485             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
486             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
487             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
488
489             d                = _mm256_sub_pd(r12,rswitch);
490             d                = _mm256_max_pd(d,_mm256_setzero_pd());
491             d2               = _mm256_mul_pd(d,d);
492             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
493
494             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
495
496             /* Evaluate switch function */
497             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
498             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
499             velec            = _mm256_mul_pd(velec,sw);
500             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
501
502             /* Update potential sum for this i atom from the interaction with this j atom. */
503             velec            = _mm256_and_pd(velec,cutoff_mask);
504             velecsum         = _mm256_add_pd(velecsum,velec);
505
506             fscal            = felec;
507
508             fscal            = _mm256_and_pd(fscal,cutoff_mask);
509
510             /* Calculate temporary vectorial force */
511             tx               = _mm256_mul_pd(fscal,dx12);
512             ty               = _mm256_mul_pd(fscal,dy12);
513             tz               = _mm256_mul_pd(fscal,dz12);
514
515             /* Update vectorial force */
516             fix1             = _mm256_add_pd(fix1,tx);
517             fiy1             = _mm256_add_pd(fiy1,ty);
518             fiz1             = _mm256_add_pd(fiz1,tz);
519
520             fjx2             = _mm256_add_pd(fjx2,tx);
521             fjy2             = _mm256_add_pd(fjy2,ty);
522             fjz2             = _mm256_add_pd(fjz2,tz);
523
524             }
525
526             /**************************
527              * CALCULATE INTERACTIONS *
528              **************************/
529
530             if (gmx_mm256_any_lt(rsq13,rcutoff2))
531             {
532
533             r13              = _mm256_mul_pd(rsq13,rinv13);
534
535             /* EWALD ELECTROSTATICS */
536
537             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
538             ewrt             = _mm256_mul_pd(r13,ewtabscale);
539             ewitab           = _mm256_cvttpd_epi32(ewrt);
540             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
541             ewitab           = _mm_slli_epi32(ewitab,2);
542             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
543             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
544             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
545             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
546             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
547             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
548             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
549             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
550             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
551
552             d                = _mm256_sub_pd(r13,rswitch);
553             d                = _mm256_max_pd(d,_mm256_setzero_pd());
554             d2               = _mm256_mul_pd(d,d);
555             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
556
557             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
558
559             /* Evaluate switch function */
560             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
561             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
562             velec            = _mm256_mul_pd(velec,sw);
563             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
564
565             /* Update potential sum for this i atom from the interaction with this j atom. */
566             velec            = _mm256_and_pd(velec,cutoff_mask);
567             velecsum         = _mm256_add_pd(velecsum,velec);
568
569             fscal            = felec;
570
571             fscal            = _mm256_and_pd(fscal,cutoff_mask);
572
573             /* Calculate temporary vectorial force */
574             tx               = _mm256_mul_pd(fscal,dx13);
575             ty               = _mm256_mul_pd(fscal,dy13);
576             tz               = _mm256_mul_pd(fscal,dz13);
577
578             /* Update vectorial force */
579             fix1             = _mm256_add_pd(fix1,tx);
580             fiy1             = _mm256_add_pd(fiy1,ty);
581             fiz1             = _mm256_add_pd(fiz1,tz);
582
583             fjx3             = _mm256_add_pd(fjx3,tx);
584             fjy3             = _mm256_add_pd(fjy3,ty);
585             fjz3             = _mm256_add_pd(fjz3,tz);
586
587             }
588
589             /**************************
590              * CALCULATE INTERACTIONS *
591              **************************/
592
593             if (gmx_mm256_any_lt(rsq21,rcutoff2))
594             {
595
596             r21              = _mm256_mul_pd(rsq21,rinv21);
597
598             /* EWALD ELECTROSTATICS */
599
600             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
601             ewrt             = _mm256_mul_pd(r21,ewtabscale);
602             ewitab           = _mm256_cvttpd_epi32(ewrt);
603             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
604             ewitab           = _mm_slli_epi32(ewitab,2);
605             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
606             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
607             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
608             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
609             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
610             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
611             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
612             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
613             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
614
615             d                = _mm256_sub_pd(r21,rswitch);
616             d                = _mm256_max_pd(d,_mm256_setzero_pd());
617             d2               = _mm256_mul_pd(d,d);
618             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
619
620             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
621
622             /* Evaluate switch function */
623             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
624             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
625             velec            = _mm256_mul_pd(velec,sw);
626             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
627
628             /* Update potential sum for this i atom from the interaction with this j atom. */
629             velec            = _mm256_and_pd(velec,cutoff_mask);
630             velecsum         = _mm256_add_pd(velecsum,velec);
631
632             fscal            = felec;
633
634             fscal            = _mm256_and_pd(fscal,cutoff_mask);
635
636             /* Calculate temporary vectorial force */
637             tx               = _mm256_mul_pd(fscal,dx21);
638             ty               = _mm256_mul_pd(fscal,dy21);
639             tz               = _mm256_mul_pd(fscal,dz21);
640
641             /* Update vectorial force */
642             fix2             = _mm256_add_pd(fix2,tx);
643             fiy2             = _mm256_add_pd(fiy2,ty);
644             fiz2             = _mm256_add_pd(fiz2,tz);
645
646             fjx1             = _mm256_add_pd(fjx1,tx);
647             fjy1             = _mm256_add_pd(fjy1,ty);
648             fjz1             = _mm256_add_pd(fjz1,tz);
649
650             }
651
652             /**************************
653              * CALCULATE INTERACTIONS *
654              **************************/
655
656             if (gmx_mm256_any_lt(rsq22,rcutoff2))
657             {
658
659             r22              = _mm256_mul_pd(rsq22,rinv22);
660
661             /* EWALD ELECTROSTATICS */
662
663             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
664             ewrt             = _mm256_mul_pd(r22,ewtabscale);
665             ewitab           = _mm256_cvttpd_epi32(ewrt);
666             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
667             ewitab           = _mm_slli_epi32(ewitab,2);
668             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
669             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
670             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
671             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
672             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
673             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
674             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
675             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
676             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
677
678             d                = _mm256_sub_pd(r22,rswitch);
679             d                = _mm256_max_pd(d,_mm256_setzero_pd());
680             d2               = _mm256_mul_pd(d,d);
681             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
682
683             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
684
685             /* Evaluate switch function */
686             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
687             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
688             velec            = _mm256_mul_pd(velec,sw);
689             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
690
691             /* Update potential sum for this i atom from the interaction with this j atom. */
692             velec            = _mm256_and_pd(velec,cutoff_mask);
693             velecsum         = _mm256_add_pd(velecsum,velec);
694
695             fscal            = felec;
696
697             fscal            = _mm256_and_pd(fscal,cutoff_mask);
698
699             /* Calculate temporary vectorial force */
700             tx               = _mm256_mul_pd(fscal,dx22);
701             ty               = _mm256_mul_pd(fscal,dy22);
702             tz               = _mm256_mul_pd(fscal,dz22);
703
704             /* Update vectorial force */
705             fix2             = _mm256_add_pd(fix2,tx);
706             fiy2             = _mm256_add_pd(fiy2,ty);
707             fiz2             = _mm256_add_pd(fiz2,tz);
708
709             fjx2             = _mm256_add_pd(fjx2,tx);
710             fjy2             = _mm256_add_pd(fjy2,ty);
711             fjz2             = _mm256_add_pd(fjz2,tz);
712
713             }
714
715             /**************************
716              * CALCULATE INTERACTIONS *
717              **************************/
718
719             if (gmx_mm256_any_lt(rsq23,rcutoff2))
720             {
721
722             r23              = _mm256_mul_pd(rsq23,rinv23);
723
724             /* EWALD ELECTROSTATICS */
725
726             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
727             ewrt             = _mm256_mul_pd(r23,ewtabscale);
728             ewitab           = _mm256_cvttpd_epi32(ewrt);
729             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
730             ewitab           = _mm_slli_epi32(ewitab,2);
731             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
732             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
733             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
734             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
735             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
736             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
737             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
738             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
739             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
740
741             d                = _mm256_sub_pd(r23,rswitch);
742             d                = _mm256_max_pd(d,_mm256_setzero_pd());
743             d2               = _mm256_mul_pd(d,d);
744             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
745
746             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
747
748             /* Evaluate switch function */
749             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
750             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
751             velec            = _mm256_mul_pd(velec,sw);
752             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
753
754             /* Update potential sum for this i atom from the interaction with this j atom. */
755             velec            = _mm256_and_pd(velec,cutoff_mask);
756             velecsum         = _mm256_add_pd(velecsum,velec);
757
758             fscal            = felec;
759
760             fscal            = _mm256_and_pd(fscal,cutoff_mask);
761
762             /* Calculate temporary vectorial force */
763             tx               = _mm256_mul_pd(fscal,dx23);
764             ty               = _mm256_mul_pd(fscal,dy23);
765             tz               = _mm256_mul_pd(fscal,dz23);
766
767             /* Update vectorial force */
768             fix2             = _mm256_add_pd(fix2,tx);
769             fiy2             = _mm256_add_pd(fiy2,ty);
770             fiz2             = _mm256_add_pd(fiz2,tz);
771
772             fjx3             = _mm256_add_pd(fjx3,tx);
773             fjy3             = _mm256_add_pd(fjy3,ty);
774             fjz3             = _mm256_add_pd(fjz3,tz);
775
776             }
777
778             /**************************
779              * CALCULATE INTERACTIONS *
780              **************************/
781
782             if (gmx_mm256_any_lt(rsq31,rcutoff2))
783             {
784
785             r31              = _mm256_mul_pd(rsq31,rinv31);
786
787             /* EWALD ELECTROSTATICS */
788
789             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
790             ewrt             = _mm256_mul_pd(r31,ewtabscale);
791             ewitab           = _mm256_cvttpd_epi32(ewrt);
792             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
793             ewitab           = _mm_slli_epi32(ewitab,2);
794             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
795             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
796             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
797             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
798             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
799             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
800             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
801             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
802             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
803
804             d                = _mm256_sub_pd(r31,rswitch);
805             d                = _mm256_max_pd(d,_mm256_setzero_pd());
806             d2               = _mm256_mul_pd(d,d);
807             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
808
809             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
810
811             /* Evaluate switch function */
812             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
813             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
814             velec            = _mm256_mul_pd(velec,sw);
815             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
816
817             /* Update potential sum for this i atom from the interaction with this j atom. */
818             velec            = _mm256_and_pd(velec,cutoff_mask);
819             velecsum         = _mm256_add_pd(velecsum,velec);
820
821             fscal            = felec;
822
823             fscal            = _mm256_and_pd(fscal,cutoff_mask);
824
825             /* Calculate temporary vectorial force */
826             tx               = _mm256_mul_pd(fscal,dx31);
827             ty               = _mm256_mul_pd(fscal,dy31);
828             tz               = _mm256_mul_pd(fscal,dz31);
829
830             /* Update vectorial force */
831             fix3             = _mm256_add_pd(fix3,tx);
832             fiy3             = _mm256_add_pd(fiy3,ty);
833             fiz3             = _mm256_add_pd(fiz3,tz);
834
835             fjx1             = _mm256_add_pd(fjx1,tx);
836             fjy1             = _mm256_add_pd(fjy1,ty);
837             fjz1             = _mm256_add_pd(fjz1,tz);
838
839             }
840
841             /**************************
842              * CALCULATE INTERACTIONS *
843              **************************/
844
845             if (gmx_mm256_any_lt(rsq32,rcutoff2))
846             {
847
848             r32              = _mm256_mul_pd(rsq32,rinv32);
849
850             /* EWALD ELECTROSTATICS */
851
852             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
853             ewrt             = _mm256_mul_pd(r32,ewtabscale);
854             ewitab           = _mm256_cvttpd_epi32(ewrt);
855             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
856             ewitab           = _mm_slli_epi32(ewitab,2);
857             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
858             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
859             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
860             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
861             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
862             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
863             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
864             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
865             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
866
867             d                = _mm256_sub_pd(r32,rswitch);
868             d                = _mm256_max_pd(d,_mm256_setzero_pd());
869             d2               = _mm256_mul_pd(d,d);
870             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
871
872             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
873
874             /* Evaluate switch function */
875             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
876             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
877             velec            = _mm256_mul_pd(velec,sw);
878             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
879
880             /* Update potential sum for this i atom from the interaction with this j atom. */
881             velec            = _mm256_and_pd(velec,cutoff_mask);
882             velecsum         = _mm256_add_pd(velecsum,velec);
883
884             fscal            = felec;
885
886             fscal            = _mm256_and_pd(fscal,cutoff_mask);
887
888             /* Calculate temporary vectorial force */
889             tx               = _mm256_mul_pd(fscal,dx32);
890             ty               = _mm256_mul_pd(fscal,dy32);
891             tz               = _mm256_mul_pd(fscal,dz32);
892
893             /* Update vectorial force */
894             fix3             = _mm256_add_pd(fix3,tx);
895             fiy3             = _mm256_add_pd(fiy3,ty);
896             fiz3             = _mm256_add_pd(fiz3,tz);
897
898             fjx2             = _mm256_add_pd(fjx2,tx);
899             fjy2             = _mm256_add_pd(fjy2,ty);
900             fjz2             = _mm256_add_pd(fjz2,tz);
901
902             }
903
904             /**************************
905              * CALCULATE INTERACTIONS *
906              **************************/
907
908             if (gmx_mm256_any_lt(rsq33,rcutoff2))
909             {
910
911             r33              = _mm256_mul_pd(rsq33,rinv33);
912
913             /* EWALD ELECTROSTATICS */
914
915             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
916             ewrt             = _mm256_mul_pd(r33,ewtabscale);
917             ewitab           = _mm256_cvttpd_epi32(ewrt);
918             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
919             ewitab           = _mm_slli_epi32(ewitab,2);
920             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
921             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
922             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
923             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
924             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
925             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
926             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
927             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
928             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
929
930             d                = _mm256_sub_pd(r33,rswitch);
931             d                = _mm256_max_pd(d,_mm256_setzero_pd());
932             d2               = _mm256_mul_pd(d,d);
933             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
934
935             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
936
937             /* Evaluate switch function */
938             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
939             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
940             velec            = _mm256_mul_pd(velec,sw);
941             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
942
943             /* Update potential sum for this i atom from the interaction with this j atom. */
944             velec            = _mm256_and_pd(velec,cutoff_mask);
945             velecsum         = _mm256_add_pd(velecsum,velec);
946
947             fscal            = felec;
948
949             fscal            = _mm256_and_pd(fscal,cutoff_mask);
950
951             /* Calculate temporary vectorial force */
952             tx               = _mm256_mul_pd(fscal,dx33);
953             ty               = _mm256_mul_pd(fscal,dy33);
954             tz               = _mm256_mul_pd(fscal,dz33);
955
956             /* Update vectorial force */
957             fix3             = _mm256_add_pd(fix3,tx);
958             fiy3             = _mm256_add_pd(fiy3,ty);
959             fiz3             = _mm256_add_pd(fiz3,tz);
960
961             fjx3             = _mm256_add_pd(fjx3,tx);
962             fjy3             = _mm256_add_pd(fjy3,ty);
963             fjz3             = _mm256_add_pd(fjz3,tz);
964
965             }
966
967             fjptrA             = f+j_coord_offsetA;
968             fjptrB             = f+j_coord_offsetB;
969             fjptrC             = f+j_coord_offsetC;
970             fjptrD             = f+j_coord_offsetD;
971
972             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
973                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
974                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
975
976             /* Inner loop uses 647 flops */
977         }
978
979         if(jidx<j_index_end)
980         {
981
982             /* Get j neighbor index, and coordinate index */
983             jnrlistA         = jjnr[jidx];
984             jnrlistB         = jjnr[jidx+1];
985             jnrlistC         = jjnr[jidx+2];
986             jnrlistD         = jjnr[jidx+3];
987             /* Sign of each element will be negative for non-real atoms.
988              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
989              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
990              */
991             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
992
993             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
994             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
995             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
996
997             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
998             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
999             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1000             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1001             j_coord_offsetA  = DIM*jnrA;
1002             j_coord_offsetB  = DIM*jnrB;
1003             j_coord_offsetC  = DIM*jnrC;
1004             j_coord_offsetD  = DIM*jnrD;
1005
1006             /* load j atom coordinates */
1007             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1008                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1009                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1010                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1011
1012             /* Calculate displacement vector */
1013             dx00             = _mm256_sub_pd(ix0,jx0);
1014             dy00             = _mm256_sub_pd(iy0,jy0);
1015             dz00             = _mm256_sub_pd(iz0,jz0);
1016             dx11             = _mm256_sub_pd(ix1,jx1);
1017             dy11             = _mm256_sub_pd(iy1,jy1);
1018             dz11             = _mm256_sub_pd(iz1,jz1);
1019             dx12             = _mm256_sub_pd(ix1,jx2);
1020             dy12             = _mm256_sub_pd(iy1,jy2);
1021             dz12             = _mm256_sub_pd(iz1,jz2);
1022             dx13             = _mm256_sub_pd(ix1,jx3);
1023             dy13             = _mm256_sub_pd(iy1,jy3);
1024             dz13             = _mm256_sub_pd(iz1,jz3);
1025             dx21             = _mm256_sub_pd(ix2,jx1);
1026             dy21             = _mm256_sub_pd(iy2,jy1);
1027             dz21             = _mm256_sub_pd(iz2,jz1);
1028             dx22             = _mm256_sub_pd(ix2,jx2);
1029             dy22             = _mm256_sub_pd(iy2,jy2);
1030             dz22             = _mm256_sub_pd(iz2,jz2);
1031             dx23             = _mm256_sub_pd(ix2,jx3);
1032             dy23             = _mm256_sub_pd(iy2,jy3);
1033             dz23             = _mm256_sub_pd(iz2,jz3);
1034             dx31             = _mm256_sub_pd(ix3,jx1);
1035             dy31             = _mm256_sub_pd(iy3,jy1);
1036             dz31             = _mm256_sub_pd(iz3,jz1);
1037             dx32             = _mm256_sub_pd(ix3,jx2);
1038             dy32             = _mm256_sub_pd(iy3,jy2);
1039             dz32             = _mm256_sub_pd(iz3,jz2);
1040             dx33             = _mm256_sub_pd(ix3,jx3);
1041             dy33             = _mm256_sub_pd(iy3,jy3);
1042             dz33             = _mm256_sub_pd(iz3,jz3);
1043
1044             /* Calculate squared distance and things based on it */
1045             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1046             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1047             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1048             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1049             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1050             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1051             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1052             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1053             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1054             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1055
1056             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1057             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1058             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1059             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
1060             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1061             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1062             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
1063             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
1064             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
1065             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
1066
1067             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1068             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1069             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1070             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1071             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1072             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1073             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1074             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1075             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1076             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
1077
1078             fjx0             = _mm256_setzero_pd();
1079             fjy0             = _mm256_setzero_pd();
1080             fjz0             = _mm256_setzero_pd();
1081             fjx1             = _mm256_setzero_pd();
1082             fjy1             = _mm256_setzero_pd();
1083             fjz1             = _mm256_setzero_pd();
1084             fjx2             = _mm256_setzero_pd();
1085             fjy2             = _mm256_setzero_pd();
1086             fjz2             = _mm256_setzero_pd();
1087             fjx3             = _mm256_setzero_pd();
1088             fjy3             = _mm256_setzero_pd();
1089             fjz3             = _mm256_setzero_pd();
1090
1091             /**************************
1092              * CALCULATE INTERACTIONS *
1093              **************************/
1094
1095             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1096             {
1097
1098             r00              = _mm256_mul_pd(rsq00,rinv00);
1099             r00              = _mm256_andnot_pd(dummy_mask,r00);
1100
1101             /* LENNARD-JONES DISPERSION/REPULSION */
1102
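                 /* Note: c6_00/c12_00 come from fr->nbfp, where GROMACS stores the LJ parameters
                  * premultiplied by 6 and 12; hence the force below needs no extra factors and
                  * the potential is recovered with the one_sixth/one_twelfth constants.
                  */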
1103             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1104             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
1105             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1106             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1107             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1108
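                 /* Potential-switch: d = max(r - rswitch, 0), so nothing changes inside rswitch.
                  * sw  = S(d)  = 1 + d^3*(swV3 + swV4*d + swV5*d^2)
                  * dsw = dS/dd =     d^2*(swF2 + swF3*d + swF4*d^2)
                  */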
1109             d                = _mm256_sub_pd(r00,rswitch);
1110             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1111             d2               = _mm256_mul_pd(d,d);
1112             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1113
1114             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1115
1116             /* Evaluate switch function */
1117             /* fscal = f/r; with the switch applied f_sw = -(v*sw)' = -(v'*sw + v*dsw), so fscal_sw = fscal*sw - v*dsw/r */
1118             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1119             vvdw             = _mm256_mul_pd(vvdw,sw);
1120             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
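                 /* cutoff_mask is all-ones only in lanes with rsq < rcutoff2; combined with
                  * dummy_mask it zeroes out-of-range and padding contributions before they are
                  * added to the potential and force accumulators.
                  */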
1121
1122             /* Update potential sum for this i atom from the interaction with this j atom. */
1123             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
1124             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
1125             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
1126
1127             fscal            = fvdw;
1128
1129             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1130
1131             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1132
1133             /* Calculate temporary vectorial force */
1134             tx               = _mm256_mul_pd(fscal,dx00);
1135             ty               = _mm256_mul_pd(fscal,dy00);
1136             tz               = _mm256_mul_pd(fscal,dz00);
1137
1138             /* Update vectorial force */
1139             fix0             = _mm256_add_pd(fix0,tx);
1140             fiy0             = _mm256_add_pd(fiy0,ty);
1141             fiz0             = _mm256_add_pd(fiz0,tz);
1142
1143             fjx0             = _mm256_add_pd(fjx0,tx);
1144             fjy0             = _mm256_add_pd(fjy0,ty);
1145             fjz0             = _mm256_add_pd(fjz0,tz);
1146
1147             }
1148
1149             /**************************
1150              * CALCULATE INTERACTIONS *
1151              **************************/
1152
1153             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1154             {
1155
1156             r11              = _mm256_mul_pd(rsq11,rinv11);
1157             r11              = _mm256_andnot_pd(dummy_mask,r11);
1158
1159             /* EWALD ELECTROSTATICS */
1160
1161             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
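                 /* The table uses the FDV0 layout: four doubles per point holding F, D = F(i+1)-F(i),
                  * V and a zero pad, hence the index shift by 2 (multiply by 4). Each lane loads its
                  * own quadruplet, and the 4x4 transpose turns them into per-quantity vectors. The
                  * force is interpolated linearly as F + eps*D, and the potential correction is
                  * obtained by integrating the interpolated force over the fractional bin.
                  */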
1162             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1163             ewitab           = _mm256_cvttpd_epi32(ewrt);
1164             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1165             ewitab           = _mm_slli_epi32(ewitab,2);
1166             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1167             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1168             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1169             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1170             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1171             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1172             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1173             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1174             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1175
1176             d                = _mm256_sub_pd(r11,rswitch);
1177             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1178             d2               = _mm256_mul_pd(d,d);
1179             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1180
1181             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1182
1183             /* Evaluate switch function */
1184             /* fscal = f/r; with the switch applied f_sw = -(v*sw)' = -(v'*sw + v*dsw), so fscal_sw = fscal*sw - v*dsw/r */
1185             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
1186             velec            = _mm256_mul_pd(velec,sw);
1187             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1188
1189             /* Update potential sum for this i atom from the interaction with this j atom. */
1190             velec            = _mm256_and_pd(velec,cutoff_mask);
1191             velec            = _mm256_andnot_pd(dummy_mask,velec);
1192             velecsum         = _mm256_add_pd(velecsum,velec);
1193
1194             fscal            = felec;
1195
1196             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1197
1198             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1199
1200             /* Calculate temporary vectorial force */
1201             tx               = _mm256_mul_pd(fscal,dx11);
1202             ty               = _mm256_mul_pd(fscal,dy11);
1203             tz               = _mm256_mul_pd(fscal,dz11);
1204
1205             /* Update vectorial force */
1206             fix1             = _mm256_add_pd(fix1,tx);
1207             fiy1             = _mm256_add_pd(fiy1,ty);
1208             fiz1             = _mm256_add_pd(fiz1,tz);
1209
1210             fjx1             = _mm256_add_pd(fjx1,tx);
1211             fjy1             = _mm256_add_pd(fjy1,ty);
1212             fjz1             = _mm256_add_pd(fjz1,tz);
1213
1214             }
1215
1216             /**************************
1217              * CALCULATE INTERACTIONS *
1218              **************************/
1219
1220             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1221             {
1222
1223             r12              = _mm256_mul_pd(rsq12,rinv12);
1224             r12              = _mm256_andnot_pd(dummy_mask,r12);
1225
1226             /* EWALD ELECTROSTATICS */
1227
1228             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
1229             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1230             ewitab           = _mm256_cvttpd_epi32(ewrt);
1231             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1232             ewitab           = _mm_slli_epi32(ewitab,2);
1233             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1234             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1235             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1236             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1237             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1238             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1239             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1240             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1241             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1242
1243             d                = _mm256_sub_pd(r12,rswitch);
1244             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1245             d2               = _mm256_mul_pd(d,d);
1246             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1247
1248             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1249
1250             /* Evaluate switch function */
1251             /* fscal = f/r; with the switch applied f_sw = -(v*sw)' = -(v'*sw + v*dsw), so fscal_sw = fscal*sw - v*dsw/r */
1252             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
1253             velec            = _mm256_mul_pd(velec,sw);
1254             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1255
1256             /* Update potential sum for this i atom from the interaction with this j atom. */
1257             velec            = _mm256_and_pd(velec,cutoff_mask);
1258             velec            = _mm256_andnot_pd(dummy_mask,velec);
1259             velecsum         = _mm256_add_pd(velecsum,velec);
1260
1261             fscal            = felec;
1262
1263             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1264
1265             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1266
1267             /* Calculate temporary vectorial force */
1268             tx               = _mm256_mul_pd(fscal,dx12);
1269             ty               = _mm256_mul_pd(fscal,dy12);
1270             tz               = _mm256_mul_pd(fscal,dz12);
1271
1272             /* Update vectorial force */
1273             fix1             = _mm256_add_pd(fix1,tx);
1274             fiy1             = _mm256_add_pd(fiy1,ty);
1275             fiz1             = _mm256_add_pd(fiz1,tz);
1276
1277             fjx2             = _mm256_add_pd(fjx2,tx);
1278             fjy2             = _mm256_add_pd(fjy2,ty);
1279             fjz2             = _mm256_add_pd(fjz2,tz);
1280
1281             }
1282
1283             /**************************
1284              * CALCULATE INTERACTIONS *
1285              **************************/
1286
1287             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1288             {
1289
1290             r13              = _mm256_mul_pd(rsq13,rinv13);
1291             r13              = _mm256_andnot_pd(dummy_mask,r13);
1292
1293             /* EWALD ELECTROSTATICS */
1294
1295             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
1296             ewrt             = _mm256_mul_pd(r13,ewtabscale);
1297             ewitab           = _mm256_cvttpd_epi32(ewrt);
1298             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1299             ewitab           = _mm_slli_epi32(ewitab,2);
1300             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1301             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1302             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1303             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1304             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1305             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1306             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1307             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
1308             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1309
1310             d                = _mm256_sub_pd(r13,rswitch);
1311             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1312             d2               = _mm256_mul_pd(d,d);
1313             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1314
1315             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1316
1317             /* Evaluate switch function */
1318             /* fscal = f/r; with the switch applied f_sw = -(v*sw)' = -(v'*sw + v*dsw), so fscal_sw = fscal*sw - v*dsw/r */
1319             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
1320             velec            = _mm256_mul_pd(velec,sw);
1321             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1322
1323             /* Update potential sum for this i atom from the interaction with this j atom. */
1324             velec            = _mm256_and_pd(velec,cutoff_mask);
1325             velec            = _mm256_andnot_pd(dummy_mask,velec);
1326             velecsum         = _mm256_add_pd(velecsum,velec);
1327
1328             fscal            = felec;
1329
1330             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1331
1332             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1333
1334             /* Calculate temporary vectorial force */
1335             tx               = _mm256_mul_pd(fscal,dx13);
1336             ty               = _mm256_mul_pd(fscal,dy13);
1337             tz               = _mm256_mul_pd(fscal,dz13);
1338
1339             /* Update vectorial force */
1340             fix1             = _mm256_add_pd(fix1,tx);
1341             fiy1             = _mm256_add_pd(fiy1,ty);
1342             fiz1             = _mm256_add_pd(fiz1,tz);
1343
1344             fjx3             = _mm256_add_pd(fjx3,tx);
1345             fjy3             = _mm256_add_pd(fjy3,ty);
1346             fjz3             = _mm256_add_pd(fjz3,tz);
1347
1348             }
1349
1350             /**************************
1351              * CALCULATE INTERACTIONS *
1352              **************************/
1353
1354             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1355             {
1356
1357             r21              = _mm256_mul_pd(rsq21,rinv21);
1358             r21              = _mm256_andnot_pd(dummy_mask,r21);
1359
1360             /* EWALD ELECTROSTATICS */
1361
1362             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
1363             ewrt             = _mm256_mul_pd(r21,ewtabscale);
1364             ewitab           = _mm256_cvttpd_epi32(ewrt);
1365             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1366             ewitab           = _mm_slli_epi32(ewitab,2);
1367             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1368             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1369             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1370             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1371             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1372             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1373             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1374             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1375             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1376
1377             d                = _mm256_sub_pd(r21,rswitch);
1378             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1379             d2               = _mm256_mul_pd(d,d);
1380             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1381
1382             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1383
1384             /* Evaluate switch function */
1385             /* fscal = f/r; with the switch applied f_sw = -(v*sw)' = -(v'*sw + v*dsw), so fscal_sw = fscal*sw - v*dsw/r */
1386             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
1387             velec            = _mm256_mul_pd(velec,sw);
1388             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1389
1390             /* Update potential sum for this i atom from the interaction with this j atom. */
1391             velec            = _mm256_and_pd(velec,cutoff_mask);
1392             velec            = _mm256_andnot_pd(dummy_mask,velec);
1393             velecsum         = _mm256_add_pd(velecsum,velec);
1394
1395             fscal            = felec;
1396
1397             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1398
1399             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1400
1401             /* Calculate temporary vectorial force */
1402             tx               = _mm256_mul_pd(fscal,dx21);
1403             ty               = _mm256_mul_pd(fscal,dy21);
1404             tz               = _mm256_mul_pd(fscal,dz21);
1405
1406             /* Update vectorial force */
1407             fix2             = _mm256_add_pd(fix2,tx);
1408             fiy2             = _mm256_add_pd(fiy2,ty);
1409             fiz2             = _mm256_add_pd(fiz2,tz);
1410
1411             fjx1             = _mm256_add_pd(fjx1,tx);
1412             fjy1             = _mm256_add_pd(fjy1,ty);
1413             fjz1             = _mm256_add_pd(fjz1,tz);
1414
1415             }
1416
1417             /**************************
1418              * CALCULATE INTERACTIONS *
1419              **************************/
1420
1421             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1422             {
1423
1424             r22              = _mm256_mul_pd(rsq22,rinv22);
1425             r22              = _mm256_andnot_pd(dummy_mask,r22);
1426
1427             /* EWALD ELECTROSTATICS */
1428
1429             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
1430             ewrt             = _mm256_mul_pd(r22,ewtabscale);
1431             ewitab           = _mm256_cvttpd_epi32(ewrt);
1432             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1433             ewitab           = _mm_slli_epi32(ewitab,2);
1434             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1435             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1436             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1437             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1438             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1439             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1440             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1441             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1442             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1443
1444             d                = _mm256_sub_pd(r22,rswitch);
1445             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1446             d2               = _mm256_mul_pd(d,d);
1447             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1448
1449             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1450
1451             /* Evaluate switch function */
1452             /* fscal = f/r; with the switch applied f_sw = -(v*sw)' = -(v'*sw + v*dsw), so fscal_sw = fscal*sw - v*dsw/r */
1453             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
1454             velec            = _mm256_mul_pd(velec,sw);
1455             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1456
1457             /* Update potential sum for this i atom from the interaction with this j atom. */
1458             velec            = _mm256_and_pd(velec,cutoff_mask);
1459             velec            = _mm256_andnot_pd(dummy_mask,velec);
1460             velecsum         = _mm256_add_pd(velecsum,velec);
1461
1462             fscal            = felec;
1463
1464             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1465
1466             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1467
1468             /* Calculate temporary vectorial force */
1469             tx               = _mm256_mul_pd(fscal,dx22);
1470             ty               = _mm256_mul_pd(fscal,dy22);
1471             tz               = _mm256_mul_pd(fscal,dz22);
1472
1473             /* Update vectorial force */
1474             fix2             = _mm256_add_pd(fix2,tx);
1475             fiy2             = _mm256_add_pd(fiy2,ty);
1476             fiz2             = _mm256_add_pd(fiz2,tz);
1477
1478             fjx2             = _mm256_add_pd(fjx2,tx);
1479             fjy2             = _mm256_add_pd(fjy2,ty);
1480             fjz2             = _mm256_add_pd(fjz2,tz);
1481
1482             }
1483
1484             /**************************
1485              * CALCULATE INTERACTIONS *
1486              **************************/
1487
1488             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1489             {
1490
1491             r23              = _mm256_mul_pd(rsq23,rinv23);
1492             r23              = _mm256_andnot_pd(dummy_mask,r23);
1493
1494             /* EWALD ELECTROSTATICS */
1495
1496             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
1497             ewrt             = _mm256_mul_pd(r23,ewtabscale);
1498             ewitab           = _mm256_cvttpd_epi32(ewrt);
1499             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1500             ewitab           = _mm_slli_epi32(ewitab,2);
1501             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1502             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1503             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1504             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1505             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1506             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1507             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1508             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1509             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1510
1511             d                = _mm256_sub_pd(r23,rswitch);
1512             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1513             d2               = _mm256_mul_pd(d,d);
1514             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1515
1516             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1517
1518             /* Evaluate switch function */
1519             /* fscal = f/r; with the switch applied f_sw = -(v*sw)' = -(v'*sw + v*dsw), so fscal_sw = fscal*sw - v*dsw/r */
1520             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
1521             velec            = _mm256_mul_pd(velec,sw);
1522             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1523
1524             /* Update potential sum for this i atom from the interaction with this j atom. */
1525             velec            = _mm256_and_pd(velec,cutoff_mask);
1526             velec            = _mm256_andnot_pd(dummy_mask,velec);
1527             velecsum         = _mm256_add_pd(velecsum,velec);
1528
1529             fscal            = felec;
1530
1531             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1532
1533             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1534
1535             /* Calculate temporary vectorial force */
1536             tx               = _mm256_mul_pd(fscal,dx23);
1537             ty               = _mm256_mul_pd(fscal,dy23);
1538             tz               = _mm256_mul_pd(fscal,dz23);
1539
1540             /* Update vectorial force */
1541             fix2             = _mm256_add_pd(fix2,tx);
1542             fiy2             = _mm256_add_pd(fiy2,ty);
1543             fiz2             = _mm256_add_pd(fiz2,tz);
1544
1545             fjx3             = _mm256_add_pd(fjx3,tx);
1546             fjy3             = _mm256_add_pd(fjy3,ty);
1547             fjz3             = _mm256_add_pd(fjz3,tz);
1548
1549             }
1550
1551             /**************************
1552              * CALCULATE INTERACTIONS *
1553              **************************/
1554
1555             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1556             {
1557
1558             r31              = _mm256_mul_pd(rsq31,rinv31);
1559             r31              = _mm256_andnot_pd(dummy_mask,r31);
1560
1561             /* EWALD ELECTROSTATICS */
1562
1563             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
1564             ewrt             = _mm256_mul_pd(r31,ewtabscale);
1565             ewitab           = _mm256_cvttpd_epi32(ewrt);
1566             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1567             ewitab           = _mm_slli_epi32(ewitab,2);
1568             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1569             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1570             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1571             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1572             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1573             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1574             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1575             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1576             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1577
1578             d                = _mm256_sub_pd(r31,rswitch);
1579             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1580             d2               = _mm256_mul_pd(d,d);
1581             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1582
1583             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1584
1585             /* Evaluate switch function */
1586             /* fscal = f/r; with the switch applied f_sw = -(v*sw)' = -(v'*sw + v*dsw), so fscal_sw = fscal*sw - v*dsw/r */
1587             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
1588             velec            = _mm256_mul_pd(velec,sw);
1589             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1590
1591             /* Update potential sum for this i atom from the interaction with this j atom. */
1592             velec            = _mm256_and_pd(velec,cutoff_mask);
1593             velec            = _mm256_andnot_pd(dummy_mask,velec);
1594             velecsum         = _mm256_add_pd(velecsum,velec);
1595
1596             fscal            = felec;
1597
1598             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1599
1600             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1601
1602             /* Calculate temporary vectorial force */
1603             tx               = _mm256_mul_pd(fscal,dx31);
1604             ty               = _mm256_mul_pd(fscal,dy31);
1605             tz               = _mm256_mul_pd(fscal,dz31);
1606
1607             /* Update vectorial force */
1608             fix3             = _mm256_add_pd(fix3,tx);
1609             fiy3             = _mm256_add_pd(fiy3,ty);
1610             fiz3             = _mm256_add_pd(fiz3,tz);
1611
1612             fjx1             = _mm256_add_pd(fjx1,tx);
1613             fjy1             = _mm256_add_pd(fjy1,ty);
1614             fjz1             = _mm256_add_pd(fjz1,tz);
1615
1616             }
1617
1618             /**************************
1619              * CALCULATE INTERACTIONS *
1620              **************************/
1621
1622             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1623             {
1624
1625             r32              = _mm256_mul_pd(rsq32,rinv32);
1626             r32              = _mm256_andnot_pd(dummy_mask,r32);
1627
1628             /* EWALD ELECTROSTATICS */
1629
1630             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
1631             ewrt             = _mm256_mul_pd(r32,ewtabscale);
1632             ewitab           = _mm256_cvttpd_epi32(ewrt);
1633             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1634             ewitab           = _mm_slli_epi32(ewitab,2);
1635             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1636             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1637             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1638             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1639             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1640             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1641             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1642             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1643             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1644
1645             d                = _mm256_sub_pd(r32,rswitch);
1646             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1647             d2               = _mm256_mul_pd(d,d);
1648             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1649
1650             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1651
1652             /* Evaluate switch function */
1653             /* fscal = f/r; with the switch applied f_sw = -(v*sw)' = -(v'*sw + v*dsw), so fscal_sw = fscal*sw - v*dsw/r */
1654             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
1655             velec            = _mm256_mul_pd(velec,sw);
1656             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1657
1658             /* Update potential sum for this i atom from the interaction with this j atom. */
1659             velec            = _mm256_and_pd(velec,cutoff_mask);
1660             velec            = _mm256_andnot_pd(dummy_mask,velec);
1661             velecsum         = _mm256_add_pd(velecsum,velec);
1662
1663             fscal            = felec;
1664
1665             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1666
1667             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1668
1669             /* Calculate temporary vectorial force */
1670             tx               = _mm256_mul_pd(fscal,dx32);
1671             ty               = _mm256_mul_pd(fscal,dy32);
1672             tz               = _mm256_mul_pd(fscal,dz32);
1673
1674             /* Update vectorial force */
1675             fix3             = _mm256_add_pd(fix3,tx);
1676             fiy3             = _mm256_add_pd(fiy3,ty);
1677             fiz3             = _mm256_add_pd(fiz3,tz);
1678
1679             fjx2             = _mm256_add_pd(fjx2,tx);
1680             fjy2             = _mm256_add_pd(fjy2,ty);
1681             fjz2             = _mm256_add_pd(fjz2,tz);
1682
1683             }
1684
1685             /**************************
1686              * CALCULATE INTERACTIONS *
1687              **************************/
1688
1689             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1690             {
1691
1692             r33              = _mm256_mul_pd(rsq33,rinv33);
1693             r33              = _mm256_andnot_pd(dummy_mask,r33);
1694
1695             /* EWALD ELECTROSTATICS */
1696
1697             /* Calculate Ewald table index by multiplying r by the table scale and truncating to integer */
1698             ewrt             = _mm256_mul_pd(r33,ewtabscale);
1699             ewitab           = _mm256_cvttpd_epi32(ewrt);
1700             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1701             ewitab           = _mm_slli_epi32(ewitab,2);
1702             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1703             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1704             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1705             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1706             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1707             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1708             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1709             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1710             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1711
1712             d                = _mm256_sub_pd(r33,rswitch);
1713             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1714             d2               = _mm256_mul_pd(d,d);
1715             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1716
1717             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1718
1719             /* Evaluate switch function */
1720             /* fscal = f/r; with the switch applied f_sw = -(v*sw)' = -(v'*sw + v*dsw), so fscal_sw = fscal*sw - v*dsw/r */
1721             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
1722             velec            = _mm256_mul_pd(velec,sw);
1723             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1724
1725             /* Update potential sum for this i atom from the interaction with this j atom. */
1726             velec            = _mm256_and_pd(velec,cutoff_mask);
1727             velec            = _mm256_andnot_pd(dummy_mask,velec);
1728             velecsum         = _mm256_add_pd(velecsum,velec);
1729
1730             fscal            = felec;
1731
1732             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1733
1734             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1735
1736             /* Calculate temporary vectorial force */
1737             tx               = _mm256_mul_pd(fscal,dx33);
1738             ty               = _mm256_mul_pd(fscal,dy33);
1739             tz               = _mm256_mul_pd(fscal,dz33);
1740
1741             /* Update vectorial force */
1742             fix3             = _mm256_add_pd(fix3,tx);
1743             fiy3             = _mm256_add_pd(fiy3,ty);
1744             fiz3             = _mm256_add_pd(fiz3,tz);
1745
1746             fjx3             = _mm256_add_pd(fjx3,tx);
1747             fjy3             = _mm256_add_pd(fjy3,ty);
1748             fjz3             = _mm256_add_pd(fjz3,tz);
1749
1750             }
1751
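                 /* Padding lanes point their j-force output at the local scratch buffer, so the
                  * decrement/store below never touches memory belonging to a real atom.
                  */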
1752             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1753             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1754             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1755             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1756
1757             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1758                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1759                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1760
1761             /* Inner loop uses 657 flops */
1762         }
1763
1764         /* End of innermost loop */
1765
1766         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1767                                                  f+i_coord_offset,fshift+i_shift_offset);
1768
1769         ggid                        = gid[iidx];
1770         /* Update potential energies */
1771         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1772         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1773
1774         /* Increment number of inner iterations */
1775         inneriter                  += j_index_end - j_index_start;
1776
1777         /* Outer loop uses 26 flops */
1778     }
1779
1780     /* Increment number of outer iterations */
1781     outeriter        += nri;
1782
1783     /* Update outer/inner flops */
1784
1785     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*657);
1786 }
1787 /*
1788  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_double
1789  * Electrostatics interaction: Ewald
1790  * VdW interaction:            LennardJones
1791  * Geometry:                   Water4-Water4
1792  * Calculate force/pot:        Force
1793  */
1794 void
1795 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_double
1796                     (t_nblist                    * gmx_restrict       nlist,
1797                      rvec                        * gmx_restrict          xx,
1798                      rvec                        * gmx_restrict          ff,
1799                      t_forcerec                  * gmx_restrict          fr,
1800                      t_mdatoms                   * gmx_restrict     mdatoms,
1801                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1802                      t_nrnb                      * gmx_restrict        nrnb)
1803 {
1804     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1805      * just 0 for non-waters.
1806      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1807      * jnr indices corresponding to data put in the four positions in the SIMD register.
1808      */
1809     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1810     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1811     int              jnrA,jnrB,jnrC,jnrD;
1812     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1813     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1814     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1815     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1816     real             rcutoff_scalar;
1817     real             *shiftvec,*fshift,*x,*f;
1818     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1819     real             scratch[4*DIM];
1820     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1821     real *           vdwioffsetptr0;
1822     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1823     real *           vdwioffsetptr1;
1824     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1825     real *           vdwioffsetptr2;
1826     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1827     real *           vdwioffsetptr3;
1828     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1829     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1830     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1831     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1832     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1833     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1834     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1835     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1836     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1837     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1838     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1839     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1840     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1841     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1842     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1843     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1844     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1845     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1846     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1847     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1848     real             *charge;
1849     int              nvdwtype;
1850     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1851     int              *vdwtype;
1852     real             *vdwparam;
1853     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1854     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1855     __m128i          ewitab;
1856     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1857     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1858     real             *ewtab;
1859     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1860     real             rswitch_scalar,d_scalar;
1861     __m256d          dummy_mask,cutoff_mask;
1862     __m128           tmpmask0,tmpmask1;
1863     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1864     __m256d          one     = _mm256_set1_pd(1.0);
1865     __m256d          two     = _mm256_set1_pd(2.0);
1866     x                = xx[0];
1867     f                = ff[0];
1868
1869     nri              = nlist->nri;
1870     iinr             = nlist->iinr;
1871     jindex           = nlist->jindex;
1872     jjnr             = nlist->jjnr;
1873     shiftidx         = nlist->shift;
1874     gid              = nlist->gid;
1875     shiftvec         = fr->shift_vec[0];
1876     fshift           = fr->fshift[0];
1877     facel            = _mm256_set1_pd(fr->epsfac);
1878     charge           = mdatoms->chargeA;
1879     nvdwtype         = fr->ntype;
1880     vdwparam         = fr->nbfp;
1881     vdwtype          = mdatoms->typeA;
1882
1883     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
1884     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1885     beta2            = _mm256_mul_pd(beta,beta);
1886     beta3            = _mm256_mul_pd(beta,beta2);
1887
1888     ewtab            = fr->ic->tabq_coul_FDV0;
1889     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
1890     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1891
1892     /* Setup water-specific parameters */
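         /* Both i and j molecules are identical 4-site waters: site 0 only has Lennard-Jones
          * (the single 0-0 c6/c12 pair below) while sites 1-3 carry the charges, so all nine
          * charge products can be formed once, outside the loops.
          */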
1893     inr              = nlist->iinr[0];
1894     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1895     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1896     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1897     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1898
1899     jq1              = _mm256_set1_pd(charge[inr+1]);
1900     jq2              = _mm256_set1_pd(charge[inr+2]);
1901     jq3              = _mm256_set1_pd(charge[inr+3]);
1902     vdwjidx0A        = 2*vdwtype[inr+0];
1903     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1904     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1905     qq11             = _mm256_mul_pd(iq1,jq1);
1906     qq12             = _mm256_mul_pd(iq1,jq2);
1907     qq13             = _mm256_mul_pd(iq1,jq3);
1908     qq21             = _mm256_mul_pd(iq2,jq1);
1909     qq22             = _mm256_mul_pd(iq2,jq2);
1910     qq23             = _mm256_mul_pd(iq2,jq3);
1911     qq31             = _mm256_mul_pd(iq3,jq1);
1912     qq32             = _mm256_mul_pd(iq3,jq2);
1913     qq33             = _mm256_mul_pd(iq3,jq3);
1914
1915     /* When explicit cutoffs are used, the value must be identical for elec and VdW, so we use the electrostatics value as an arbitrary choice */
1916     rcutoff_scalar   = fr->rcoulomb;
1917     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1918     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1919
1920     rswitch_scalar   = fr->rcoulomb_switch;
1921     rswitch          = _mm256_set1_pd(rswitch_scalar);
1922     /* Setup switch parameters */
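         /* Coefficients of the fifth-order switch S(d) = 1 + swV3*d^3 + swV4*d^4 + swV5*d^5,
          * with d = r - rswitch and d_scalar = rcutoff - rswitch. The values below give S = 1
          * with vanishing first and second derivatives at rswitch, and make S and its first two
          * derivatives vanish at the cutoff; swF2..swF4 are the matching coefficients of dS/dd.
          */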
1923     d_scalar         = rcutoff_scalar-rswitch_scalar;
1924     d                = _mm256_set1_pd(d_scalar);
1925     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1926     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1927     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1928     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1929     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1930     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1931
1932     /* Avoid stupid compiler warnings */
1933     jnrA = jnrB = jnrC = jnrD = 0;
1934     j_coord_offsetA = 0;
1935     j_coord_offsetB = 0;
1936     j_coord_offsetC = 0;
1937     j_coord_offsetD = 0;
1938
1939     outeriter        = 0;
1940     inneriter        = 0;
1941
1942     for(iidx=0;iidx<4*DIM;iidx++)
1943     {
1944         scratch[iidx] = 0.0;
1945     }
1946
1947     /* Start outer loop over neighborlists */
1948     for(iidx=0; iidx<nri; iidx++)
1949     {
1950         /* Load shift vector for this list */
1951         i_shift_offset   = DIM*shiftidx[iidx];
1952
1953         /* Load limits for loop over neighbors */
1954         j_index_start    = jindex[iidx];
1955         j_index_end      = jindex[iidx+1];
1956
1957         /* Get outer coordinate index */
1958         inr              = iinr[iidx];
1959         i_coord_offset   = DIM*inr;
1960
1961         /* Load i particle coords and add shift vector */
1962         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1963                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1964
1965         fix0             = _mm256_setzero_pd();
1966         fiy0             = _mm256_setzero_pd();
1967         fiz0             = _mm256_setzero_pd();
1968         fix1             = _mm256_setzero_pd();
1969         fiy1             = _mm256_setzero_pd();
1970         fiz1             = _mm256_setzero_pd();
1971         fix2             = _mm256_setzero_pd();
1972         fiy2             = _mm256_setzero_pd();
1973         fiz2             = _mm256_setzero_pd();
1974         fix3             = _mm256_setzero_pd();
1975         fiy3             = _mm256_setzero_pd();
1976         fiz3             = _mm256_setzero_pd();
1977
1978         /* Start inner kernel loop */
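             /* This loop only runs while all four j entries are real (jjnr[jidx+3]>=0); the
              * final, partially filled group of neighbors is expected to be handled by a masked
              * epilogue loop like the one in the VF kernel above.
              */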
1979         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1980         {
1981
1982             /* Get j neighbor index, and coordinate index */
1983             jnrA             = jjnr[jidx];
1984             jnrB             = jjnr[jidx+1];
1985             jnrC             = jjnr[jidx+2];
1986             jnrD             = jjnr[jidx+3];
1987             j_coord_offsetA  = DIM*jnrA;
1988             j_coord_offsetB  = DIM*jnrB;
1989             j_coord_offsetC  = DIM*jnrC;
1990             j_coord_offsetD  = DIM*jnrD;
1991
1992             /* load j atom coordinates */
1993             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1994                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1995                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1996                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1997
1998             /* Calculate displacement vector */
1999             dx00             = _mm256_sub_pd(ix0,jx0);
2000             dy00             = _mm256_sub_pd(iy0,jy0);
2001             dz00             = _mm256_sub_pd(iz0,jz0);
2002             dx11             = _mm256_sub_pd(ix1,jx1);
2003             dy11             = _mm256_sub_pd(iy1,jy1);
2004             dz11             = _mm256_sub_pd(iz1,jz1);
2005             dx12             = _mm256_sub_pd(ix1,jx2);
2006             dy12             = _mm256_sub_pd(iy1,jy2);
2007             dz12             = _mm256_sub_pd(iz1,jz2);
2008             dx13             = _mm256_sub_pd(ix1,jx3);
2009             dy13             = _mm256_sub_pd(iy1,jy3);
2010             dz13             = _mm256_sub_pd(iz1,jz3);
2011             dx21             = _mm256_sub_pd(ix2,jx1);
2012             dy21             = _mm256_sub_pd(iy2,jy1);
2013             dz21             = _mm256_sub_pd(iz2,jz1);
2014             dx22             = _mm256_sub_pd(ix2,jx2);
2015             dy22             = _mm256_sub_pd(iy2,jy2);
2016             dz22             = _mm256_sub_pd(iz2,jz2);
2017             dx23             = _mm256_sub_pd(ix2,jx3);
2018             dy23             = _mm256_sub_pd(iy2,jy3);
2019             dz23             = _mm256_sub_pd(iz2,jz3);
2020             dx31             = _mm256_sub_pd(ix3,jx1);
2021             dy31             = _mm256_sub_pd(iy3,jy1);
2022             dz31             = _mm256_sub_pd(iz3,jz1);
2023             dx32             = _mm256_sub_pd(ix3,jx2);
2024             dy32             = _mm256_sub_pd(iy3,jy2);
2025             dz32             = _mm256_sub_pd(iz3,jz2);
2026             dx33             = _mm256_sub_pd(ix3,jx3);
2027             dy33             = _mm256_sub_pd(iy3,jy3);
2028             dz33             = _mm256_sub_pd(iz3,jz3);
2029
2030             /* Calculate squared distance and things based on it */
2031             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2032             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2033             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2034             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2035             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2036             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2037             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2038             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2039             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2040             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2041
2042             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
2043             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
2044             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
2045             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
2046             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
2047             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
2048             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
2049             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
2050             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
2051             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
2052
2053             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
2054             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
2055             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
2056             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
2057             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
2058             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
2059             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
2060             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
2061             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
2062             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
2063
2064             fjx0             = _mm256_setzero_pd();
2065             fjy0             = _mm256_setzero_pd();
2066             fjz0             = _mm256_setzero_pd();
2067             fjx1             = _mm256_setzero_pd();
2068             fjy1             = _mm256_setzero_pd();
2069             fjz1             = _mm256_setzero_pd();
2070             fjx2             = _mm256_setzero_pd();
2071             fjy2             = _mm256_setzero_pd();
2072             fjz2             = _mm256_setzero_pd();
2073             fjx3             = _mm256_setzero_pd();
2074             fjy3             = _mm256_setzero_pd();
2075             fjz3             = _mm256_setzero_pd();
2076
2077             /**************************
2078              * CALCULATE INTERACTIONS *
2079              **************************/
2080
2081             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2082             {
2083
2084             r00              = _mm256_mul_pd(rsq00,rinv00);
2085
2086             /* LENNARD-JONES DISPERSION/REPULSION */
2087
2088             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2089             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
2090             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2091             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2092             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
2093
2094             d                = _mm256_sub_pd(r00,rswitch);
2095             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2096             d2               = _mm256_mul_pd(d,d);
2097             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2098
2099             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2100
2101             /* Evaluate switch function */
2102             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2103             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
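                 /* For reference, with d = max(r - rswitch, 0) as computed above, the
                  * switch value used here is the quintic polynomial
                  *     sw  = 1 + swV3*d^3 + swV4*d^4 + swV5*d^5
                  * and dsw = swF2*d^2 + swF3*d^3 + swF4*d^4 is its derivative with
                  * respect to r, so the line above is exactly the fscal*sw - v*dsw/r
                  * form derived in the preceding comment. The swV3..swV5/swF2..swF4
                  * coefficients are assumed to have been broadcast earlier in this
                  * kernel from the potential-switch setup.
                  */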
2104             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2105
2106             fscal            = fvdw;
2107
2108             fscal            = _mm256_and_pd(fscal,cutoff_mask);
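                 /* gmx_mm256_any_lt above only decides whether this 4-wide block of
                  * pairs is computed at all; the per-lane cutoff_mask then zeroes
                  * fscal for the individual lanes that lie outside rcutoff2.
                  */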
2109
2110             /* Calculate temporary vectorial force */
2111             tx               = _mm256_mul_pd(fscal,dx00);
2112             ty               = _mm256_mul_pd(fscal,dy00);
2113             tz               = _mm256_mul_pd(fscal,dz00);
2114
2115             /* Update vectorial force */
2116             fix0             = _mm256_add_pd(fix0,tx);
2117             fiy0             = _mm256_add_pd(fiy0,ty);
2118             fiz0             = _mm256_add_pd(fiz0,tz);
2119
2120             fjx0             = _mm256_add_pd(fjx0,tx);
2121             fjy0             = _mm256_add_pd(fjy0,ty);
2122             fjz0             = _mm256_add_pd(fjz0,tz);
2123
2124             }
2125
2126             /**************************
2127              * CALCULATE INTERACTIONS *
2128              **************************/
2129
2130             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2131             {
2132
2133             r11              = _mm256_mul_pd(rsq11,rinv11);
2134
2135             /* EWALD ELECTROSTATICS */
2136
2137             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2138             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2139             ewitab           = _mm256_cvttpd_epi32(ewrt);
2140             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2141             ewitab           = _mm_slli_epi32(ewitab,2);
2142             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2143             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2144             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2145             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2146             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2147             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2148             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2149             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2150             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
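                 /* Sketch of the table lookup above: each lane gets its own integer
                  * index, the shift by 2 turns it into an offset into a table with
                  * four doubles per point, and the four scalar loads plus the 4x4
                  * transpose bring the per-point values (F, its increment D, V, and
                  * the next F, judging by the variable names) into lane order.
                  * felec is then the linearly interpolated force, velec recovers the
                  * potential from it (with ewtabhalfspace assumed to hold half the
                  * table spacing), and both are combined with the bare Coulomb terms
                  * qq*rinv and qq*rinv*rinvsq.
                  */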
2151
2152             d                = _mm256_sub_pd(r11,rswitch);
2153             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2154             d2               = _mm256_mul_pd(d,d);
2155             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2156
2157             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2158
2159             /* Evaluate switch function */
2160             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2161             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2162             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2163
2164             fscal            = felec;
2165
2166             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2167
2168             /* Calculate temporary vectorial force */
2169             tx               = _mm256_mul_pd(fscal,dx11);
2170             ty               = _mm256_mul_pd(fscal,dy11);
2171             tz               = _mm256_mul_pd(fscal,dz11);
2172
2173             /* Update vectorial force */
2174             fix1             = _mm256_add_pd(fix1,tx);
2175             fiy1             = _mm256_add_pd(fiy1,ty);
2176             fiz1             = _mm256_add_pd(fiz1,tz);
2177
2178             fjx1             = _mm256_add_pd(fjx1,tx);
2179             fjy1             = _mm256_add_pd(fjy1,ty);
2180             fjz1             = _mm256_add_pd(fjz1,tz);
2181
2182             }
2183
2184             /**************************
2185              * CALCULATE INTERACTIONS *
2186              **************************/
2187
2188             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2189             {
2190
2191             r12              = _mm256_mul_pd(rsq12,rinv12);
2192
2193             /* EWALD ELECTROSTATICS */
2194
2195             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2196             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2197             ewitab           = _mm256_cvttpd_epi32(ewrt);
2198             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2199             ewitab           = _mm_slli_epi32(ewitab,2);
2200             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2201             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2202             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2203             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2204             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2205             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2206             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2207             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2208             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2209
2210             d                = _mm256_sub_pd(r12,rswitch);
2211             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2212             d2               = _mm256_mul_pd(d,d);
2213             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2214
2215             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2216
2217             /* Evaluate switch function */
2218             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2219             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2220             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2221
2222             fscal            = felec;
2223
2224             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2225
2226             /* Calculate temporary vectorial force */
2227             tx               = _mm256_mul_pd(fscal,dx12);
2228             ty               = _mm256_mul_pd(fscal,dy12);
2229             tz               = _mm256_mul_pd(fscal,dz12);
2230
2231             /* Update vectorial force */
2232             fix1             = _mm256_add_pd(fix1,tx);
2233             fiy1             = _mm256_add_pd(fiy1,ty);
2234             fiz1             = _mm256_add_pd(fiz1,tz);
2235
2236             fjx2             = _mm256_add_pd(fjx2,tx);
2237             fjy2             = _mm256_add_pd(fjy2,ty);
2238             fjz2             = _mm256_add_pd(fjz2,tz);
2239
2240             }
2241
2242             /**************************
2243              * CALCULATE INTERACTIONS *
2244              **************************/
2245
2246             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2247             {
2248
2249             r13              = _mm256_mul_pd(rsq13,rinv13);
2250
2251             /* EWALD ELECTROSTATICS */
2252
2253             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2254             ewrt             = _mm256_mul_pd(r13,ewtabscale);
2255             ewitab           = _mm256_cvttpd_epi32(ewrt);
2256             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2257             ewitab           = _mm_slli_epi32(ewitab,2);
2258             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2259             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2260             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2261             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2262             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2263             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2264             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2265             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
2266             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2267
2268             d                = _mm256_sub_pd(r13,rswitch);
2269             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2270             d2               = _mm256_mul_pd(d,d);
2271             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2272
2273             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2274
2275             /* Evaluate switch function */
2276             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2277             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
2278             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2279
2280             fscal            = felec;
2281
2282             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2283
2284             /* Calculate temporary vectorial force */
2285             tx               = _mm256_mul_pd(fscal,dx13);
2286             ty               = _mm256_mul_pd(fscal,dy13);
2287             tz               = _mm256_mul_pd(fscal,dz13);
2288
2289             /* Update vectorial force */
2290             fix1             = _mm256_add_pd(fix1,tx);
2291             fiy1             = _mm256_add_pd(fiy1,ty);
2292             fiz1             = _mm256_add_pd(fiz1,tz);
2293
2294             fjx3             = _mm256_add_pd(fjx3,tx);
2295             fjy3             = _mm256_add_pd(fjy3,ty);
2296             fjz3             = _mm256_add_pd(fjz3,tz);
2297
2298             }
2299
2300             /**************************
2301              * CALCULATE INTERACTIONS *
2302              **************************/
2303
2304             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2305             {
2306
2307             r21              = _mm256_mul_pd(rsq21,rinv21);
2308
2309             /* EWALD ELECTROSTATICS */
2310
2311             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2312             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2313             ewitab           = _mm256_cvttpd_epi32(ewrt);
2314             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2315             ewitab           = _mm_slli_epi32(ewitab,2);
2316             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2317             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2318             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2319             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2320             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2321             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2322             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2323             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
2324             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2325
2326             d                = _mm256_sub_pd(r21,rswitch);
2327             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2328             d2               = _mm256_mul_pd(d,d);
2329             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2330
2331             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2332
2333             /* Evaluate switch function */
2334             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2335             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
2336             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2337
2338             fscal            = felec;
2339
2340             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2341
2342             /* Calculate temporary vectorial force */
2343             tx               = _mm256_mul_pd(fscal,dx21);
2344             ty               = _mm256_mul_pd(fscal,dy21);
2345             tz               = _mm256_mul_pd(fscal,dz21);
2346
2347             /* Update vectorial force */
2348             fix2             = _mm256_add_pd(fix2,tx);
2349             fiy2             = _mm256_add_pd(fiy2,ty);
2350             fiz2             = _mm256_add_pd(fiz2,tz);
2351
2352             fjx1             = _mm256_add_pd(fjx1,tx);
2353             fjy1             = _mm256_add_pd(fjy1,ty);
2354             fjz1             = _mm256_add_pd(fjz1,tz);
2355
2356             }
2357
2358             /**************************
2359              * CALCULATE INTERACTIONS *
2360              **************************/
2361
2362             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2363             {
2364
2365             r22              = _mm256_mul_pd(rsq22,rinv22);
2366
2367             /* EWALD ELECTROSTATICS */
2368
2369             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2370             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2371             ewitab           = _mm256_cvttpd_epi32(ewrt);
2372             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2373             ewitab           = _mm_slli_epi32(ewitab,2);
2374             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2375             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2376             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2377             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2378             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2379             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2380             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2381             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
2382             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2383
2384             d                = _mm256_sub_pd(r22,rswitch);
2385             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2386             d2               = _mm256_mul_pd(d,d);
2387             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2388
2389             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2390
2391             /* Evaluate switch function */
2392             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2393             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
2394             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2395
2396             fscal            = felec;
2397
2398             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2399
2400             /* Calculate temporary vectorial force */
2401             tx               = _mm256_mul_pd(fscal,dx22);
2402             ty               = _mm256_mul_pd(fscal,dy22);
2403             tz               = _mm256_mul_pd(fscal,dz22);
2404
2405             /* Update vectorial force */
2406             fix2             = _mm256_add_pd(fix2,tx);
2407             fiy2             = _mm256_add_pd(fiy2,ty);
2408             fiz2             = _mm256_add_pd(fiz2,tz);
2409
2410             fjx2             = _mm256_add_pd(fjx2,tx);
2411             fjy2             = _mm256_add_pd(fjy2,ty);
2412             fjz2             = _mm256_add_pd(fjz2,tz);
2413
2414             }
2415
2416             /**************************
2417              * CALCULATE INTERACTIONS *
2418              **************************/
2419
2420             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2421             {
2422
2423             r23              = _mm256_mul_pd(rsq23,rinv23);
2424
2425             /* EWALD ELECTROSTATICS */
2426
2427             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2428             ewrt             = _mm256_mul_pd(r23,ewtabscale);
2429             ewitab           = _mm256_cvttpd_epi32(ewrt);
2430             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2431             ewitab           = _mm_slli_epi32(ewitab,2);
2432             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2433             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2434             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2435             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2436             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2437             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2438             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2439             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
2440             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2441
2442             d                = _mm256_sub_pd(r23,rswitch);
2443             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2444             d2               = _mm256_mul_pd(d,d);
2445             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2446
2447             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2448
2449             /* Evaluate switch function */
2450             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2451             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
2452             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2453
2454             fscal            = felec;
2455
2456             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2457
2458             /* Calculate temporary vectorial force */
2459             tx               = _mm256_mul_pd(fscal,dx23);
2460             ty               = _mm256_mul_pd(fscal,dy23);
2461             tz               = _mm256_mul_pd(fscal,dz23);
2462
2463             /* Update vectorial force */
2464             fix2             = _mm256_add_pd(fix2,tx);
2465             fiy2             = _mm256_add_pd(fiy2,ty);
2466             fiz2             = _mm256_add_pd(fiz2,tz);
2467
2468             fjx3             = _mm256_add_pd(fjx3,tx);
2469             fjy3             = _mm256_add_pd(fjy3,ty);
2470             fjz3             = _mm256_add_pd(fjz3,tz);
2471
2472             }
2473
2474             /**************************
2475              * CALCULATE INTERACTIONS *
2476              **************************/
2477
2478             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2479             {
2480
2481             r31              = _mm256_mul_pd(rsq31,rinv31);
2482
2483             /* EWALD ELECTROSTATICS */
2484
2485             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2486             ewrt             = _mm256_mul_pd(r31,ewtabscale);
2487             ewitab           = _mm256_cvttpd_epi32(ewrt);
2488             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2489             ewitab           = _mm_slli_epi32(ewitab,2);
2490             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2491             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2492             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2493             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2494             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2495             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2496             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2497             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
2498             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2499
2500             d                = _mm256_sub_pd(r31,rswitch);
2501             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2502             d2               = _mm256_mul_pd(d,d);
2503             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2504
2505             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2506
2507             /* Evaluate switch function */
2508             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2509             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
2510             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2511
2512             fscal            = felec;
2513
2514             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2515
2516             /* Calculate temporary vectorial force */
2517             tx               = _mm256_mul_pd(fscal,dx31);
2518             ty               = _mm256_mul_pd(fscal,dy31);
2519             tz               = _mm256_mul_pd(fscal,dz31);
2520
2521             /* Update vectorial force */
2522             fix3             = _mm256_add_pd(fix3,tx);
2523             fiy3             = _mm256_add_pd(fiy3,ty);
2524             fiz3             = _mm256_add_pd(fiz3,tz);
2525
2526             fjx1             = _mm256_add_pd(fjx1,tx);
2527             fjy1             = _mm256_add_pd(fjy1,ty);
2528             fjz1             = _mm256_add_pd(fjz1,tz);
2529
2530             }
2531
2532             /**************************
2533              * CALCULATE INTERACTIONS *
2534              **************************/
2535
2536             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2537             {
2538
2539             r32              = _mm256_mul_pd(rsq32,rinv32);
2540
2541             /* EWALD ELECTROSTATICS */
2542
2543             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2544             ewrt             = _mm256_mul_pd(r32,ewtabscale);
2545             ewitab           = _mm256_cvttpd_epi32(ewrt);
2546             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2547             ewitab           = _mm_slli_epi32(ewitab,2);
2548             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2549             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2550             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2551             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2552             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2553             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2554             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2555             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
2556             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2557
2558             d                = _mm256_sub_pd(r32,rswitch);
2559             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2560             d2               = _mm256_mul_pd(d,d);
2561             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2562
2563             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2564
2565             /* Evaluate switch function */
2566             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2567             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
2568             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2569
2570             fscal            = felec;
2571
2572             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2573
2574             /* Calculate temporary vectorial force */
2575             tx               = _mm256_mul_pd(fscal,dx32);
2576             ty               = _mm256_mul_pd(fscal,dy32);
2577             tz               = _mm256_mul_pd(fscal,dz32);
2578
2579             /* Update vectorial force */
2580             fix3             = _mm256_add_pd(fix3,tx);
2581             fiy3             = _mm256_add_pd(fiy3,ty);
2582             fiz3             = _mm256_add_pd(fiz3,tz);
2583
2584             fjx2             = _mm256_add_pd(fjx2,tx);
2585             fjy2             = _mm256_add_pd(fjy2,ty);
2586             fjz2             = _mm256_add_pd(fjz2,tz);
2587
2588             }
2589
2590             /**************************
2591              * CALCULATE INTERACTIONS *
2592              **************************/
2593
2594             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2595             {
2596
2597             r33              = _mm256_mul_pd(rsq33,rinv33);
2598
2599             /* EWALD ELECTROSTATICS */
2600
2601             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2602             ewrt             = _mm256_mul_pd(r33,ewtabscale);
2603             ewitab           = _mm256_cvttpd_epi32(ewrt);
2604             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2605             ewitab           = _mm_slli_epi32(ewitab,2);
2606             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2607             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2608             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2609             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2610             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2611             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2612             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2613             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
2614             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2615
2616             d                = _mm256_sub_pd(r33,rswitch);
2617             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2618             d2               = _mm256_mul_pd(d,d);
2619             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2620
2621             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2622
2623             /* Evaluate switch function */
2624             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2625             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
2626             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2627
2628             fscal            = felec;
2629
2630             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2631
2632             /* Calculate temporary vectorial force */
2633             tx               = _mm256_mul_pd(fscal,dx33);
2634             ty               = _mm256_mul_pd(fscal,dy33);
2635             tz               = _mm256_mul_pd(fscal,dz33);
2636
2637             /* Update vectorial force */
2638             fix3             = _mm256_add_pd(fix3,tx);
2639             fiy3             = _mm256_add_pd(fiy3,ty);
2640             fiz3             = _mm256_add_pd(fiz3,tz);
2641
2642             fjx3             = _mm256_add_pd(fjx3,tx);
2643             fjy3             = _mm256_add_pd(fjy3,ty);
2644             fjz3             = _mm256_add_pd(fjz3,tz);
2645
2646             }
2647
2648             fjptrA             = f+j_coord_offsetA;
2649             fjptrB             = f+j_coord_offsetB;
2650             fjptrC             = f+j_coord_offsetC;
2651             fjptrD             = f+j_coord_offsetD;
2652
2653             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2654                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2655                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
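                 /* Force bookkeeping: every pair interaction above added the same
                  * tx/ty/tz to both the i accumulators (fix*/fiy*/fiz*) and the j
                  * accumulators (fjx*/fjy*/fjz*); applying the j sums with a
                  * decrement here produces the equal-and-opposite j forces required
                  * by Newton's third law, while the i sums are presumably reduced
                  * into f after the j loop.
                  */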
2656
2657             /* Inner loop uses 617 flops */
2658         }
2659
2660         if(jidx<j_index_end)
2661         {
2662
2663             /* Get j neighbor index, and coordinate index */
2664             jnrlistA         = jjnr[jidx];
2665             jnrlistB         = jjnr[jidx+1];
2666             jnrlistC         = jjnr[jidx+2];
2667             jnrlistD         = jjnr[jidx+3];
2668             /* Each jjnr element is negative for non-real (padding) atoms, so this
2669              * mask is all ones (0xFFFFFFFF) for dummy entries and all zeros for real
2670              * ones; use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
2671              */
2672             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2673
2674             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2675             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2676             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
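                 /* Mask construction sketch: the 128-bit compare yields one 32-bit
                  * all-ones/all-zeros field per j index, the two permutes duplicate
                  * each field into an adjacent pair, and the 128->256 combine plus
                  * cast widens them into four 64-bit lanes, so dummy_mask lines up
                  * with the double-precision SIMD lanes used below.
                  */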
2677
2678             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2679             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2680             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2681             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2682             j_coord_offsetA  = DIM*jnrA;
2683             j_coord_offsetB  = DIM*jnrB;
2684             j_coord_offsetC  = DIM*jnrC;
2685             j_coord_offsetD  = DIM*jnrD;
2686
2687             /* load j atom coordinates */
2688             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2689                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2690                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2691                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
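                 /* The swizzled load above gathers the twelve coordinates of each of
                  * the four j waters (four sites times x/y/z) and transposes them so
                  * that jx0..jz3 each hold one coordinate of one site across the four
                  * SIMD lanes.
                  */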
2692
2693             /* Calculate displacement vector */
2694             dx00             = _mm256_sub_pd(ix0,jx0);
2695             dy00             = _mm256_sub_pd(iy0,jy0);
2696             dz00             = _mm256_sub_pd(iz0,jz0);
2697             dx11             = _mm256_sub_pd(ix1,jx1);
2698             dy11             = _mm256_sub_pd(iy1,jy1);
2699             dz11             = _mm256_sub_pd(iz1,jz1);
2700             dx12             = _mm256_sub_pd(ix1,jx2);
2701             dy12             = _mm256_sub_pd(iy1,jy2);
2702             dz12             = _mm256_sub_pd(iz1,jz2);
2703             dx13             = _mm256_sub_pd(ix1,jx3);
2704             dy13             = _mm256_sub_pd(iy1,jy3);
2705             dz13             = _mm256_sub_pd(iz1,jz3);
2706             dx21             = _mm256_sub_pd(ix2,jx1);
2707             dy21             = _mm256_sub_pd(iy2,jy1);
2708             dz21             = _mm256_sub_pd(iz2,jz1);
2709             dx22             = _mm256_sub_pd(ix2,jx2);
2710             dy22             = _mm256_sub_pd(iy2,jy2);
2711             dz22             = _mm256_sub_pd(iz2,jz2);
2712             dx23             = _mm256_sub_pd(ix2,jx3);
2713             dy23             = _mm256_sub_pd(iy2,jy3);
2714             dz23             = _mm256_sub_pd(iz2,jz3);
2715             dx31             = _mm256_sub_pd(ix3,jx1);
2716             dy31             = _mm256_sub_pd(iy3,jy1);
2717             dz31             = _mm256_sub_pd(iz3,jz1);
2718             dx32             = _mm256_sub_pd(ix3,jx2);
2719             dy32             = _mm256_sub_pd(iy3,jy2);
2720             dz32             = _mm256_sub_pd(iz3,jz2);
2721             dx33             = _mm256_sub_pd(ix3,jx3);
2722             dy33             = _mm256_sub_pd(iy3,jy3);
2723             dz33             = _mm256_sub_pd(iz3,jz3);
2724
2725             /* Calculate squared distance and things based on it */
2726             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2727             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2728             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2729             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2730             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2731             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2732             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2733             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2734             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2735             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2736
2737             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
2738             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
2739             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
2740             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
2741             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
2742             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
2743             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
2744             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
2745             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
2746             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
2747
2748             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
2749             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
2750             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
2751             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
2752             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
2753             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
2754             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
2755             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
2756             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
2757             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
2758
2759             fjx0             = _mm256_setzero_pd();
2760             fjy0             = _mm256_setzero_pd();
2761             fjz0             = _mm256_setzero_pd();
2762             fjx1             = _mm256_setzero_pd();
2763             fjy1             = _mm256_setzero_pd();
2764             fjz1             = _mm256_setzero_pd();
2765             fjx2             = _mm256_setzero_pd();
2766             fjy2             = _mm256_setzero_pd();
2767             fjz2             = _mm256_setzero_pd();
2768             fjx3             = _mm256_setzero_pd();
2769             fjy3             = _mm256_setzero_pd();
2770             fjz3             = _mm256_setzero_pd();
2771
2772             /**************************
2773              * CALCULATE INTERACTIONS *
2774              **************************/
2775
2776             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2777             {
2778
2779             r00              = _mm256_mul_pd(rsq00,rinv00);
2780             r00              = _mm256_andnot_pd(dummy_mask,r00);
2781
2782             /* LENNARD-JONES DISPERSION/REPULSION */
2783
2784             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2785             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
2786             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2787             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2788             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
2789
2790             d                = _mm256_sub_pd(r00,rswitch);
2791             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2792             d2               = _mm256_mul_pd(d,d);
2793             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2794
2795             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2796
2797             /* Evaluate switch function */
2798             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2799             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
2800             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2801
2802             fscal            = fvdw;
2803
2804             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2805
2806             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2807
2808             /* Calculate temporary vectorial force */
2809             tx               = _mm256_mul_pd(fscal,dx00);
2810             ty               = _mm256_mul_pd(fscal,dy00);
2811             tz               = _mm256_mul_pd(fscal,dz00);
2812
2813             /* Update vectorial force */
2814             fix0             = _mm256_add_pd(fix0,tx);
2815             fiy0             = _mm256_add_pd(fiy0,ty);
2816             fiz0             = _mm256_add_pd(fiz0,tz);
2817
2818             fjx0             = _mm256_add_pd(fjx0,tx);
2819             fjy0             = _mm256_add_pd(fjy0,ty);
2820             fjz0             = _mm256_add_pd(fjz0,tz);
2821
2822             }
2823
2824             /**************************
2825              * CALCULATE INTERACTIONS *
2826              **************************/
2827
2828             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2829             {
2830
2831             r11              = _mm256_mul_pd(rsq11,rinv11);
2832             r11              = _mm256_andnot_pd(dummy_mask,r11);
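                 /* Zeroing r for the dummy lanes keeps eweps and the extracted table
                  * indices at zero for padded j entries, so the scalar table loads
                  * below stay inside the table; the bogus lane results are discarded
                  * again when fscal is masked with dummy_mask.
                  */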
2833
2834             /* EWALD ELECTROSTATICS */
2835
2836             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2837             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2838             ewitab           = _mm256_cvttpd_epi32(ewrt);
2839             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2840             ewitab           = _mm_slli_epi32(ewitab,2);
2841             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2842             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2843             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2844             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2845             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2846             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2847             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2848             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2849             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2850
2851             d                = _mm256_sub_pd(r11,rswitch);
2852             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2853             d2               = _mm256_mul_pd(d,d);
2854             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2855
2856             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2857
2858             /* Evaluate switch function */
2859             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2860             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2861             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2862
2863             fscal            = felec;
2864
2865             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2866
2867             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2868
2869             /* Calculate temporary vectorial force */
2870             tx               = _mm256_mul_pd(fscal,dx11);
2871             ty               = _mm256_mul_pd(fscal,dy11);
2872             tz               = _mm256_mul_pd(fscal,dz11);
2873
2874             /* Update vectorial force */
2875             fix1             = _mm256_add_pd(fix1,tx);
2876             fiy1             = _mm256_add_pd(fiy1,ty);
2877             fiz1             = _mm256_add_pd(fiz1,tz);
2878
2879             fjx1             = _mm256_add_pd(fjx1,tx);
2880             fjy1             = _mm256_add_pd(fjy1,ty);
2881             fjz1             = _mm256_add_pd(fjz1,tz);
2882
2883             }
2884
2885             /**************************
2886              * CALCULATE INTERACTIONS *
2887              **************************/
2888
2889             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2890             {
2891
2892             r12              = _mm256_mul_pd(rsq12,rinv12);
2893             r12              = _mm256_andnot_pd(dummy_mask,r12);
2894
2895             /* EWALD ELECTROSTATICS */
2896
2897             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2898             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2899             ewitab           = _mm256_cvttpd_epi32(ewrt);
2900             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2901             ewitab           = _mm_slli_epi32(ewitab,2);
2902             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2903             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2904             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2905             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2906             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2907             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2908             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2909             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2910             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2911
2912             d                = _mm256_sub_pd(r12,rswitch);
2913             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2914             d2               = _mm256_mul_pd(d,d);
2915             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2916
2917             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2918
2919             /* Evaluate switch function */
2920             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2921             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2922             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2923
2924             fscal            = felec;
2925
2926             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2927
2928             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2929
2930             /* Calculate temporary vectorial force */
2931             tx               = _mm256_mul_pd(fscal,dx12);
2932             ty               = _mm256_mul_pd(fscal,dy12);
2933             tz               = _mm256_mul_pd(fscal,dz12);
2934
2935             /* Update vectorial force */
2936             fix1             = _mm256_add_pd(fix1,tx);
2937             fiy1             = _mm256_add_pd(fiy1,ty);
2938             fiz1             = _mm256_add_pd(fiz1,tz);
2939
2940             fjx2             = _mm256_add_pd(fjx2,tx);
2941             fjy2             = _mm256_add_pd(fjy2,ty);
2942             fjz2             = _mm256_add_pd(fjz2,tz);
2943
2944             }
2945
2946             /**************************
2947              * CALCULATE INTERACTIONS *
2948              **************************/
2949
2950             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2951             {
2952
2953             r13              = _mm256_mul_pd(rsq13,rinv13);
2954             r13              = _mm256_andnot_pd(dummy_mask,r13);
2955
2956             /* EWALD ELECTROSTATICS */
2957
2958             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2959             ewrt             = _mm256_mul_pd(r13,ewtabscale);
2960             ewitab           = _mm256_cvttpd_epi32(ewrt);
2961             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2962             ewitab           = _mm_slli_epi32(ewitab,2);
2963             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2964             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2965             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2966             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2967             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2968             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2969             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2970             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
2971             felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2972
2973             d                = _mm256_sub_pd(r13,rswitch);
2974             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2975             d2               = _mm256_mul_pd(d,d);
2976             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2977
2978             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2979
2980             /* Evaluate switch function */
2981             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2982             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
2983             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2984
2985             fscal            = felec;
2986
2987             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2988
2989             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2990
2991             /* Calculate temporary vectorial force */
2992             tx               = _mm256_mul_pd(fscal,dx13);
2993             ty               = _mm256_mul_pd(fscal,dy13);
2994             tz               = _mm256_mul_pd(fscal,dz13);
2995
2996             /* Update vectorial force */
2997             fix1             = _mm256_add_pd(fix1,tx);
2998             fiy1             = _mm256_add_pd(fiy1,ty);
2999             fiz1             = _mm256_add_pd(fiz1,tz);
3000
3001             fjx3             = _mm256_add_pd(fjx3,tx);
3002             fjy3             = _mm256_add_pd(fjy3,ty);
3003             fjz3             = _mm256_add_pd(fjz3,tz);
3004
3005             }
3006
3007             /**************************
3008              * CALCULATE INTERACTIONS *
3009              **************************/
3010
3011             if (gmx_mm256_any_lt(rsq21,rcutoff2))
3012             {
3013
3014             r21              = _mm256_mul_pd(rsq21,rinv21);
3015             r21              = _mm256_andnot_pd(dummy_mask,r21);
3016
3017             /* EWALD ELECTROSTATICS */
3018
3019             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3020             ewrt             = _mm256_mul_pd(r21,ewtabscale);
3021             ewitab           = _mm256_cvttpd_epi32(ewrt);
3022             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3023             ewitab           = _mm_slli_epi32(ewitab,2);
3024             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3025             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3026             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3027             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3028             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3029             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3030             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3031             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
3032             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
3033
3034             d                = _mm256_sub_pd(r21,rswitch);
3035             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3036             d2               = _mm256_mul_pd(d,d);
3037             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3038
3039             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3040
3041             /* Evaluate switch function */
3042             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3043             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
3044             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
3045
3046             fscal            = felec;
3047
3048             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3049
3050             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3051
3052             /* Calculate temporary vectorial force */
3053             tx               = _mm256_mul_pd(fscal,dx21);
3054             ty               = _mm256_mul_pd(fscal,dy21);
3055             tz               = _mm256_mul_pd(fscal,dz21);
3056
3057             /* Update vectorial force */
3058             fix2             = _mm256_add_pd(fix2,tx);
3059             fiy2             = _mm256_add_pd(fiy2,ty);
3060             fiz2             = _mm256_add_pd(fiz2,tz);
3061
3062             fjx1             = _mm256_add_pd(fjx1,tx);
3063             fjy1             = _mm256_add_pd(fjy1,ty);
3064             fjz1             = _mm256_add_pd(fjz1,tz);
3065
3066             }
3067
3068             /**************************
3069              * CALCULATE INTERACTIONS *
3070              **************************/
3071
3072             if (gmx_mm256_any_lt(rsq22,rcutoff2))
3073             {
3074
3075             r22              = _mm256_mul_pd(rsq22,rinv22);
3076             r22              = _mm256_andnot_pd(dummy_mask,r22);
3077
3078             /* EWALD ELECTROSTATICS */
3079
3080             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3081             ewrt             = _mm256_mul_pd(r22,ewtabscale);
3082             ewitab           = _mm256_cvttpd_epi32(ewrt);
3083             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3084             ewitab           = _mm_slli_epi32(ewitab,2);
3085             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3086             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3087             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3088             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3089             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3090             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3091             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3092             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
3093             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
3094
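                 /* Potential-switch function: for d = max(r - rswitch, 0),
                  * sw  = 1 + swV3*d^3 + swV4*d^4 + swV5*d^5 and
                  * dsw =     swF2*d^2 + swF3*d^3 + swF4*d^4 (the derivative of sw with respect to r).
                  */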
3095             d                = _mm256_sub_pd(r22,rswitch);
3096             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3097             d2               = _mm256_mul_pd(d,d);
3098             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3099
3100             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3101
3102             /* Apply the switch function */
3103             /* fscal_sw = F_sw/r = -(v*sw)'/r = (-v'*sw - v*dsw)/r = fscal*sw - v*dsw/r */
3104             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
3105             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
3106
3107             fscal            = felec;
3108
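                 /* Zero the scalar force in lanes that are outside the cutoff and in padded (dummy) j entries. */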
3109             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3110
3111             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3112
3113             /* Calculate temporary vectorial force */
3114             tx               = _mm256_mul_pd(fscal,dx22);
3115             ty               = _mm256_mul_pd(fscal,dy22);
3116             tz               = _mm256_mul_pd(fscal,dz22);
3117
3118             /* Update vectorial force */
3119             fix2             = _mm256_add_pd(fix2,tx);
3120             fiy2             = _mm256_add_pd(fiy2,ty);
3121             fiz2             = _mm256_add_pd(fiz2,tz);
3122
3123             fjx2             = _mm256_add_pd(fjx2,tx);
3124             fjy2             = _mm256_add_pd(fjy2,ty);
3125             fjz2             = _mm256_add_pd(fjz2,tz);
3126
3127             }
3128
3129             /**************************
3130              * CALCULATE INTERACTIONS *
3131              **************************/
3132
3133             if (gmx_mm256_any_lt(rsq23,rcutoff2))
3134             {
3135
3136             r23              = _mm256_mul_pd(rsq23,rinv23);
3137             r23              = _mm256_andnot_pd(dummy_mask,r23);
3138
3139             /* EWALD ELECTROSTATICS */
3140
3141             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
3142             ewrt             = _mm256_mul_pd(r23,ewtabscale);
3143             ewitab           = _mm256_cvttpd_epi32(ewrt);
3144             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3145             ewitab           = _mm_slli_epi32(ewitab,2);
3146             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3147             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3148             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3149             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3150             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3151             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3152             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3153             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
3154             felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
3155
3156             d                = _mm256_sub_pd(r23,rswitch);
3157             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3158             d2               = _mm256_mul_pd(d,d);
3159             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3160
3161             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3162
3163             /* Apply the switch function */
3164             /* fscal_sw = F_sw/r = -(v*sw)'/r = (-v'*sw - v*dsw)/r = fscal*sw - v*dsw/r */
3165             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
3166             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
3167
3168             fscal            = felec;
3169
3170             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3171
3172             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3173
3174             /* Calculate temporary vectorial force */
3175             tx               = _mm256_mul_pd(fscal,dx23);
3176             ty               = _mm256_mul_pd(fscal,dy23);
3177             tz               = _mm256_mul_pd(fscal,dz23);
3178
3179             /* Update vectorial force */
3180             fix2             = _mm256_add_pd(fix2,tx);
3181             fiy2             = _mm256_add_pd(fiy2,ty);
3182             fiz2             = _mm256_add_pd(fiz2,tz);
3183
3184             fjx3             = _mm256_add_pd(fjx3,tx);
3185             fjy3             = _mm256_add_pd(fjy3,ty);
3186             fjz3             = _mm256_add_pd(fjz3,tz);
3187
3188             }
3189
3190             /**************************
3191              * CALCULATE INTERACTIONS *
3192              **************************/
3193
3194             if (gmx_mm256_any_lt(rsq31,rcutoff2))
3195             {
3196
3197             r31              = _mm256_mul_pd(rsq31,rinv31);
3198             r31              = _mm256_andnot_pd(dummy_mask,r31);
3199
3200             /* EWALD ELECTROSTATICS */
3201
3202             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
3203             ewrt             = _mm256_mul_pd(r31,ewtabscale);
3204             ewitab           = _mm256_cvttpd_epi32(ewrt);
3205             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3206             ewitab           = _mm_slli_epi32(ewitab,2);
3207             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3208             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3209             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3210             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3211             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3212             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3213             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3214             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
3215             felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
3216
3217             d                = _mm256_sub_pd(r31,rswitch);
3218             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3219             d2               = _mm256_mul_pd(d,d);
3220             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3221
3222             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3223
3224             /* Apply the switch function */
3225             /* fscal_sw = F_sw/r = -(v*sw)'/r = (-v'*sw - v*dsw)/r = fscal*sw - v*dsw/r */
3226             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
3227             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
3228
3229             fscal            = felec;
3230
3231             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3232
3233             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3234
3235             /* Calculate temporary vectorial force */
3236             tx               = _mm256_mul_pd(fscal,dx31);
3237             ty               = _mm256_mul_pd(fscal,dy31);
3238             tz               = _mm256_mul_pd(fscal,dz31);
3239
3240             /* Update vectorial force */
3241             fix3             = _mm256_add_pd(fix3,tx);
3242             fiy3             = _mm256_add_pd(fiy3,ty);
3243             fiz3             = _mm256_add_pd(fiz3,tz);
3244
3245             fjx1             = _mm256_add_pd(fjx1,tx);
3246             fjy1             = _mm256_add_pd(fjy1,ty);
3247             fjz1             = _mm256_add_pd(fjz1,tz);
3248
3249             }
3250
3251             /**************************
3252              * CALCULATE INTERACTIONS *
3253              **************************/
3254
3255             if (gmx_mm256_any_lt(rsq32,rcutoff2))
3256             {
3257
3258             r32              = _mm256_mul_pd(rsq32,rinv32);
3259             r32              = _mm256_andnot_pd(dummy_mask,r32);
3260
3261             /* EWALD ELECTROSTATICS */
3262
3263             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
3264             ewrt             = _mm256_mul_pd(r32,ewtabscale);
3265             ewitab           = _mm256_cvttpd_epi32(ewrt);
3266             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3267             ewitab           = _mm_slli_epi32(ewitab,2);
3268             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3269             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3270             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3271             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3272             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3273             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3274             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3275             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
3276             felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
3277
3278             d                = _mm256_sub_pd(r32,rswitch);
3279             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3280             d2               = _mm256_mul_pd(d,d);
3281             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3282
3283             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3284
3285             /* Apply the switch function */
3286             /* fscal_sw = F_sw/r = -(v*sw)'/r = (-v'*sw - v*dsw)/r = fscal*sw - v*dsw/r */
3287             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
3288             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
3289
3290             fscal            = felec;
3291
3292             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3293
3294             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3295
3296             /* Calculate temporary vectorial force */
3297             tx               = _mm256_mul_pd(fscal,dx32);
3298             ty               = _mm256_mul_pd(fscal,dy32);
3299             tz               = _mm256_mul_pd(fscal,dz32);
3300
3301             /* Update vectorial force */
3302             fix3             = _mm256_add_pd(fix3,tx);
3303             fiy3             = _mm256_add_pd(fiy3,ty);
3304             fiz3             = _mm256_add_pd(fiz3,tz);
3305
3306             fjx2             = _mm256_add_pd(fjx2,tx);
3307             fjy2             = _mm256_add_pd(fjy2,ty);
3308             fjz2             = _mm256_add_pd(fjz2,tz);
3309
3310             }
3311
3312             /**************************
3313              * CALCULATE INTERACTIONS *
3314              **************************/
3315
3316             if (gmx_mm256_any_lt(rsq33,rcutoff2))
3317             {
3318
3319             r33              = _mm256_mul_pd(rsq33,rinv33);
3320             r33              = _mm256_andnot_pd(dummy_mask,r33);
3321
3322             /* EWALD ELECTROSTATICS */
3323
3324             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
3325             ewrt             = _mm256_mul_pd(r33,ewtabscale);
3326             ewitab           = _mm256_cvttpd_epi32(ewrt);
3327             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3328             ewitab           = _mm_slli_epi32(ewitab,2);
3329             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3330             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3331             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3332             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3333             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3334             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3335             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3336             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
3337             felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
3338
3339             d                = _mm256_sub_pd(r33,rswitch);
3340             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3341             d2               = _mm256_mul_pd(d,d);
3342             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3343
3344             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3345
3346             /* Apply the switch function */
3347             /* fscal_sw = F_sw/r = -(v*sw)'/r = (-v'*sw - v*dsw)/r = fscal*sw - v*dsw/r */
3348             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
3349             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
3350
3351             fscal            = felec;
3352
3353             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3354
3355             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3356
3357             /* Calculate temporary vectorial force */
3358             tx               = _mm256_mul_pd(fscal,dx33);
3359             ty               = _mm256_mul_pd(fscal,dy33);
3360             tz               = _mm256_mul_pd(fscal,dz33);
3361
3362             /* Update vectorial force */
3363             fix3             = _mm256_add_pd(fix3,tx);
3364             fiy3             = _mm256_add_pd(fiy3,ty);
3365             fiz3             = _mm256_add_pd(fiz3,tz);
3366
3367             fjx3             = _mm256_add_pd(fjx3,tx);
3368             fjy3             = _mm256_add_pd(fjy3,ty);
3369             fjz3             = _mm256_add_pd(fjz3,tz);
3370
3371             }
3372
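                 /* Subtract the accumulated j forces from the j atoms; lanes with jnr < 0 (padding at the
                  * end of the list) are redirected to the scratch buffer so they do not touch real atom forces.
                  */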
3373             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3374             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3375             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3376             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3377
3378             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
3379                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3380                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3381
3382             /* Inner loop uses 627 flops */
3383         }
3384
3385         /* End of innermost loop */
3386
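             /* Reduce the accumulated forces on the four i atoms across SIMD lanes and add them,
              * together with the corresponding shift force, to the global force arrays.
              */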
3387         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3388                                                  f+i_coord_offset,fshift+i_shift_offset);
3389
3390         /* Increment number of inner iterations */
3391         inneriter                  += j_index_end - j_index_start;
3392
3393         /* Outer loop uses 24 flops */
3394     }
3395
3396     /* Increment number of outer iterations */
3397     outeriter        += nri;
3398
3399     /* Update outer/inner flops */
3400
3401     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*627);
3402 }