Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_avx_256_double.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_double
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
65     real             rcutoff_scalar;
66     real             *shiftvec,*fshift,*x,*f;
67     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68     real             scratch[4*DIM];
69     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70     real *           vdwioffsetptr0;
71     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72     real *           vdwioffsetptr1;
73     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74     real *           vdwioffsetptr2;
75     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
77     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
78     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
79     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
81     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
84     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
85     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
86     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
89     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
92     real             *charge;
93     int              nvdwtype;
94     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
95     int              *vdwtype;
96     real             *vdwparam;
97     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
98     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
99     __m128i          ewitab;
100     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
101     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
102     real             *ewtab;
103     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
104     real             rswitch_scalar,d_scalar;
105     __m256d          dummy_mask,cutoff_mask;
106     __m128           tmpmask0,tmpmask1;
107     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
108     __m256d          one     = _mm256_set1_pd(1.0);
109     __m256d          two     = _mm256_set1_pd(2.0);
110     x                = xx[0];
111     f                = ff[0];
112
113     nri              = nlist->nri;
114     iinr             = nlist->iinr;
115     jindex           = nlist->jindex;
116     jjnr             = nlist->jjnr;
117     shiftidx         = nlist->shift;
118     gid              = nlist->gid;
119     shiftvec         = fr->shift_vec[0];
120     fshift           = fr->fshift[0];
121     facel            = _mm256_set1_pd(fr->epsfac);
122     charge           = mdatoms->chargeA;
123     nvdwtype         = fr->ntype;
124     vdwparam         = fr->nbfp;
125     vdwtype          = mdatoms->typeA;
126
127     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
128     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff);
129     beta2            = _mm256_mul_pd(beta,beta);
130     beta3            = _mm256_mul_pd(beta,beta2);
131
132     ewtab            = fr->ic->tabq_coul_FDV0;
133     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
134     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
135
136     /* Setup water-specific parameters */
137     inr              = nlist->iinr[0];
138     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
139     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
140     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
141     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
142
143     jq0              = _mm256_set1_pd(charge[inr+0]);
144     jq1              = _mm256_set1_pd(charge[inr+1]);
145     jq2              = _mm256_set1_pd(charge[inr+2]);
146     vdwjidx0A        = 2*vdwtype[inr+0];
147     qq00             = _mm256_mul_pd(iq0,jq0);
148     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
149     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
150     qq01             = _mm256_mul_pd(iq0,jq1);
151     qq02             = _mm256_mul_pd(iq0,jq2);
152     qq10             = _mm256_mul_pd(iq1,jq0);
153     qq11             = _mm256_mul_pd(iq1,jq1);
154     qq12             = _mm256_mul_pd(iq1,jq2);
155     qq20             = _mm256_mul_pd(iq2,jq0);
156     qq21             = _mm256_mul_pd(iq2,jq1);
157     qq22             = _mm256_mul_pd(iq2,jq2);
158
159     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
160     rcutoff_scalar   = fr->rcoulomb;
161     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
162     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
163
164     rswitch_scalar   = fr->rcoulomb_switch;
165     rswitch          = _mm256_set1_pd(rswitch_scalar);
166     /* Setup switch parameters */
167     d_scalar         = rcutoff_scalar-rswitch_scalar;
168     d                = _mm256_set1_pd(d_scalar);
169     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
170     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
171     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
172     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
173     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
174     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
175
176     /* Avoid stupid compiler warnings */
177     jnrA = jnrB = jnrC = jnrD = 0;
178     j_coord_offsetA = 0;
179     j_coord_offsetB = 0;
180     j_coord_offsetC = 0;
181     j_coord_offsetD = 0;
182
183     outeriter        = 0;
184     inneriter        = 0;
185
186     for(iidx=0;iidx<4*DIM;iidx++)
187     {
188         scratch[iidx] = 0.0;
189     }
190
191     /* Start outer loop over neighborlists */
192     for(iidx=0; iidx<nri; iidx++)
193     {
194         /* Load shift vector for this list */
195         i_shift_offset   = DIM*shiftidx[iidx];
196
197         /* Load limits for loop over neighbors */
198         j_index_start    = jindex[iidx];
199         j_index_end      = jindex[iidx+1];
200
201         /* Get outer coordinate index */
202         inr              = iinr[iidx];
203         i_coord_offset   = DIM*inr;
204
205         /* Load i particle coords and add shift vector */
206         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
207                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
208
209         fix0             = _mm256_setzero_pd();
210         fiy0             = _mm256_setzero_pd();
211         fiz0             = _mm256_setzero_pd();
212         fix1             = _mm256_setzero_pd();
213         fiy1             = _mm256_setzero_pd();
214         fiz1             = _mm256_setzero_pd();
215         fix2             = _mm256_setzero_pd();
216         fiy2             = _mm256_setzero_pd();
217         fiz2             = _mm256_setzero_pd();
218
219         /* Reset potential sums */
220         velecsum         = _mm256_setzero_pd();
221         vvdwsum          = _mm256_setzero_pd();
222
223         /* Start inner kernel loop */
224         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
225         {
226
227             /* Get j neighbor index, and coordinate index */
228             jnrA             = jjnr[jidx];
229             jnrB             = jjnr[jidx+1];
230             jnrC             = jjnr[jidx+2];
231             jnrD             = jjnr[jidx+3];
232             j_coord_offsetA  = DIM*jnrA;
233             j_coord_offsetB  = DIM*jnrB;
234             j_coord_offsetC  = DIM*jnrC;
235             j_coord_offsetD  = DIM*jnrD;
236
237             /* load j atom coordinates */
238             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
239                                                  x+j_coord_offsetC,x+j_coord_offsetD,
240                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
241
242             /* Calculate displacement vector */
243             dx00             = _mm256_sub_pd(ix0,jx0);
244             dy00             = _mm256_sub_pd(iy0,jy0);
245             dz00             = _mm256_sub_pd(iz0,jz0);
246             dx01             = _mm256_sub_pd(ix0,jx1);
247             dy01             = _mm256_sub_pd(iy0,jy1);
248             dz01             = _mm256_sub_pd(iz0,jz1);
249             dx02             = _mm256_sub_pd(ix0,jx2);
250             dy02             = _mm256_sub_pd(iy0,jy2);
251             dz02             = _mm256_sub_pd(iz0,jz2);
252             dx10             = _mm256_sub_pd(ix1,jx0);
253             dy10             = _mm256_sub_pd(iy1,jy0);
254             dz10             = _mm256_sub_pd(iz1,jz0);
255             dx11             = _mm256_sub_pd(ix1,jx1);
256             dy11             = _mm256_sub_pd(iy1,jy1);
257             dz11             = _mm256_sub_pd(iz1,jz1);
258             dx12             = _mm256_sub_pd(ix1,jx2);
259             dy12             = _mm256_sub_pd(iy1,jy2);
260             dz12             = _mm256_sub_pd(iz1,jz2);
261             dx20             = _mm256_sub_pd(ix2,jx0);
262             dy20             = _mm256_sub_pd(iy2,jy0);
263             dz20             = _mm256_sub_pd(iz2,jz0);
264             dx21             = _mm256_sub_pd(ix2,jx1);
265             dy21             = _mm256_sub_pd(iy2,jy1);
266             dz21             = _mm256_sub_pd(iz2,jz1);
267             dx22             = _mm256_sub_pd(ix2,jx2);
268             dy22             = _mm256_sub_pd(iy2,jy2);
269             dz22             = _mm256_sub_pd(iz2,jz2);
270
271             /* Calculate squared distance and things based on it */
272             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
273             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
274             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
275             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
276             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
277             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
278             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
279             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
280             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
281
282             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
283             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
284             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
285             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
286             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
287             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
288             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
289             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
290             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
291
292             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
293             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
294             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
295             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
296             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
297             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
298             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
299             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
300             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
301
302             fjx0             = _mm256_setzero_pd();
303             fjy0             = _mm256_setzero_pd();
304             fjz0             = _mm256_setzero_pd();
305             fjx1             = _mm256_setzero_pd();
306             fjy1             = _mm256_setzero_pd();
307             fjz1             = _mm256_setzero_pd();
308             fjx2             = _mm256_setzero_pd();
309             fjy2             = _mm256_setzero_pd();
310             fjz2             = _mm256_setzero_pd();
311
312             /**************************
313              * CALCULATE INTERACTIONS *
314              **************************/
315
316             if (gmx_mm256_any_lt(rsq00,rcutoff2))
317             {
318
319             r00              = _mm256_mul_pd(rsq00,rinv00);
320
321             /* EWALD ELECTROSTATICS */
322
323             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
324             ewrt             = _mm256_mul_pd(r00,ewtabscale);
325             ewitab           = _mm256_cvttpd_epi32(ewrt);
326             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
327             ewitab           = _mm_slli_epi32(ewitab,2);
328             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
329             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
330             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
331             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
332             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
333             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
334             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
335             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
336             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
337
338             /* LENNARD-JONES DISPERSION/REPULSION */
339
340             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
341             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
342             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
343             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
344             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
345
346             d                = _mm256_sub_pd(r00,rswitch);
347             d                = _mm256_max_pd(d,_mm256_setzero_pd());
348             d2               = _mm256_mul_pd(d,d);
349             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
350
351             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
352
353             /* Evaluate switch function */
354             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
355             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
356             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
357             velec            = _mm256_mul_pd(velec,sw);
358             vvdw             = _mm256_mul_pd(vvdw,sw);
359             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
360
361             /* Update potential sum for this i atom from the interaction with this j atom. */
362             velec            = _mm256_and_pd(velec,cutoff_mask);
363             velecsum         = _mm256_add_pd(velecsum,velec);
364             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
365             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
366
367             fscal            = _mm256_add_pd(felec,fvdw);
368
369             fscal            = _mm256_and_pd(fscal,cutoff_mask);
370
371             /* Calculate temporary vectorial force */
372             tx               = _mm256_mul_pd(fscal,dx00);
373             ty               = _mm256_mul_pd(fscal,dy00);
374             tz               = _mm256_mul_pd(fscal,dz00);
375
376             /* Update vectorial force */
377             fix0             = _mm256_add_pd(fix0,tx);
378             fiy0             = _mm256_add_pd(fiy0,ty);
379             fiz0             = _mm256_add_pd(fiz0,tz);
380
381             fjx0             = _mm256_add_pd(fjx0,tx);
382             fjy0             = _mm256_add_pd(fjy0,ty);
383             fjz0             = _mm256_add_pd(fjz0,tz);
384
385             }
386
387             /**************************
388              * CALCULATE INTERACTIONS *
389              **************************/
390
391             if (gmx_mm256_any_lt(rsq01,rcutoff2))
392             {
393
394             r01              = _mm256_mul_pd(rsq01,rinv01);
395
396             /* EWALD ELECTROSTATICS */
397
398             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
399             ewrt             = _mm256_mul_pd(r01,ewtabscale);
400             ewitab           = _mm256_cvttpd_epi32(ewrt);
401             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
402             ewitab           = _mm_slli_epi32(ewitab,2);
403             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
404             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
405             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
406             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
407             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
408             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
409             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
410             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
411             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
412
413             d                = _mm256_sub_pd(r01,rswitch);
414             d                = _mm256_max_pd(d,_mm256_setzero_pd());
415             d2               = _mm256_mul_pd(d,d);
416             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
417
418             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
419
420             /* Evaluate switch function */
421             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
422             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv01,_mm256_mul_pd(velec,dsw)) );
423             velec            = _mm256_mul_pd(velec,sw);
424             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
425
426             /* Update potential sum for this i atom from the interaction with this j atom. */
427             velec            = _mm256_and_pd(velec,cutoff_mask);
428             velecsum         = _mm256_add_pd(velecsum,velec);
429
430             fscal            = felec;
431
432             fscal            = _mm256_and_pd(fscal,cutoff_mask);
433
434             /* Calculate temporary vectorial force */
435             tx               = _mm256_mul_pd(fscal,dx01);
436             ty               = _mm256_mul_pd(fscal,dy01);
437             tz               = _mm256_mul_pd(fscal,dz01);
438
439             /* Update vectorial force */
440             fix0             = _mm256_add_pd(fix0,tx);
441             fiy0             = _mm256_add_pd(fiy0,ty);
442             fiz0             = _mm256_add_pd(fiz0,tz);
443
444             fjx1             = _mm256_add_pd(fjx1,tx);
445             fjy1             = _mm256_add_pd(fjy1,ty);
446             fjz1             = _mm256_add_pd(fjz1,tz);
447
448             }
449
450             /**************************
451              * CALCULATE INTERACTIONS *
452              **************************/
453
454             if (gmx_mm256_any_lt(rsq02,rcutoff2))
455             {
456
457             r02              = _mm256_mul_pd(rsq02,rinv02);
458
459             /* EWALD ELECTROSTATICS */
460
461             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
462             ewrt             = _mm256_mul_pd(r02,ewtabscale);
463             ewitab           = _mm256_cvttpd_epi32(ewrt);
464             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
465             ewitab           = _mm_slli_epi32(ewitab,2);
466             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
467             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
468             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
469             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
470             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
471             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
472             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
473             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
474             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
475
476             d                = _mm256_sub_pd(r02,rswitch);
477             d                = _mm256_max_pd(d,_mm256_setzero_pd());
478             d2               = _mm256_mul_pd(d,d);
479             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
480
481             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
482
483             /* Evaluate switch function */
484             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
485             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv02,_mm256_mul_pd(velec,dsw)) );
486             velec            = _mm256_mul_pd(velec,sw);
487             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
488
489             /* Update potential sum for this i atom from the interaction with this j atom. */
490             velec            = _mm256_and_pd(velec,cutoff_mask);
491             velecsum         = _mm256_add_pd(velecsum,velec);
492
493             fscal            = felec;
494
495             fscal            = _mm256_and_pd(fscal,cutoff_mask);
496
497             /* Calculate temporary vectorial force */
498             tx               = _mm256_mul_pd(fscal,dx02);
499             ty               = _mm256_mul_pd(fscal,dy02);
500             tz               = _mm256_mul_pd(fscal,dz02);
501
502             /* Update vectorial force */
503             fix0             = _mm256_add_pd(fix0,tx);
504             fiy0             = _mm256_add_pd(fiy0,ty);
505             fiz0             = _mm256_add_pd(fiz0,tz);
506
507             fjx2             = _mm256_add_pd(fjx2,tx);
508             fjy2             = _mm256_add_pd(fjy2,ty);
509             fjz2             = _mm256_add_pd(fjz2,tz);
510
511             }
512
513             /**************************
514              * CALCULATE INTERACTIONS *
515              **************************/
516
517             if (gmx_mm256_any_lt(rsq10,rcutoff2))
518             {
519
520             r10              = _mm256_mul_pd(rsq10,rinv10);
521
522             /* EWALD ELECTROSTATICS */
523
524             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
525             ewrt             = _mm256_mul_pd(r10,ewtabscale);
526             ewitab           = _mm256_cvttpd_epi32(ewrt);
527             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
528             ewitab           = _mm_slli_epi32(ewitab,2);
529             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
530             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
531             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
532             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
533             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
534             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
535             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
536             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
537             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
538
539             d                = _mm256_sub_pd(r10,rswitch);
540             d                = _mm256_max_pd(d,_mm256_setzero_pd());
541             d2               = _mm256_mul_pd(d,d);
542             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
543
544             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
545
546             /* Evaluate switch function */
547             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
548             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
549             velec            = _mm256_mul_pd(velec,sw);
550             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
551
552             /* Update potential sum for this i atom from the interaction with this j atom. */
553             velec            = _mm256_and_pd(velec,cutoff_mask);
554             velecsum         = _mm256_add_pd(velecsum,velec);
555
556             fscal            = felec;
557
558             fscal            = _mm256_and_pd(fscal,cutoff_mask);
559
560             /* Calculate temporary vectorial force */
561             tx               = _mm256_mul_pd(fscal,dx10);
562             ty               = _mm256_mul_pd(fscal,dy10);
563             tz               = _mm256_mul_pd(fscal,dz10);
564
565             /* Update vectorial force */
566             fix1             = _mm256_add_pd(fix1,tx);
567             fiy1             = _mm256_add_pd(fiy1,ty);
568             fiz1             = _mm256_add_pd(fiz1,tz);
569
570             fjx0             = _mm256_add_pd(fjx0,tx);
571             fjy0             = _mm256_add_pd(fjy0,ty);
572             fjz0             = _mm256_add_pd(fjz0,tz);
573
574             }
575
576             /**************************
577              * CALCULATE INTERACTIONS *
578              **************************/
579
580             if (gmx_mm256_any_lt(rsq11,rcutoff2))
581             {
582
583             r11              = _mm256_mul_pd(rsq11,rinv11);
584
585             /* EWALD ELECTROSTATICS */
586
587             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
588             ewrt             = _mm256_mul_pd(r11,ewtabscale);
589             ewitab           = _mm256_cvttpd_epi32(ewrt);
590             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
591             ewitab           = _mm_slli_epi32(ewitab,2);
592             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
593             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
594             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
595             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
596             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
597             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
598             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
599             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
600             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
601
602             d                = _mm256_sub_pd(r11,rswitch);
603             d                = _mm256_max_pd(d,_mm256_setzero_pd());
604             d2               = _mm256_mul_pd(d,d);
605             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
606
607             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
608
609             /* Evaluate switch function */
610             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
611             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
612             velec            = _mm256_mul_pd(velec,sw);
613             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
614
615             /* Update potential sum for this i atom from the interaction with this j atom. */
616             velec            = _mm256_and_pd(velec,cutoff_mask);
617             velecsum         = _mm256_add_pd(velecsum,velec);
618
619             fscal            = felec;
620
621             fscal            = _mm256_and_pd(fscal,cutoff_mask);
622
623             /* Calculate temporary vectorial force */
624             tx               = _mm256_mul_pd(fscal,dx11);
625             ty               = _mm256_mul_pd(fscal,dy11);
626             tz               = _mm256_mul_pd(fscal,dz11);
627
628             /* Update vectorial force */
629             fix1             = _mm256_add_pd(fix1,tx);
630             fiy1             = _mm256_add_pd(fiy1,ty);
631             fiz1             = _mm256_add_pd(fiz1,tz);
632
633             fjx1             = _mm256_add_pd(fjx1,tx);
634             fjy1             = _mm256_add_pd(fjy1,ty);
635             fjz1             = _mm256_add_pd(fjz1,tz);
636
637             }
638
639             /**************************
640              * CALCULATE INTERACTIONS *
641              **************************/
642
643             if (gmx_mm256_any_lt(rsq12,rcutoff2))
644             {
645
646             r12              = _mm256_mul_pd(rsq12,rinv12);
647
648             /* EWALD ELECTROSTATICS */
649
650             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
651             ewrt             = _mm256_mul_pd(r12,ewtabscale);
652             ewitab           = _mm256_cvttpd_epi32(ewrt);
653             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
654             ewitab           = _mm_slli_epi32(ewitab,2);
655             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
656             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
657             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
658             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
659             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
660             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
661             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
662             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
663             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
664
665             d                = _mm256_sub_pd(r12,rswitch);
666             d                = _mm256_max_pd(d,_mm256_setzero_pd());
667             d2               = _mm256_mul_pd(d,d);
668             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
669
670             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
671
672             /* Evaluate switch function */
673             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
674             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
675             velec            = _mm256_mul_pd(velec,sw);
676             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
677
678             /* Update potential sum for this i atom from the interaction with this j atom. */
679             velec            = _mm256_and_pd(velec,cutoff_mask);
680             velecsum         = _mm256_add_pd(velecsum,velec);
681
682             fscal            = felec;
683
684             fscal            = _mm256_and_pd(fscal,cutoff_mask);
685
686             /* Calculate temporary vectorial force */
687             tx               = _mm256_mul_pd(fscal,dx12);
688             ty               = _mm256_mul_pd(fscal,dy12);
689             tz               = _mm256_mul_pd(fscal,dz12);
690
691             /* Update vectorial force */
692             fix1             = _mm256_add_pd(fix1,tx);
693             fiy1             = _mm256_add_pd(fiy1,ty);
694             fiz1             = _mm256_add_pd(fiz1,tz);
695
696             fjx2             = _mm256_add_pd(fjx2,tx);
697             fjy2             = _mm256_add_pd(fjy2,ty);
698             fjz2             = _mm256_add_pd(fjz2,tz);
699
700             }
701
702             /**************************
703              * CALCULATE INTERACTIONS *
704              **************************/
705
706             if (gmx_mm256_any_lt(rsq20,rcutoff2))
707             {
708
709             r20              = _mm256_mul_pd(rsq20,rinv20);
710
711             /* EWALD ELECTROSTATICS */
712
713             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
714             ewrt             = _mm256_mul_pd(r20,ewtabscale);
715             ewitab           = _mm256_cvttpd_epi32(ewrt);
716             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
717             ewitab           = _mm_slli_epi32(ewitab,2);
718             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
719             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
720             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
721             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
722             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
723             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
724             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
725             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
726             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
727
728             d                = _mm256_sub_pd(r20,rswitch);
729             d                = _mm256_max_pd(d,_mm256_setzero_pd());
730             d2               = _mm256_mul_pd(d,d);
731             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
732
733             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
734
735             /* Evaluate switch function */
736             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
737             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
738             velec            = _mm256_mul_pd(velec,sw);
739             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
740
741             /* Update potential sum for this i atom from the interaction with this j atom. */
742             velec            = _mm256_and_pd(velec,cutoff_mask);
743             velecsum         = _mm256_add_pd(velecsum,velec);
744
745             fscal            = felec;
746
747             fscal            = _mm256_and_pd(fscal,cutoff_mask);
748
749             /* Calculate temporary vectorial force */
750             tx               = _mm256_mul_pd(fscal,dx20);
751             ty               = _mm256_mul_pd(fscal,dy20);
752             tz               = _mm256_mul_pd(fscal,dz20);
753
754             /* Update vectorial force */
755             fix2             = _mm256_add_pd(fix2,tx);
756             fiy2             = _mm256_add_pd(fiy2,ty);
757             fiz2             = _mm256_add_pd(fiz2,tz);
758
759             fjx0             = _mm256_add_pd(fjx0,tx);
760             fjy0             = _mm256_add_pd(fjy0,ty);
761             fjz0             = _mm256_add_pd(fjz0,tz);
762
763             }
764
765             /**************************
766              * CALCULATE INTERACTIONS *
767              **************************/
768
769             if (gmx_mm256_any_lt(rsq21,rcutoff2))
770             {
771
772             r21              = _mm256_mul_pd(rsq21,rinv21);
773
774             /* EWALD ELECTROSTATICS */
775
776             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
777             ewrt             = _mm256_mul_pd(r21,ewtabscale);
778             ewitab           = _mm256_cvttpd_epi32(ewrt);
779             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
780             ewitab           = _mm_slli_epi32(ewitab,2);
781             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
782             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
783             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
784             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
785             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
786             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
787             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
788             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
789             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
790
791             d                = _mm256_sub_pd(r21,rswitch);
792             d                = _mm256_max_pd(d,_mm256_setzero_pd());
793             d2               = _mm256_mul_pd(d,d);
794             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
795
796             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
797
798             /* Evaluate switch function */
799             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
800             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
801             velec            = _mm256_mul_pd(velec,sw);
802             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
803
804             /* Update potential sum for this i atom from the interaction with this j atom. */
805             velec            = _mm256_and_pd(velec,cutoff_mask);
806             velecsum         = _mm256_add_pd(velecsum,velec);
807
808             fscal            = felec;
809
810             fscal            = _mm256_and_pd(fscal,cutoff_mask);
811
812             /* Calculate temporary vectorial force */
813             tx               = _mm256_mul_pd(fscal,dx21);
814             ty               = _mm256_mul_pd(fscal,dy21);
815             tz               = _mm256_mul_pd(fscal,dz21);
816
817             /* Update vectorial force */
818             fix2             = _mm256_add_pd(fix2,tx);
819             fiy2             = _mm256_add_pd(fiy2,ty);
820             fiz2             = _mm256_add_pd(fiz2,tz);
821
822             fjx1             = _mm256_add_pd(fjx1,tx);
823             fjy1             = _mm256_add_pd(fjy1,ty);
824             fjz1             = _mm256_add_pd(fjz1,tz);
825
826             }
827
828             /**************************
829              * CALCULATE INTERACTIONS *
830              **************************/
831
832             if (gmx_mm256_any_lt(rsq22,rcutoff2))
833             {
834
835             r22              = _mm256_mul_pd(rsq22,rinv22);
836
837             /* EWALD ELECTROSTATICS */
838
839             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
840             ewrt             = _mm256_mul_pd(r22,ewtabscale);
841             ewitab           = _mm256_cvttpd_epi32(ewrt);
842             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
843             ewitab           = _mm_slli_epi32(ewitab,2);
844             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
845             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
846             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
847             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
848             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
849             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
850             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
851             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
852             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
853
854             d                = _mm256_sub_pd(r22,rswitch);
855             d                = _mm256_max_pd(d,_mm256_setzero_pd());
856             d2               = _mm256_mul_pd(d,d);
857             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
858
859             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
860
861             /* Evaluate switch function */
862             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
863             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
864             velec            = _mm256_mul_pd(velec,sw);
865             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
866
867             /* Update potential sum for this i atom from the interaction with this j atom. */
868             velec            = _mm256_and_pd(velec,cutoff_mask);
869             velecsum         = _mm256_add_pd(velecsum,velec);
870
871             fscal            = felec;
872
873             fscal            = _mm256_and_pd(fscal,cutoff_mask);
874
875             /* Calculate temporary vectorial force */
876             tx               = _mm256_mul_pd(fscal,dx22);
877             ty               = _mm256_mul_pd(fscal,dy22);
878             tz               = _mm256_mul_pd(fscal,dz22);
879
880             /* Update vectorial force */
881             fix2             = _mm256_add_pd(fix2,tx);
882             fiy2             = _mm256_add_pd(fiy2,ty);
883             fiz2             = _mm256_add_pd(fiz2,tz);
884
885             fjx2             = _mm256_add_pd(fjx2,tx);
886             fjy2             = _mm256_add_pd(fjy2,ty);
887             fjz2             = _mm256_add_pd(fjz2,tz);
888
889             }
890
891             fjptrA             = f+j_coord_offsetA;
892             fjptrB             = f+j_coord_offsetB;
893             fjptrC             = f+j_coord_offsetC;
894             fjptrD             = f+j_coord_offsetD;
895
896             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
897                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
898
899             /* Inner loop uses 603 flops */
900         }
901
902         if(jidx<j_index_end)
903         {
904
905             /* Get j neighbor index, and coordinate index */
906             jnrlistA         = jjnr[jidx];
907             jnrlistB         = jjnr[jidx+1];
908             jnrlistC         = jjnr[jidx+2];
909             jnrlistD         = jjnr[jidx+3];
910             /* Sign of each element will be negative for non-real atoms.
911              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
912              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
913              */
914             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
915
916             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
917             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
918             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
919
920             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
921             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
922             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
923             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
924             j_coord_offsetA  = DIM*jnrA;
925             j_coord_offsetB  = DIM*jnrB;
926             j_coord_offsetC  = DIM*jnrC;
927             j_coord_offsetD  = DIM*jnrD;
928
929             /* load j atom coordinates */
930             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
931                                                  x+j_coord_offsetC,x+j_coord_offsetD,
932                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
933
934             /* Calculate displacement vector */
935             dx00             = _mm256_sub_pd(ix0,jx0);
936             dy00             = _mm256_sub_pd(iy0,jy0);
937             dz00             = _mm256_sub_pd(iz0,jz0);
938             dx01             = _mm256_sub_pd(ix0,jx1);
939             dy01             = _mm256_sub_pd(iy0,jy1);
940             dz01             = _mm256_sub_pd(iz0,jz1);
941             dx02             = _mm256_sub_pd(ix0,jx2);
942             dy02             = _mm256_sub_pd(iy0,jy2);
943             dz02             = _mm256_sub_pd(iz0,jz2);
944             dx10             = _mm256_sub_pd(ix1,jx0);
945             dy10             = _mm256_sub_pd(iy1,jy0);
946             dz10             = _mm256_sub_pd(iz1,jz0);
947             dx11             = _mm256_sub_pd(ix1,jx1);
948             dy11             = _mm256_sub_pd(iy1,jy1);
949             dz11             = _mm256_sub_pd(iz1,jz1);
950             dx12             = _mm256_sub_pd(ix1,jx2);
951             dy12             = _mm256_sub_pd(iy1,jy2);
952             dz12             = _mm256_sub_pd(iz1,jz2);
953             dx20             = _mm256_sub_pd(ix2,jx0);
954             dy20             = _mm256_sub_pd(iy2,jy0);
955             dz20             = _mm256_sub_pd(iz2,jz0);
956             dx21             = _mm256_sub_pd(ix2,jx1);
957             dy21             = _mm256_sub_pd(iy2,jy1);
958             dz21             = _mm256_sub_pd(iz2,jz1);
959             dx22             = _mm256_sub_pd(ix2,jx2);
960             dy22             = _mm256_sub_pd(iy2,jy2);
961             dz22             = _mm256_sub_pd(iz2,jz2);
962
963             /* Calculate squared distance and things based on it */
964             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
965             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
966             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
967             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
968             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
969             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
970             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
971             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
972             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
973
974             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
975             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
976             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
977             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
978             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
979             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
980             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
981             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
982             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
983
984             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
985             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
986             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
987             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
988             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
989             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
990             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
991             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
992             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
993
994             fjx0             = _mm256_setzero_pd();
995             fjy0             = _mm256_setzero_pd();
996             fjz0             = _mm256_setzero_pd();
997             fjx1             = _mm256_setzero_pd();
998             fjy1             = _mm256_setzero_pd();
999             fjz1             = _mm256_setzero_pd();
1000             fjx2             = _mm256_setzero_pd();
1001             fjy2             = _mm256_setzero_pd();
1002             fjz2             = _mm256_setzero_pd();
1003
1004             /**************************
1005              * CALCULATE INTERACTIONS *
1006              **************************/
1007
1008             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1009             {
1010
1011             r00              = _mm256_mul_pd(rsq00,rinv00);
1012             r00              = _mm256_andnot_pd(dummy_mask,r00);
1013
1014             /* EWALD ELECTROSTATICS */
1015
1016             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1017             ewrt             = _mm256_mul_pd(r00,ewtabscale);
1018             ewitab           = _mm256_cvttpd_epi32(ewrt);
1019             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1020             ewitab           = _mm_slli_epi32(ewitab,2);
1021             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1022             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1023             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1024             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1025             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1026             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1027             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1028             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
1029             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1030
1031             /* LENNARD-JONES DISPERSION/REPULSION */
1032
1033             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1034             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
1035             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1036             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1037             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1038
1039             d                = _mm256_sub_pd(r00,rswitch);
1040             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1041             d2               = _mm256_mul_pd(d,d);
1042             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1043
1044             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1045
1046             /* Evaluate switch function */
1047             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1048             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
1049             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1050             velec            = _mm256_mul_pd(velec,sw);
1051             vvdw             = _mm256_mul_pd(vvdw,sw);
1052             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1053
1054             /* Update potential sum for this i atom from the interaction with this j atom. */
1055             velec            = _mm256_and_pd(velec,cutoff_mask);
1056             velec            = _mm256_andnot_pd(dummy_mask,velec);
1057             velecsum         = _mm256_add_pd(velecsum,velec);
1058             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
1059             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
1060             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
1061
1062             fscal            = _mm256_add_pd(felec,fvdw);
1063
1064             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1065
1066             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1067
1068             /* Calculate temporary vectorial force */
1069             tx               = _mm256_mul_pd(fscal,dx00);
1070             ty               = _mm256_mul_pd(fscal,dy00);
1071             tz               = _mm256_mul_pd(fscal,dz00);
1072
1073             /* Update vectorial force */
1074             fix0             = _mm256_add_pd(fix0,tx);
1075             fiy0             = _mm256_add_pd(fiy0,ty);
1076             fiz0             = _mm256_add_pd(fiz0,tz);
1077
1078             fjx0             = _mm256_add_pd(fjx0,tx);
1079             fjy0             = _mm256_add_pd(fjy0,ty);
1080             fjz0             = _mm256_add_pd(fjz0,tz);
1081
1082             }
1083
1084             /**************************
1085              * CALCULATE INTERACTIONS *
1086              **************************/
1087
1088             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1089             {
1090
1091             r01              = _mm256_mul_pd(rsq01,rinv01);
1092             r01              = _mm256_andnot_pd(dummy_mask,r01);
1093
1094             /* EWALD ELECTROSTATICS */
1095
1096             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1097             ewrt             = _mm256_mul_pd(r01,ewtabscale);
1098             ewitab           = _mm256_cvttpd_epi32(ewrt);
1099             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1100             ewitab           = _mm_slli_epi32(ewitab,2);
1101             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1102             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1103             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1104             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1105             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1106             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1107             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1108             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
1109             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1110
1111             d                = _mm256_sub_pd(r01,rswitch);
1112             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1113             d2               = _mm256_mul_pd(d,d);
1114             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1115
1116             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1117
1118             /* Evaluate switch function */
1119             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1120             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv01,_mm256_mul_pd(velec,dsw)) );
1121             velec            = _mm256_mul_pd(velec,sw);
1122             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1123
1124             /* Update potential sum for this i atom from the interaction with this j atom. */
1125             velec            = _mm256_and_pd(velec,cutoff_mask);
1126             velec            = _mm256_andnot_pd(dummy_mask,velec);
1127             velecsum         = _mm256_add_pd(velecsum,velec);
1128
1129             fscal            = felec;
1130
1131             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1132
1133             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1134
1135             /* Calculate temporary vectorial force */
1136             tx               = _mm256_mul_pd(fscal,dx01);
1137             ty               = _mm256_mul_pd(fscal,dy01);
1138             tz               = _mm256_mul_pd(fscal,dz01);
1139
1140             /* Update vectorial force */
1141             fix0             = _mm256_add_pd(fix0,tx);
1142             fiy0             = _mm256_add_pd(fiy0,ty);
1143             fiz0             = _mm256_add_pd(fiz0,tz);
1144
1145             fjx1             = _mm256_add_pd(fjx1,tx);
1146             fjy1             = _mm256_add_pd(fjy1,ty);
1147             fjz1             = _mm256_add_pd(fjz1,tz);
1148
1149             }
1150
1151             /**************************
1152              * CALCULATE INTERACTIONS *
1153              **************************/
1154
1155             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1156             {
1157
1158             r02              = _mm256_mul_pd(rsq02,rinv02);
1159             r02              = _mm256_andnot_pd(dummy_mask,r02);
1160
1161             /* EWALD ELECTROSTATICS */
1162
1163             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1164             ewrt             = _mm256_mul_pd(r02,ewtabscale);
1165             ewitab           = _mm256_cvttpd_epi32(ewrt);
1166             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1167             ewitab           = _mm_slli_epi32(ewitab,2);
1168             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1169             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1170             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1171             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1172             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1173             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1174             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1175             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
1176             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1177
1178             d                = _mm256_sub_pd(r02,rswitch);
1179             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1180             d2               = _mm256_mul_pd(d,d);
1181             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1182
1183             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1184
1185             /* Evaluate switch function */
1186             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1187             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv02,_mm256_mul_pd(velec,dsw)) );
1188             velec            = _mm256_mul_pd(velec,sw);
1189             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1190
1191             /* Update potential sum for this i atom from the interaction with this j atom. */
1192             velec            = _mm256_and_pd(velec,cutoff_mask);
1193             velec            = _mm256_andnot_pd(dummy_mask,velec);
1194             velecsum         = _mm256_add_pd(velecsum,velec);
1195
1196             fscal            = felec;
1197
1198             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1199
1200             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1201
1202             /* Calculate temporary vectorial force */
1203             tx               = _mm256_mul_pd(fscal,dx02);
1204             ty               = _mm256_mul_pd(fscal,dy02);
1205             tz               = _mm256_mul_pd(fscal,dz02);
1206
1207             /* Update vectorial force */
1208             fix0             = _mm256_add_pd(fix0,tx);
1209             fiy0             = _mm256_add_pd(fiy0,ty);
1210             fiz0             = _mm256_add_pd(fiz0,tz);
1211
1212             fjx2             = _mm256_add_pd(fjx2,tx);
1213             fjy2             = _mm256_add_pd(fjy2,ty);
1214             fjz2             = _mm256_add_pd(fjz2,tz);
1215
1216             }
1217
1218             /**************************
1219              * CALCULATE INTERACTIONS *
1220              **************************/
1221
1222             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1223             {
1224
1225             r10              = _mm256_mul_pd(rsq10,rinv10);
1226             r10              = _mm256_andnot_pd(dummy_mask,r10);
1227
1228             /* EWALD ELECTROSTATICS */
1229
1230             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1231             ewrt             = _mm256_mul_pd(r10,ewtabscale);
1232             ewitab           = _mm256_cvttpd_epi32(ewrt);
1233             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1234             ewitab           = _mm_slli_epi32(ewitab,2);
1235             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1236             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1237             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1238             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1239             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1240             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1241             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1242             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
1243             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1244
1245             d                = _mm256_sub_pd(r10,rswitch);
1246             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1247             d2               = _mm256_mul_pd(d,d);
1248             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1249
1250             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1251
1252             /* Evaluate switch function */
1253             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1254             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
1255             velec            = _mm256_mul_pd(velec,sw);
1256             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1257
1258             /* Update potential sum for this i atom from the interaction with this j atom. */
1259             velec            = _mm256_and_pd(velec,cutoff_mask);
1260             velec            = _mm256_andnot_pd(dummy_mask,velec);
1261             velecsum         = _mm256_add_pd(velecsum,velec);
1262
1263             fscal            = felec;
1264
1265             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1266
1267             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1268
1269             /* Calculate temporary vectorial force */
1270             tx               = _mm256_mul_pd(fscal,dx10);
1271             ty               = _mm256_mul_pd(fscal,dy10);
1272             tz               = _mm256_mul_pd(fscal,dz10);
1273
1274             /* Update vectorial force */
1275             fix1             = _mm256_add_pd(fix1,tx);
1276             fiy1             = _mm256_add_pd(fiy1,ty);
1277             fiz1             = _mm256_add_pd(fiz1,tz);
1278
1279             fjx0             = _mm256_add_pd(fjx0,tx);
1280             fjy0             = _mm256_add_pd(fjy0,ty);
1281             fjz0             = _mm256_add_pd(fjz0,tz);
1282
1283             }
1284
1285             /**************************
1286              * CALCULATE INTERACTIONS *
1287              **************************/
1288
1289             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1290             {
1291
1292             r11              = _mm256_mul_pd(rsq11,rinv11);
1293             r11              = _mm256_andnot_pd(dummy_mask,r11);
1294
1295             /* EWALD ELECTROSTATICS */
1296
1297             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1298             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1299             ewitab           = _mm256_cvttpd_epi32(ewrt);
1300             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1301             ewitab           = _mm_slli_epi32(ewitab,2);
1302             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1303             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1304             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1305             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1306             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1307             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1308             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1309             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1310             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1311
1312             d                = _mm256_sub_pd(r11,rswitch);
1313             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1314             d2               = _mm256_mul_pd(d,d);
1315             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1316
1317             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1318
1319             /* Evaluate switch function */
1320             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1321             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
1322             velec            = _mm256_mul_pd(velec,sw);
1323             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1324
1325             /* Update potential sum for this i atom from the interaction with this j atom. */
1326             velec            = _mm256_and_pd(velec,cutoff_mask);
1327             velec            = _mm256_andnot_pd(dummy_mask,velec);
1328             velecsum         = _mm256_add_pd(velecsum,velec);
1329
1330             fscal            = felec;
1331
1332             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1333
1334             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1335
1336             /* Calculate temporary vectorial force */
1337             tx               = _mm256_mul_pd(fscal,dx11);
1338             ty               = _mm256_mul_pd(fscal,dy11);
1339             tz               = _mm256_mul_pd(fscal,dz11);
1340
1341             /* Update vectorial force */
1342             fix1             = _mm256_add_pd(fix1,tx);
1343             fiy1             = _mm256_add_pd(fiy1,ty);
1344             fiz1             = _mm256_add_pd(fiz1,tz);
1345
1346             fjx1             = _mm256_add_pd(fjx1,tx);
1347             fjy1             = _mm256_add_pd(fjy1,ty);
1348             fjz1             = _mm256_add_pd(fjz1,tz);
1349
1350             }
1351
1352             /**************************
1353              * CALCULATE INTERACTIONS *
1354              **************************/
1355
1356             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1357             {
1358
1359             r12              = _mm256_mul_pd(rsq12,rinv12);
1360             r12              = _mm256_andnot_pd(dummy_mask,r12);
1361
1362             /* EWALD ELECTROSTATICS */
1363
1364             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1365             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1366             ewitab           = _mm256_cvttpd_epi32(ewrt);
1367             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1368             ewitab           = _mm_slli_epi32(ewitab,2);
1369             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1370             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1371             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1372             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1373             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1374             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1375             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1376             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1377             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1378
1379             d                = _mm256_sub_pd(r12,rswitch);
1380             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1381             d2               = _mm256_mul_pd(d,d);
1382             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1383
1384             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1385
1386             /* Evaluate switch function */
1387             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1388             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
1389             velec            = _mm256_mul_pd(velec,sw);
1390             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1391
1392             /* Update potential sum for this i atom from the interaction with this j atom. */
1393             velec            = _mm256_and_pd(velec,cutoff_mask);
1394             velec            = _mm256_andnot_pd(dummy_mask,velec);
1395             velecsum         = _mm256_add_pd(velecsum,velec);
1396
1397             fscal            = felec;
1398
1399             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1400
1401             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1402
1403             /* Calculate temporary vectorial force */
1404             tx               = _mm256_mul_pd(fscal,dx12);
1405             ty               = _mm256_mul_pd(fscal,dy12);
1406             tz               = _mm256_mul_pd(fscal,dz12);
1407
1408             /* Update vectorial force */
1409             fix1             = _mm256_add_pd(fix1,tx);
1410             fiy1             = _mm256_add_pd(fiy1,ty);
1411             fiz1             = _mm256_add_pd(fiz1,tz);
1412
1413             fjx2             = _mm256_add_pd(fjx2,tx);
1414             fjy2             = _mm256_add_pd(fjy2,ty);
1415             fjz2             = _mm256_add_pd(fjz2,tz);
1416
1417             }
1418
1419             /**************************
1420              * CALCULATE INTERACTIONS *
1421              **************************/
1422
1423             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1424             {
1425
1426             r20              = _mm256_mul_pd(rsq20,rinv20);
1427             r20              = _mm256_andnot_pd(dummy_mask,r20);
1428
1429             /* EWALD ELECTROSTATICS */
1430
1431             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1432             ewrt             = _mm256_mul_pd(r20,ewtabscale);
1433             ewitab           = _mm256_cvttpd_epi32(ewrt);
1434             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1435             ewitab           = _mm_slli_epi32(ewitab,2);
1436             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1437             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1438             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1439             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1440             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1441             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1442             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1443             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1444             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1445
1446             d                = _mm256_sub_pd(r20,rswitch);
1447             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1448             d2               = _mm256_mul_pd(d,d);
1449             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1450
1451             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1452
1453             /* Evaluate switch function */
1454             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1455             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
1456             velec            = _mm256_mul_pd(velec,sw);
1457             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1458
1459             /* Update potential sum for this i atom from the interaction with this j atom. */
1460             velec            = _mm256_and_pd(velec,cutoff_mask);
1461             velec            = _mm256_andnot_pd(dummy_mask,velec);
1462             velecsum         = _mm256_add_pd(velecsum,velec);
1463
1464             fscal            = felec;
1465
1466             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1467
1468             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1469
1470             /* Calculate temporary vectorial force */
1471             tx               = _mm256_mul_pd(fscal,dx20);
1472             ty               = _mm256_mul_pd(fscal,dy20);
1473             tz               = _mm256_mul_pd(fscal,dz20);
1474
1475             /* Update vectorial force */
1476             fix2             = _mm256_add_pd(fix2,tx);
1477             fiy2             = _mm256_add_pd(fiy2,ty);
1478             fiz2             = _mm256_add_pd(fiz2,tz);
1479
1480             fjx0             = _mm256_add_pd(fjx0,tx);
1481             fjy0             = _mm256_add_pd(fjy0,ty);
1482             fjz0             = _mm256_add_pd(fjz0,tz);
1483
1484             }
1485
1486             /**************************
1487              * CALCULATE INTERACTIONS *
1488              **************************/
1489
1490             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1491             {
1492
1493             r21              = _mm256_mul_pd(rsq21,rinv21);
1494             r21              = _mm256_andnot_pd(dummy_mask,r21);
1495
1496             /* EWALD ELECTROSTATICS */
1497
1498             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1499             ewrt             = _mm256_mul_pd(r21,ewtabscale);
1500             ewitab           = _mm256_cvttpd_epi32(ewrt);
1501             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1502             ewitab           = _mm_slli_epi32(ewitab,2);
1503             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1504             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1505             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1506             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1507             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1508             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1509             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1510             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1511             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1512
1513             d                = _mm256_sub_pd(r21,rswitch);
1514             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1515             d2               = _mm256_mul_pd(d,d);
1516             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1517
1518             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1519
1520             /* Evaluate switch function */
1521             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1522             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
1523             velec            = _mm256_mul_pd(velec,sw);
1524             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1525
1526             /* Update potential sum for this i atom from the interaction with this j atom. */
1527             velec            = _mm256_and_pd(velec,cutoff_mask);
1528             velec            = _mm256_andnot_pd(dummy_mask,velec);
1529             velecsum         = _mm256_add_pd(velecsum,velec);
1530
1531             fscal            = felec;
1532
1533             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1534
1535             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1536
1537             /* Calculate temporary vectorial force */
1538             tx               = _mm256_mul_pd(fscal,dx21);
1539             ty               = _mm256_mul_pd(fscal,dy21);
1540             tz               = _mm256_mul_pd(fscal,dz21);
1541
1542             /* Update vectorial force */
1543             fix2             = _mm256_add_pd(fix2,tx);
1544             fiy2             = _mm256_add_pd(fiy2,ty);
1545             fiz2             = _mm256_add_pd(fiz2,tz);
1546
1547             fjx1             = _mm256_add_pd(fjx1,tx);
1548             fjy1             = _mm256_add_pd(fjy1,ty);
1549             fjz1             = _mm256_add_pd(fjz1,tz);
1550
1551             }
1552
1553             /**************************
1554              * CALCULATE INTERACTIONS *
1555              **************************/
1556
1557             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1558             {
1559
1560             r22              = _mm256_mul_pd(rsq22,rinv22);
1561             r22              = _mm256_andnot_pd(dummy_mask,r22);
1562
1563             /* EWALD ELECTROSTATICS */
1564
1565             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1566             ewrt             = _mm256_mul_pd(r22,ewtabscale);
1567             ewitab           = _mm256_cvttpd_epi32(ewrt);
1568             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1569             ewitab           = _mm_slli_epi32(ewitab,2);
1570             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1571             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1572             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1573             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1574             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1575             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1576             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1577             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1578             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1579
1580             d                = _mm256_sub_pd(r22,rswitch);
1581             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1582             d2               = _mm256_mul_pd(d,d);
1583             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1584
1585             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1586
1587             /* Evaluate switch function */
1588             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1589             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
1590             velec            = _mm256_mul_pd(velec,sw);
1591             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1592
1593             /* Update potential sum for this i atom from the interaction with this j atom. */
1594             velec            = _mm256_and_pd(velec,cutoff_mask);
1595             velec            = _mm256_andnot_pd(dummy_mask,velec);
1596             velecsum         = _mm256_add_pd(velecsum,velec);
1597
1598             fscal            = felec;
1599
1600             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1601
1602             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1603
1604             /* Calculate temporary vectorial force */
1605             tx               = _mm256_mul_pd(fscal,dx22);
1606             ty               = _mm256_mul_pd(fscal,dy22);
1607             tz               = _mm256_mul_pd(fscal,dz22);
1608
1609             /* Update vectorial force */
1610             fix2             = _mm256_add_pd(fix2,tx);
1611             fiy2             = _mm256_add_pd(fiy2,ty);
1612             fiz2             = _mm256_add_pd(fiz2,tz);
1613
1614             fjx2             = _mm256_add_pd(fjx2,tx);
1615             fjy2             = _mm256_add_pd(fjy2,ty);
1616             fjz2             = _mm256_add_pd(fjz2,tz);
1617
1618             }
1619
1620             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1621             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1622             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1623             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1624
1625             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1626                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1627
1628             /* Inner loop uses 612 flops */
1629         }
1630
1631         /* End of innermost loop */
1632
1633         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1634                                                  f+i_coord_offset,fshift+i_shift_offset);
1635
1636         ggid                        = gid[iidx];
1637         /* Update potential energies */
1638         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1639         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1640
1641         /* Increment number of inner iterations */
1642         inneriter                  += j_index_end - j_index_start;
1643
1644         /* Outer loop uses 20 flops */
1645     }
1646
1647     /* Increment number of outer iterations */
1648     outeriter        += nri;
1649
1650     /* Update outer/inner flops */
1651
1652     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*612);
1653 }
1654 /*
1655  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_double
1656  * Electrostatics interaction: Ewald
1657  * VdW interaction:            LennardJones
1658  * Geometry:                   Water3-Water3
1659  * Calculate force/pot:        Force
1660  */
1661 void
1662 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_double
1663                     (t_nblist * gmx_restrict                nlist,
1664                      rvec * gmx_restrict                    xx,
1665                      rvec * gmx_restrict                    ff,
1666                      t_forcerec * gmx_restrict              fr,
1667                      t_mdatoms * gmx_restrict               mdatoms,
1668                      nb_kernel_data_t * gmx_restrict        kernel_data,
1669                      t_nrnb * gmx_restrict                  nrnb)
1670 {
1671     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1672      * just 0 for non-waters.
1673      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1674      * jnr indices corresponding to data put in the four positions in the SIMD register.
1675      */
1676     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1677     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1678     int              jnrA,jnrB,jnrC,jnrD;
1679     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1680     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1681     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1682     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1683     real             rcutoff_scalar;
1684     real             *shiftvec,*fshift,*x,*f;
1685     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1686     real             scratch[4*DIM];
1687     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1688     real *           vdwioffsetptr0;
1689     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1690     real *           vdwioffsetptr1;
1691     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1692     real *           vdwioffsetptr2;
1693     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1694     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1695     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1696     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1697     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1698     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1699     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1700     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1701     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1702     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1703     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1704     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1705     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1706     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1707     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1708     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1709     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1710     real             *charge;
1711     int              nvdwtype;
1712     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1713     int              *vdwtype;
1714     real             *vdwparam;
1715     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1716     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1717     __m128i          ewitab;
1718     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1719     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1720     real             *ewtab;
1721     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1722     real             rswitch_scalar,d_scalar;
1723     __m256d          dummy_mask,cutoff_mask;
1724     __m128           tmpmask0,tmpmask1;
1725     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1726     __m256d          one     = _mm256_set1_pd(1.0);
1727     __m256d          two     = _mm256_set1_pd(2.0);
1728     x                = xx[0];
1729     f                = ff[0];
1730
1731     nri              = nlist->nri;
1732     iinr             = nlist->iinr;
1733     jindex           = nlist->jindex;
1734     jjnr             = nlist->jjnr;
1735     shiftidx         = nlist->shift;
1736     gid              = nlist->gid;
1737     shiftvec         = fr->shift_vec[0];
1738     fshift           = fr->fshift[0];
1739     facel            = _mm256_set1_pd(fr->epsfac);
1740     charge           = mdatoms->chargeA;
1741     nvdwtype         = fr->ntype;
1742     vdwparam         = fr->nbfp;
1743     vdwtype          = mdatoms->typeA;
1744
1745     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
1746     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff);
1747     beta2            = _mm256_mul_pd(beta,beta);
1748     beta3            = _mm256_mul_pd(beta,beta2);
1749
1750     ewtab            = fr->ic->tabq_coul_FDV0;
1751     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
1752     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1753
1754     /* Setup water-specific parameters */
1755     inr              = nlist->iinr[0];
1756     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1757     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1758     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1759     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1760
1761     jq0              = _mm256_set1_pd(charge[inr+0]);
1762     jq1              = _mm256_set1_pd(charge[inr+1]);
1763     jq2              = _mm256_set1_pd(charge[inr+2]);
1764     vdwjidx0A        = 2*vdwtype[inr+0];
1765     qq00             = _mm256_mul_pd(iq0,jq0);
1766     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1767     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1768     qq01             = _mm256_mul_pd(iq0,jq1);
1769     qq02             = _mm256_mul_pd(iq0,jq2);
1770     qq10             = _mm256_mul_pd(iq1,jq0);
1771     qq11             = _mm256_mul_pd(iq1,jq1);
1772     qq12             = _mm256_mul_pd(iq1,jq2);
1773     qq20             = _mm256_mul_pd(iq2,jq0);
1774     qq21             = _mm256_mul_pd(iq2,jq1);
1775     qq22             = _mm256_mul_pd(iq2,jq2);
1776
1777     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1778     rcutoff_scalar   = fr->rcoulomb;
1779     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1780     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1781
1782     rswitch_scalar   = fr->rcoulomb_switch;
1783     rswitch          = _mm256_set1_pd(rswitch_scalar);
1784     /* Setup switch parameters */
1785     d_scalar         = rcutoff_scalar-rswitch_scalar;
1786     d                = _mm256_set1_pd(d_scalar);
1787     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1788     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1789     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1790     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1791     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1792     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1793
1794     /* Avoid stupid compiler warnings */
1795     jnrA = jnrB = jnrC = jnrD = 0;
1796     j_coord_offsetA = 0;
1797     j_coord_offsetB = 0;
1798     j_coord_offsetC = 0;
1799     j_coord_offsetD = 0;
1800
1801     outeriter        = 0;
1802     inneriter        = 0;
1803
1804     for(iidx=0;iidx<4*DIM;iidx++)
1805     {
1806         scratch[iidx] = 0.0;
1807     }
1808
1809     /* Start outer loop over neighborlists */
1810     for(iidx=0; iidx<nri; iidx++)
1811     {
1812         /* Load shift vector for this list */
1813         i_shift_offset   = DIM*shiftidx[iidx];
1814
1815         /* Load limits for loop over neighbors */
1816         j_index_start    = jindex[iidx];
1817         j_index_end      = jindex[iidx+1];
1818
1819         /* Get outer coordinate index */
1820         inr              = iinr[iidx];
1821         i_coord_offset   = DIM*inr;
1822
1823         /* Load i particle coords and add shift vector */
1824         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1825                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1826
1827         fix0             = _mm256_setzero_pd();
1828         fiy0             = _mm256_setzero_pd();
1829         fiz0             = _mm256_setzero_pd();
1830         fix1             = _mm256_setzero_pd();
1831         fiy1             = _mm256_setzero_pd();
1832         fiz1             = _mm256_setzero_pd();
1833         fix2             = _mm256_setzero_pd();
1834         fiy2             = _mm256_setzero_pd();
1835         fiz2             = _mm256_setzero_pd();
1836
1837         /* Start inner kernel loop */
1838         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1839         {
1840
1841             /* Get j neighbor index, and coordinate index */
1842             jnrA             = jjnr[jidx];
1843             jnrB             = jjnr[jidx+1];
1844             jnrC             = jjnr[jidx+2];
1845             jnrD             = jjnr[jidx+3];
1846             j_coord_offsetA  = DIM*jnrA;
1847             j_coord_offsetB  = DIM*jnrB;
1848             j_coord_offsetC  = DIM*jnrC;
1849             j_coord_offsetD  = DIM*jnrD;
1850
1851             /* load j atom coordinates */
1852             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1853                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1854                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1855
1856             /* Calculate displacement vector */
1857             dx00             = _mm256_sub_pd(ix0,jx0);
1858             dy00             = _mm256_sub_pd(iy0,jy0);
1859             dz00             = _mm256_sub_pd(iz0,jz0);
1860             dx01             = _mm256_sub_pd(ix0,jx1);
1861             dy01             = _mm256_sub_pd(iy0,jy1);
1862             dz01             = _mm256_sub_pd(iz0,jz1);
1863             dx02             = _mm256_sub_pd(ix0,jx2);
1864             dy02             = _mm256_sub_pd(iy0,jy2);
1865             dz02             = _mm256_sub_pd(iz0,jz2);
1866             dx10             = _mm256_sub_pd(ix1,jx0);
1867             dy10             = _mm256_sub_pd(iy1,jy0);
1868             dz10             = _mm256_sub_pd(iz1,jz0);
1869             dx11             = _mm256_sub_pd(ix1,jx1);
1870             dy11             = _mm256_sub_pd(iy1,jy1);
1871             dz11             = _mm256_sub_pd(iz1,jz1);
1872             dx12             = _mm256_sub_pd(ix1,jx2);
1873             dy12             = _mm256_sub_pd(iy1,jy2);
1874             dz12             = _mm256_sub_pd(iz1,jz2);
1875             dx20             = _mm256_sub_pd(ix2,jx0);
1876             dy20             = _mm256_sub_pd(iy2,jy0);
1877             dz20             = _mm256_sub_pd(iz2,jz0);
1878             dx21             = _mm256_sub_pd(ix2,jx1);
1879             dy21             = _mm256_sub_pd(iy2,jy1);
1880             dz21             = _mm256_sub_pd(iz2,jz1);
1881             dx22             = _mm256_sub_pd(ix2,jx2);
1882             dy22             = _mm256_sub_pd(iy2,jy2);
1883             dz22             = _mm256_sub_pd(iz2,jz2);
1884
1885             /* Calculate squared distance and things based on it */
1886             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1887             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1888             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1889             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1890             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1891             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1892             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1893             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1894             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1895
1896             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1897             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
1898             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
1899             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
1900             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1901             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1902             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
1903             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1904             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1905
1906             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1907             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
1908             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
1909             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1910             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1911             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1912             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1913             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1914             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1915
1916             fjx0             = _mm256_setzero_pd();
1917             fjy0             = _mm256_setzero_pd();
1918             fjz0             = _mm256_setzero_pd();
1919             fjx1             = _mm256_setzero_pd();
1920             fjy1             = _mm256_setzero_pd();
1921             fjz1             = _mm256_setzero_pd();
1922             fjx2             = _mm256_setzero_pd();
1923             fjy2             = _mm256_setzero_pd();
1924             fjz2             = _mm256_setzero_pd();
1925
1926             /**************************
1927              * CALCULATE INTERACTIONS *
1928              **************************/
1929
1930             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1931             {
1932
1933             r00              = _mm256_mul_pd(rsq00,rinv00);
1934
1935             /* EWALD ELECTROSTATICS */
1936
1937             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1938             ewrt             = _mm256_mul_pd(r00,ewtabscale);
1939             ewitab           = _mm256_cvttpd_epi32(ewrt);
1940             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1941             ewitab           = _mm_slli_epi32(ewitab,2);
1942             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1943             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1944             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1945             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1946             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1947             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1948             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1949             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
1950             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1951
1952             /* LENNARD-JONES DISPERSION/REPULSION */
1953
1954             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1955             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
1956             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1957             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1958             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1959
1960             d                = _mm256_sub_pd(r00,rswitch);
1961             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1962             d2               = _mm256_mul_pd(d,d);
1963             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1964
1965             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1966
1967             /* Evaluate switch function */
1968             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1969             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
1970             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1971             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1972
1973             fscal            = _mm256_add_pd(felec,fvdw);
1974
1975             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1976
1977             /* Calculate temporary vectorial force */
1978             tx               = _mm256_mul_pd(fscal,dx00);
1979             ty               = _mm256_mul_pd(fscal,dy00);
1980             tz               = _mm256_mul_pd(fscal,dz00);
1981
1982             /* Update vectorial force */
1983             fix0             = _mm256_add_pd(fix0,tx);
1984             fiy0             = _mm256_add_pd(fiy0,ty);
1985             fiz0             = _mm256_add_pd(fiz0,tz);
1986
1987             fjx0             = _mm256_add_pd(fjx0,tx);
1988             fjy0             = _mm256_add_pd(fjy0,ty);
1989             fjz0             = _mm256_add_pd(fjz0,tz);
1990
1991             }
1992
1993             /**************************
1994              * CALCULATE INTERACTIONS *
1995              **************************/
1996
1997             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1998             {
1999
2000             r01              = _mm256_mul_pd(rsq01,rinv01);
2001
2002             /* EWALD ELECTROSTATICS */
2003
2004             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2005             ewrt             = _mm256_mul_pd(r01,ewtabscale);
2006             ewitab           = _mm256_cvttpd_epi32(ewrt);
2007             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2008             ewitab           = _mm_slli_epi32(ewitab,2);
2009             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2010             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2011             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2012             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2013             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2014             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2015             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2016             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
2017             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2018
2019             d                = _mm256_sub_pd(r01,rswitch);
2020             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2021             d2               = _mm256_mul_pd(d,d);
2022             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2023
2024             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2025
2026             /* Evaluate switch function */
2027             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2028             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv01,_mm256_mul_pd(velec,dsw)) );
2029             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
2030
2031             fscal            = felec;
2032
2033             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2034
2035             /* Calculate temporary vectorial force */
2036             tx               = _mm256_mul_pd(fscal,dx01);
2037             ty               = _mm256_mul_pd(fscal,dy01);
2038             tz               = _mm256_mul_pd(fscal,dz01);
2039
2040             /* Update vectorial force */
2041             fix0             = _mm256_add_pd(fix0,tx);
2042             fiy0             = _mm256_add_pd(fiy0,ty);
2043             fiz0             = _mm256_add_pd(fiz0,tz);
2044
2045             fjx1             = _mm256_add_pd(fjx1,tx);
2046             fjy1             = _mm256_add_pd(fjy1,ty);
2047             fjz1             = _mm256_add_pd(fjz1,tz);
2048
2049             }
2050
2051             /**************************
2052              * CALCULATE INTERACTIONS *
2053              **************************/
2054
2055             if (gmx_mm256_any_lt(rsq02,rcutoff2))
2056             {
2057
2058             r02              = _mm256_mul_pd(rsq02,rinv02);
2059
2060             /* EWALD ELECTROSTATICS */
2061
2062             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2063             ewrt             = _mm256_mul_pd(r02,ewtabscale);
2064             ewitab           = _mm256_cvttpd_epi32(ewrt);
2065             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2066             ewitab           = _mm_slli_epi32(ewitab,2);
2067             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2068             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2069             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2070             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2071             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2072             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2073             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2074             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
2075             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2076
2077             d                = _mm256_sub_pd(r02,rswitch);
2078             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2079             d2               = _mm256_mul_pd(d,d);
2080             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2081
2082             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2083
2084             /* Evaluate switch function */
2085             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2086             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv02,_mm256_mul_pd(velec,dsw)) );
2087             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
2088
2089             fscal            = felec;
2090
2091             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2092
2093             /* Calculate temporary vectorial force */
2094             tx               = _mm256_mul_pd(fscal,dx02);
2095             ty               = _mm256_mul_pd(fscal,dy02);
2096             tz               = _mm256_mul_pd(fscal,dz02);
2097
2098             /* Update vectorial force */
2099             fix0             = _mm256_add_pd(fix0,tx);
2100             fiy0             = _mm256_add_pd(fiy0,ty);
2101             fiz0             = _mm256_add_pd(fiz0,tz);
2102
2103             fjx2             = _mm256_add_pd(fjx2,tx);
2104             fjy2             = _mm256_add_pd(fjy2,ty);
2105             fjz2             = _mm256_add_pd(fjz2,tz);
2106
2107             }
2108
2109             /**************************
2110              * CALCULATE INTERACTIONS *
2111              **************************/
2112
2113             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2114             {
2115
2116             r10              = _mm256_mul_pd(rsq10,rinv10);
2117
2118             /* EWALD ELECTROSTATICS */
2119
2120             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2121             ewrt             = _mm256_mul_pd(r10,ewtabscale);
2122             ewitab           = _mm256_cvttpd_epi32(ewrt);
2123             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2124             ewitab           = _mm_slli_epi32(ewitab,2);
2125             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2126             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2127             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2128             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2129             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2130             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2131             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2132             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
2133             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2134
2135             d                = _mm256_sub_pd(r10,rswitch);
2136             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2137             d2               = _mm256_mul_pd(d,d);
2138             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2139
2140             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2141
2142             /* Evaluate switch function */
2143             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2144             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
2145             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2146
2147             fscal            = felec;
2148
2149             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2150
2151             /* Calculate temporary vectorial force */
2152             tx               = _mm256_mul_pd(fscal,dx10);
2153             ty               = _mm256_mul_pd(fscal,dy10);
2154             tz               = _mm256_mul_pd(fscal,dz10);
2155
2156             /* Update vectorial force */
2157             fix1             = _mm256_add_pd(fix1,tx);
2158             fiy1             = _mm256_add_pd(fiy1,ty);
2159             fiz1             = _mm256_add_pd(fiz1,tz);
2160
2161             fjx0             = _mm256_add_pd(fjx0,tx);
2162             fjy0             = _mm256_add_pd(fjy0,ty);
2163             fjz0             = _mm256_add_pd(fjz0,tz);
2164
2165             }
2166
2167             /**************************
2168              * CALCULATE INTERACTIONS *
2169              **************************/
2170
2171             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2172             {
2173
2174             r11              = _mm256_mul_pd(rsq11,rinv11);
2175
2176             /* EWALD ELECTROSTATICS */
2177
2178             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2179             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2180             ewitab           = _mm256_cvttpd_epi32(ewrt);
2181             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2182             ewitab           = _mm_slli_epi32(ewitab,2);
2183             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2184             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2185             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2186             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2187             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2188             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2189             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2190             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2191             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2192
2193             d                = _mm256_sub_pd(r11,rswitch);
2194             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2195             d2               = _mm256_mul_pd(d,d);
2196             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2197
2198             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2199
2200             /* Evaluate switch function */
2201             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2202             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2203             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2204
2205             fscal            = felec;
2206
2207             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2208
2209             /* Calculate temporary vectorial force */
2210             tx               = _mm256_mul_pd(fscal,dx11);
2211             ty               = _mm256_mul_pd(fscal,dy11);
2212             tz               = _mm256_mul_pd(fscal,dz11);
2213
2214             /* Update vectorial force */
2215             fix1             = _mm256_add_pd(fix1,tx);
2216             fiy1             = _mm256_add_pd(fiy1,ty);
2217             fiz1             = _mm256_add_pd(fiz1,tz);
2218
2219             fjx1             = _mm256_add_pd(fjx1,tx);
2220             fjy1             = _mm256_add_pd(fjy1,ty);
2221             fjz1             = _mm256_add_pd(fjz1,tz);
2222
2223             }
2224
2225             /**************************
2226              * CALCULATE INTERACTIONS *
2227              **************************/
2228
2229             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2230             {
2231
2232             r12              = _mm256_mul_pd(rsq12,rinv12);
2233
2234             /* EWALD ELECTROSTATICS */
2235
2236             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2237             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2238             ewitab           = _mm256_cvttpd_epi32(ewrt);
2239             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2240             ewitab           = _mm_slli_epi32(ewitab,2);
2241             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2242             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2243             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2244             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2245             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2246             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2247             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2248             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2249             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2250
2251             d                = _mm256_sub_pd(r12,rswitch);
2252             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2253             d2               = _mm256_mul_pd(d,d);
2254             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2255
2256             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2257
2258             /* Evaluate switch function */
2259             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2260             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2261             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2262
2263             fscal            = felec;
2264
2265             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2266
2267             /* Calculate temporary vectorial force */
2268             tx               = _mm256_mul_pd(fscal,dx12);
2269             ty               = _mm256_mul_pd(fscal,dy12);
2270             tz               = _mm256_mul_pd(fscal,dz12);
2271
2272             /* Update vectorial force */
2273             fix1             = _mm256_add_pd(fix1,tx);
2274             fiy1             = _mm256_add_pd(fiy1,ty);
2275             fiz1             = _mm256_add_pd(fiz1,tz);
2276
2277             fjx2             = _mm256_add_pd(fjx2,tx);
2278             fjy2             = _mm256_add_pd(fjy2,ty);
2279             fjz2             = _mm256_add_pd(fjz2,tz);
2280
2281             }
2282
2283             /**************************
2284              * CALCULATE INTERACTIONS *
2285              **************************/
2286
2287             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2288             {
2289
2290             r20              = _mm256_mul_pd(rsq20,rinv20);
2291
2292             /* EWALD ELECTROSTATICS */
2293
2294             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2295             ewrt             = _mm256_mul_pd(r20,ewtabscale);
2296             ewitab           = _mm256_cvttpd_epi32(ewrt);
2297             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2298             ewitab           = _mm_slli_epi32(ewitab,2);
2299             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2300             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2301             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2302             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2303             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2304             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2305             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2306             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
2307             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2308
2309             d                = _mm256_sub_pd(r20,rswitch);
2310             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2311             d2               = _mm256_mul_pd(d,d);
2312             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2313
2314             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2315
2316             /* Evaluate switch function */
2317             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2318             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
2319             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2320
2321             fscal            = felec;
2322
2323             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2324
2325             /* Calculate temporary vectorial force */
2326             tx               = _mm256_mul_pd(fscal,dx20);
2327             ty               = _mm256_mul_pd(fscal,dy20);
2328             tz               = _mm256_mul_pd(fscal,dz20);
2329
2330             /* Update vectorial force */
2331             fix2             = _mm256_add_pd(fix2,tx);
2332             fiy2             = _mm256_add_pd(fiy2,ty);
2333             fiz2             = _mm256_add_pd(fiz2,tz);
2334
2335             fjx0             = _mm256_add_pd(fjx0,tx);
2336             fjy0             = _mm256_add_pd(fjy0,ty);
2337             fjz0             = _mm256_add_pd(fjz0,tz);
2338
2339             }
2340
2341             /**************************
2342              * CALCULATE INTERACTIONS *
2343              **************************/
2344
2345             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2346             {
2347
2348             r21              = _mm256_mul_pd(rsq21,rinv21);
2349
2350             /* EWALD ELECTROSTATICS */
2351
2352             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2353             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2354             ewitab           = _mm256_cvttpd_epi32(ewrt);
2355             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2356             ewitab           = _mm_slli_epi32(ewitab,2);
2357             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2358             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2359             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2360             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2361             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2362             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2363             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2364             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
2365             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2366
2367             d                = _mm256_sub_pd(r21,rswitch);
2368             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2369             d2               = _mm256_mul_pd(d,d);
2370             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2371
2372             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2373
2374             /* Evaluate switch function */
2375             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2376             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
2377             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2378
2379             fscal            = felec;
2380
2381             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2382
2383             /* Calculate temporary vectorial force */
2384             tx               = _mm256_mul_pd(fscal,dx21);
2385             ty               = _mm256_mul_pd(fscal,dy21);
2386             tz               = _mm256_mul_pd(fscal,dz21);
2387
2388             /* Update vectorial force */
2389             fix2             = _mm256_add_pd(fix2,tx);
2390             fiy2             = _mm256_add_pd(fiy2,ty);
2391             fiz2             = _mm256_add_pd(fiz2,tz);
2392
2393             fjx1             = _mm256_add_pd(fjx1,tx);
2394             fjy1             = _mm256_add_pd(fjy1,ty);
2395             fjz1             = _mm256_add_pd(fjz1,tz);
2396
2397             }
2398
2399             /**************************
2400              * CALCULATE INTERACTIONS *
2401              **************************/
2402
2403             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2404             {
2405
2406             r22              = _mm256_mul_pd(rsq22,rinv22);
2407
2408             /* EWALD ELECTROSTATICS */
2409
2410             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2411             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2412             ewitab           = _mm256_cvttpd_epi32(ewrt);
2413             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2414             ewitab           = _mm_slli_epi32(ewitab,2);
2415             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2416             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2417             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2418             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2419             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2420             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2421             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2422             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
2423             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2424
2425             d                = _mm256_sub_pd(r22,rswitch);
2426             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2427             d2               = _mm256_mul_pd(d,d);
2428             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2429
2430             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2431
2432             /* Evaluate switch function */
2433             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2434             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
2435             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2436
2437             fscal            = felec;
2438
2439             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2440
2441             /* Calculate temporary vectorial force */
2442             tx               = _mm256_mul_pd(fscal,dx22);
2443             ty               = _mm256_mul_pd(fscal,dy22);
2444             tz               = _mm256_mul_pd(fscal,dz22);
2445
2446             /* Update vectorial force */
2447             fix2             = _mm256_add_pd(fix2,tx);
2448             fiy2             = _mm256_add_pd(fiy2,ty);
2449             fiz2             = _mm256_add_pd(fiz2,tz);
2450
2451             fjx2             = _mm256_add_pd(fjx2,tx);
2452             fjy2             = _mm256_add_pd(fjy2,ty);
2453             fjz2             = _mm256_add_pd(fjz2,tz);
2454
2455             }
2456
2457             fjptrA             = f+j_coord_offsetA;
2458             fjptrB             = f+j_coord_offsetB;
2459             fjptrC             = f+j_coord_offsetC;
2460             fjptrD             = f+j_coord_offsetD;
2461
2462             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2463                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2464
2465             /* Inner loop uses 573 flops */
2466         }
2467
2468         if(jidx<j_index_end)
2469         {
2470
2471             /* Get j neighbor index, and coordinate index */
2472             jnrlistA         = jjnr[jidx];
2473             jnrlistB         = jjnr[jidx+1];
2474             jnrlistC         = jjnr[jidx+2];
2475             jnrlistD         = jjnr[jidx+3];
2476             /* Sign of each element will be negative for non-real atoms.
2477              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2478              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2479              */
2480             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2481
2482             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2483             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2484             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2485
2486             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2487             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2488             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2489             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2490             j_coord_offsetA  = DIM*jnrA;
2491             j_coord_offsetB  = DIM*jnrB;
2492             j_coord_offsetC  = DIM*jnrC;
2493             j_coord_offsetD  = DIM*jnrD;
2494
2495             /* load j atom coordinates */
2496             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2497                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2498                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2499
2500             /* Calculate displacement vector */
2501             dx00             = _mm256_sub_pd(ix0,jx0);
2502             dy00             = _mm256_sub_pd(iy0,jy0);
2503             dz00             = _mm256_sub_pd(iz0,jz0);
2504             dx01             = _mm256_sub_pd(ix0,jx1);
2505             dy01             = _mm256_sub_pd(iy0,jy1);
2506             dz01             = _mm256_sub_pd(iz0,jz1);
2507             dx02             = _mm256_sub_pd(ix0,jx2);
2508             dy02             = _mm256_sub_pd(iy0,jy2);
2509             dz02             = _mm256_sub_pd(iz0,jz2);
2510             dx10             = _mm256_sub_pd(ix1,jx0);
2511             dy10             = _mm256_sub_pd(iy1,jy0);
2512             dz10             = _mm256_sub_pd(iz1,jz0);
2513             dx11             = _mm256_sub_pd(ix1,jx1);
2514             dy11             = _mm256_sub_pd(iy1,jy1);
2515             dz11             = _mm256_sub_pd(iz1,jz1);
2516             dx12             = _mm256_sub_pd(ix1,jx2);
2517             dy12             = _mm256_sub_pd(iy1,jy2);
2518             dz12             = _mm256_sub_pd(iz1,jz2);
2519             dx20             = _mm256_sub_pd(ix2,jx0);
2520             dy20             = _mm256_sub_pd(iy2,jy0);
2521             dz20             = _mm256_sub_pd(iz2,jz0);
2522             dx21             = _mm256_sub_pd(ix2,jx1);
2523             dy21             = _mm256_sub_pd(iy2,jy1);
2524             dz21             = _mm256_sub_pd(iz2,jz1);
2525             dx22             = _mm256_sub_pd(ix2,jx2);
2526             dy22             = _mm256_sub_pd(iy2,jy2);
2527             dz22             = _mm256_sub_pd(iz2,jz2);
2528
2529             /* Calculate squared distance and things based on it */
2530             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2531             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
2532             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
2533             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
2534             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2535             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2536             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
2537             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2538             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2539
2540             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
2541             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
2542             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
2543             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
2544             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
2545             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
2546             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
2547             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
2548             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
2549
2550             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
2551             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
2552             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
2553             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
2554             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
2555             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
2556             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
2557             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
2558             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
2559
2560             fjx0             = _mm256_setzero_pd();
2561             fjy0             = _mm256_setzero_pd();
2562             fjz0             = _mm256_setzero_pd();
2563             fjx1             = _mm256_setzero_pd();
2564             fjy1             = _mm256_setzero_pd();
2565             fjz1             = _mm256_setzero_pd();
2566             fjx2             = _mm256_setzero_pd();
2567             fjy2             = _mm256_setzero_pd();
2568             fjz2             = _mm256_setzero_pd();
2569
2570             /**************************
2571              * CALCULATE INTERACTIONS *
2572              **************************/
2573
2574             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2575             {
2576
2577             r00              = _mm256_mul_pd(rsq00,rinv00);
2578             r00              = _mm256_andnot_pd(dummy_mask,r00);
2579
2580             /* EWALD ELECTROSTATICS */
2581
2582             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2583             ewrt             = _mm256_mul_pd(r00,ewtabscale);
2584             ewitab           = _mm256_cvttpd_epi32(ewrt);
2585             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2586             ewitab           = _mm_slli_epi32(ewitab,2);
2587             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2588             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2589             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2590             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2591             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2592             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2593             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2594             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
2595             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2596
2597             /* LENNARD-JONES DISPERSION/REPULSION */
2598
2599             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2600             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
2601             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2602             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2603             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
2604
2605             d                = _mm256_sub_pd(r00,rswitch);
2606             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2607             d2               = _mm256_mul_pd(d,d);
2608             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2609
2610             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2611
2612             /* Evaluate switch function */
2613             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2614             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
2615             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
2616             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2617
2618             fscal            = _mm256_add_pd(felec,fvdw);
2619
2620             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2621
2622             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2623
2624             /* Calculate temporary vectorial force */
2625             tx               = _mm256_mul_pd(fscal,dx00);
2626             ty               = _mm256_mul_pd(fscal,dy00);
2627             tz               = _mm256_mul_pd(fscal,dz00);
2628
2629             /* Update vectorial force */
2630             fix0             = _mm256_add_pd(fix0,tx);
2631             fiy0             = _mm256_add_pd(fiy0,ty);
2632             fiz0             = _mm256_add_pd(fiz0,tz);
2633
2634             fjx0             = _mm256_add_pd(fjx0,tx);
2635             fjy0             = _mm256_add_pd(fjy0,ty);
2636             fjz0             = _mm256_add_pd(fjz0,tz);
2637
2638             }
2639
2640             /**************************
2641              * CALCULATE INTERACTIONS *
2642              **************************/
2643
2644             if (gmx_mm256_any_lt(rsq01,rcutoff2))
2645             {
2646
2647             r01              = _mm256_mul_pd(rsq01,rinv01);
2648             r01              = _mm256_andnot_pd(dummy_mask,r01);
2649
2650             /* EWALD ELECTROSTATICS */
2651
2652             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2653             ewrt             = _mm256_mul_pd(r01,ewtabscale);
2654             ewitab           = _mm256_cvttpd_epi32(ewrt);
2655             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2656             ewitab           = _mm_slli_epi32(ewitab,2);
2657             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2658             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2659             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2660             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2661             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2662             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2663             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2664             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
2665             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2666
2667             d                = _mm256_sub_pd(r01,rswitch);
2668             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2669             d2               = _mm256_mul_pd(d,d);
2670             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2671
2672             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2673
2674             /* Evaluate switch function */
2675             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2676             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv01,_mm256_mul_pd(velec,dsw)) );
2677             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
2678
2679             fscal            = felec;
2680
2681             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2682
2683             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2684
2685             /* Calculate temporary vectorial force */
2686             tx               = _mm256_mul_pd(fscal,dx01);
2687             ty               = _mm256_mul_pd(fscal,dy01);
2688             tz               = _mm256_mul_pd(fscal,dz01);
2689
2690             /* Update vectorial force */
2691             fix0             = _mm256_add_pd(fix0,tx);
2692             fiy0             = _mm256_add_pd(fiy0,ty);
2693             fiz0             = _mm256_add_pd(fiz0,tz);
2694
2695             fjx1             = _mm256_add_pd(fjx1,tx);
2696             fjy1             = _mm256_add_pd(fjy1,ty);
2697             fjz1             = _mm256_add_pd(fjz1,tz);
2698
2699             }
2700
2701             /**************************
2702              * CALCULATE INTERACTIONS *
2703              **************************/
2704
2705             if (gmx_mm256_any_lt(rsq02,rcutoff2))
2706             {
2707
2708             r02              = _mm256_mul_pd(rsq02,rinv02);
2709             r02              = _mm256_andnot_pd(dummy_mask,r02);
2710
2711             /* EWALD ELECTROSTATICS */
2712
2713             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2714             ewrt             = _mm256_mul_pd(r02,ewtabscale);
2715             ewitab           = _mm256_cvttpd_epi32(ewrt);
2716             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2717             ewitab           = _mm_slli_epi32(ewitab,2);
2718             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2719             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2720             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2721             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2722             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2723             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2724             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2725             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
2726             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2727
2728             d                = _mm256_sub_pd(r02,rswitch);
2729             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2730             d2               = _mm256_mul_pd(d,d);
2731             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2732
2733             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2734
2735             /* Evaluate switch function */
2736             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2737             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv02,_mm256_mul_pd(velec,dsw)) );
2738             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
2739
2740             fscal            = felec;
2741
2742             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2743
2744             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2745
2746             /* Calculate temporary vectorial force */
2747             tx               = _mm256_mul_pd(fscal,dx02);
2748             ty               = _mm256_mul_pd(fscal,dy02);
2749             tz               = _mm256_mul_pd(fscal,dz02);
2750
2751             /* Update vectorial force */
2752             fix0             = _mm256_add_pd(fix0,tx);
2753             fiy0             = _mm256_add_pd(fiy0,ty);
2754             fiz0             = _mm256_add_pd(fiz0,tz);
2755
2756             fjx2             = _mm256_add_pd(fjx2,tx);
2757             fjy2             = _mm256_add_pd(fjy2,ty);
2758             fjz2             = _mm256_add_pd(fjz2,tz);
2759
2760             }
2761
2762             /**************************
2763              * CALCULATE INTERACTIONS *
2764              **************************/
2765
2766             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2767             {
2768
2769             r10              = _mm256_mul_pd(rsq10,rinv10);
2770             r10              = _mm256_andnot_pd(dummy_mask,r10);
2771
2772             /* EWALD ELECTROSTATICS */
2773
2774             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2775             ewrt             = _mm256_mul_pd(r10,ewtabscale);
2776             ewitab           = _mm256_cvttpd_epi32(ewrt);
2777             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2778             ewitab           = _mm_slli_epi32(ewitab,2);
2779             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2780             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2781             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2782             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2783             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2784             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2785             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2786             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
2787             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2788
2789             d                = _mm256_sub_pd(r10,rswitch);
2790             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2791             d2               = _mm256_mul_pd(d,d);
2792             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2793
2794             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2795
2796             /* Evaluate switch function */
2797             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2798             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
2799             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2800
2801             fscal            = felec;
2802
2803             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2804
2805             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2806
2807             /* Calculate temporary vectorial force */
2808             tx               = _mm256_mul_pd(fscal,dx10);
2809             ty               = _mm256_mul_pd(fscal,dy10);
2810             tz               = _mm256_mul_pd(fscal,dz10);
2811
2812             /* Update vectorial force */
2813             fix1             = _mm256_add_pd(fix1,tx);
2814             fiy1             = _mm256_add_pd(fiy1,ty);
2815             fiz1             = _mm256_add_pd(fiz1,tz);
2816
2817             fjx0             = _mm256_add_pd(fjx0,tx);
2818             fjy0             = _mm256_add_pd(fjy0,ty);
2819             fjz0             = _mm256_add_pd(fjz0,tz);
2820
2821             }
2822
2823             /**************************
2824              * CALCULATE INTERACTIONS *
2825              **************************/
2826
2827             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2828             {
2829
2830             r11              = _mm256_mul_pd(rsq11,rinv11);
2831             r11              = _mm256_andnot_pd(dummy_mask,r11);
2832
2833             /* EWALD ELECTROSTATICS */
2834
2835             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2836             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2837             ewitab           = _mm256_cvttpd_epi32(ewrt);
2838             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2839             ewitab           = _mm_slli_epi32(ewitab,2);
2840             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2841             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2842             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2843             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2844             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2845             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2846             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2847             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2848             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2849
2850             d                = _mm256_sub_pd(r11,rswitch);
2851             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2852             d2               = _mm256_mul_pd(d,d);
2853             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2854
2855             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2856
2857             /* Evaluate switch function */
2858             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2859             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2860             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2861
2862             fscal            = felec;
2863
2864             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2865
2866             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2867
2868             /* Calculate temporary vectorial force */
2869             tx               = _mm256_mul_pd(fscal,dx11);
2870             ty               = _mm256_mul_pd(fscal,dy11);
2871             tz               = _mm256_mul_pd(fscal,dz11);
2872
2873             /* Update vectorial force */
2874             fix1             = _mm256_add_pd(fix1,tx);
2875             fiy1             = _mm256_add_pd(fiy1,ty);
2876             fiz1             = _mm256_add_pd(fiz1,tz);
2877
2878             fjx1             = _mm256_add_pd(fjx1,tx);
2879             fjy1             = _mm256_add_pd(fjy1,ty);
2880             fjz1             = _mm256_add_pd(fjz1,tz);
2881
2882             }
2883
2884             /**************************
2885              * CALCULATE INTERACTIONS *
2886              **************************/
2887
2888             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2889             {
2890
2891             r12              = _mm256_mul_pd(rsq12,rinv12);
2892             r12              = _mm256_andnot_pd(dummy_mask,r12);
2893
2894             /* EWALD ELECTROSTATICS */
2895
2896             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2897             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2898             ewitab           = _mm256_cvttpd_epi32(ewrt);
2899             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2900             ewitab           = _mm_slli_epi32(ewitab,2);
2901             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2902             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2903             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2904             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2905             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2906             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2907             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2908             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2909             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2910
2911             d                = _mm256_sub_pd(r12,rswitch);
2912             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2913             d2               = _mm256_mul_pd(d,d);
2914             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2915
2916             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2917
2918             /* Evaluate switch function */
2919             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2920             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2921             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2922
2923             fscal            = felec;
2924
2925             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2926
2927             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2928
2929             /* Calculate temporary vectorial force */
2930             tx               = _mm256_mul_pd(fscal,dx12);
2931             ty               = _mm256_mul_pd(fscal,dy12);
2932             tz               = _mm256_mul_pd(fscal,dz12);
2933
2934             /* Update vectorial force */
2935             fix1             = _mm256_add_pd(fix1,tx);
2936             fiy1             = _mm256_add_pd(fiy1,ty);
2937             fiz1             = _mm256_add_pd(fiz1,tz);
2938
2939             fjx2             = _mm256_add_pd(fjx2,tx);
2940             fjy2             = _mm256_add_pd(fjy2,ty);
2941             fjz2             = _mm256_add_pd(fjz2,tz);
2942
2943             }
2944
2945             /**************************
2946              * CALCULATE INTERACTIONS *
2947              **************************/
2948
2949             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2950             {
2951
2952             r20              = _mm256_mul_pd(rsq20,rinv20);
2953             r20              = _mm256_andnot_pd(dummy_mask,r20);
2954
2955             /* EWALD ELECTROSTATICS */
2956
2957             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2958             ewrt             = _mm256_mul_pd(r20,ewtabscale);
2959             ewitab           = _mm256_cvttpd_epi32(ewrt);
2960             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2961             ewitab           = _mm_slli_epi32(ewitab,2);
2962             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2963             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2964             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2965             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2966             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2967             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2968             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2969             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
2970             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2971
2972             d                = _mm256_sub_pd(r20,rswitch);
2973             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2974             d2               = _mm256_mul_pd(d,d);
2975             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2976
2977             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2978
2979             /* Evaluate switch function */
2980             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2981             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
2982             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2983
2984             fscal            = felec;
2985
2986             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2987
2988             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2989
2990             /* Calculate temporary vectorial force */
2991             tx               = _mm256_mul_pd(fscal,dx20);
2992             ty               = _mm256_mul_pd(fscal,dy20);
2993             tz               = _mm256_mul_pd(fscal,dz20);
2994
2995             /* Update vectorial force */
2996             fix2             = _mm256_add_pd(fix2,tx);
2997             fiy2             = _mm256_add_pd(fiy2,ty);
2998             fiz2             = _mm256_add_pd(fiz2,tz);
2999
3000             fjx0             = _mm256_add_pd(fjx0,tx);
3001             fjy0             = _mm256_add_pd(fjy0,ty);
3002             fjz0             = _mm256_add_pd(fjz0,tz);
3003
3004             }
3005
3006             /**************************
3007              * CALCULATE INTERACTIONS *
3008              **************************/
3009
3010             if (gmx_mm256_any_lt(rsq21,rcutoff2))
3011             {
3012
3013             r21              = _mm256_mul_pd(rsq21,rinv21);
3014             r21              = _mm256_andnot_pd(dummy_mask,r21);
3015
3016             /* EWALD ELECTROSTATICS */
3017
3018             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3019             ewrt             = _mm256_mul_pd(r21,ewtabscale);
3020             ewitab           = _mm256_cvttpd_epi32(ewrt);
3021             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3022             ewitab           = _mm_slli_epi32(ewitab,2);
3023             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3024             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3025             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3026             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3027             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3028             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3029             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3030             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
3031             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
3032
3033             d                = _mm256_sub_pd(r21,rswitch);
3034             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3035             d2               = _mm256_mul_pd(d,d);
3036             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3037
3038             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3039
3040             /* Evaluate switch function */
3041             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3042             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
3043             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
3044
3045             fscal            = felec;
3046
3047             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3048
3049             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3050
3051             /* Calculate temporary vectorial force */
3052             tx               = _mm256_mul_pd(fscal,dx21);
3053             ty               = _mm256_mul_pd(fscal,dy21);
3054             tz               = _mm256_mul_pd(fscal,dz21);
3055
3056             /* Update vectorial force */
3057             fix2             = _mm256_add_pd(fix2,tx);
3058             fiy2             = _mm256_add_pd(fiy2,ty);
3059             fiz2             = _mm256_add_pd(fiz2,tz);
3060
3061             fjx1             = _mm256_add_pd(fjx1,tx);
3062             fjy1             = _mm256_add_pd(fjy1,ty);
3063             fjz1             = _mm256_add_pd(fjz1,tz);
3064
3065             }
3066
3067             /**************************
3068              * CALCULATE INTERACTIONS *
3069              **************************/
3070
3071             if (gmx_mm256_any_lt(rsq22,rcutoff2))
3072             {
3073
3074             r22              = _mm256_mul_pd(rsq22,rinv22);
3075             r22              = _mm256_andnot_pd(dummy_mask,r22);
3076
3077             /* EWALD ELECTROSTATICS */
3078
3079             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3080             ewrt             = _mm256_mul_pd(r22,ewtabscale);
3081             ewitab           = _mm256_cvttpd_epi32(ewrt);
3082             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3083             ewitab           = _mm_slli_epi32(ewitab,2);
3084             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3085             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3086             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3087             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3088             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3089             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3090             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3091             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
3092             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
3093
3094             d                = _mm256_sub_pd(r22,rswitch);
3095             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3096             d2               = _mm256_mul_pd(d,d);
3097             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3098
3099             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3100
3101             /* Evaluate switch function */
3102             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3103             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
3104             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
3105
3106             fscal            = felec;
3107
3108             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3109
3110             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3111
3112             /* Calculate temporary vectorial force */
3113             tx               = _mm256_mul_pd(fscal,dx22);
3114             ty               = _mm256_mul_pd(fscal,dy22);
3115             tz               = _mm256_mul_pd(fscal,dz22);
3116
3117             /* Update vectorial force */
3118             fix2             = _mm256_add_pd(fix2,tx);
3119             fiy2             = _mm256_add_pd(fiy2,ty);
3120             fiz2             = _mm256_add_pd(fiz2,tz);
3121
3122             fjx2             = _mm256_add_pd(fjx2,tx);
3123             fjy2             = _mm256_add_pd(fjy2,ty);
3124             fjz2             = _mm256_add_pd(fjz2,tz);
3125
3126             }
3127
3128             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3129             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3130             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3131             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3132
3133             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
3134                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3135
3136             /* Inner loop uses 582 flops */
3137         }
3138
3139         /* End of innermost loop */
3140
3141         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3142                                                  f+i_coord_offset,fshift+i_shift_offset);
3143
3144         /* Increment number of inner iterations */
3145         inneriter                  += j_index_end - j_index_start;
3146
3147         /* Outer loop uses 18 flops */
3148     }
3149
3150     /* Increment number of outer iterations */
3151     outeriter        += nri;
3152
3153     /* Update outer/inner flops */
3154
3155     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*582);
3156 }