src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_double
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
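/*
 * For every atom pair within the cutoff the kernel accumulates
 *   Velec = qi*qj*erfc(beta*r)/r      (real-space Ewald term, evaluated via table lookup;
 *                                      the Coulomb prefactor fr->epsfac is folded into iq0-iq2)
 *   Vvdw  = C12/r^12 - C6/r^6         (Lennard-Jones, oxygen-oxygen pair only)
 * and multiplies both by a switch function S(r) that takes them smoothly to zero
 * between rcoulomb_switch and rcoulomb (set up below as swV3..swV5 / swF2..swF4).
 */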
57 void
58 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
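    /* Each __m256d register below holds one coordinate/force component for the four j atoms
     * A,B,C,D, so one pass through the inner loop evaluates the 3x3 = 9 atom pairs between
     * a single i water and four j waters simultaneously.
     */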
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82     real             scratch[4*DIM];
83     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84     real *           vdwioffsetptr0;
85     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86     real *           vdwioffsetptr1;
87     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88     real *           vdwioffsetptr2;
89     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
98     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
99     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
100     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
103     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
106     real             *charge;
107     int              nvdwtype;
108     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109     int              *vdwtype;
110     real             *vdwparam;
111     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
112     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
113     __m128i          ewitab;
114     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
115     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
116     real             *ewtab;
117     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
118     real             rswitch_scalar,d_scalar;
119     __m256d          dummy_mask,cutoff_mask;
120     __m128           tmpmask0,tmpmask1;
121     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
122     __m256d          one     = _mm256_set1_pd(1.0);
123     __m256d          two     = _mm256_set1_pd(2.0);
124     x                = xx[0];
125     f                = ff[0];
126
127     nri              = nlist->nri;
128     iinr             = nlist->iinr;
129     jindex           = nlist->jindex;
130     jjnr             = nlist->jjnr;
131     shiftidx         = nlist->shift;
132     gid              = nlist->gid;
133     shiftvec         = fr->shift_vec[0];
134     fshift           = fr->fshift[0];
135     facel            = _mm256_set1_pd(fr->epsfac);
136     charge           = mdatoms->chargeA;
137     nvdwtype         = fr->ntype;
138     vdwparam         = fr->nbfp;
139     vdwtype          = mdatoms->typeA;
140
141     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
142     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
143     beta2            = _mm256_mul_pd(beta,beta);
144     beta3            = _mm256_mul_pd(beta,beta2);
145
146     ewtab            = fr->ic->tabq_coul_FDV0;
147     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
148     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
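    /* The Ewald correction table uses the FDV0 layout: four doubles per table point, holding
     * the force F, the difference D to the next force value (used for linear interpolation),
     * the potential V and a padding zero. Points are spaced 1/tabq_scale apart in r, and
     * ewtabhalfspace = 0.5/tabq_scale enters the quadratic correction of V in the inner loop.
     */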
149
150     /* Setup water-specific parameters */
151     inr              = nlist->iinr[0];
152     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
153     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
154     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
155     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
156
157     jq0              = _mm256_set1_pd(charge[inr+0]);
158     jq1              = _mm256_set1_pd(charge[inr+1]);
159     jq2              = _mm256_set1_pd(charge[inr+2]);
160     vdwjidx0A        = 2*vdwtype[inr+0];
161     qq00             = _mm256_mul_pd(iq0,jq0);
162     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
163     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
164     qq01             = _mm256_mul_pd(iq0,jq1);
165     qq02             = _mm256_mul_pd(iq0,jq2);
166     qq10             = _mm256_mul_pd(iq1,jq0);
167     qq11             = _mm256_mul_pd(iq1,jq1);
168     qq12             = _mm256_mul_pd(iq1,jq2);
169     qq20             = _mm256_mul_pd(iq2,jq0);
170     qq21             = _mm256_mul_pd(iq2,jq1);
171     qq22             = _mm256_mul_pd(iq2,jq2);
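    /* Since both i and j molecules are identical three-site waters, all nine charge products
     * and the single oxygen-oxygen Lennard-Jones pair (c6_00/c12_00) are precomputed here;
     * pairs involving atoms 1 and 2 are electrostatics-only in the loops below.
     */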
172
173     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
174     rcutoff_scalar   = fr->rcoulomb;
175     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
176     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
177
178     rswitch_scalar   = fr->rcoulomb_switch;
179     rswitch          = _mm256_set1_pd(rswitch_scalar);
180     /* Setup switch parameters */
181     d_scalar         = rcutoff_scalar-rswitch_scalar;
182     d                = _mm256_set1_pd(d_scalar);
183     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
184     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
185     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
186     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
187     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
188     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
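    /* With d = r - rswitch and D = rcutoff - rswitch these coefficients implement the
     * fifth-order switch
     *   S(d)   = 1 - 10*(d/D)^3 + 15*(d/D)^4 - 6*(d/D)^5
     *   dS/dr  = d^2*(swF2 + swF3*d + swF4*d^2)
     * which is 1 at r = rswitch, 0 at r = rcutoff, and has vanishing first and second
     * derivatives at both ends of the switching region.
     */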
189
    190     /* Initialize these to silence compiler warnings about possibly uninitialized variables */
191     jnrA = jnrB = jnrC = jnrD = 0;
192     j_coord_offsetA = 0;
193     j_coord_offsetB = 0;
194     j_coord_offsetC = 0;
195     j_coord_offsetD = 0;
196
197     outeriter        = 0;
198     inneriter        = 0;
199
200     for(iidx=0;iidx<4*DIM;iidx++)
201     {
202         scratch[iidx] = 0.0;
203     }
204
205     /* Start outer loop over neighborlists */
206     for(iidx=0; iidx<nri; iidx++)
207     {
208         /* Load shift vector for this list */
209         i_shift_offset   = DIM*shiftidx[iidx];
210
211         /* Load limits for loop over neighbors */
212         j_index_start    = jindex[iidx];
213         j_index_end      = jindex[iidx+1];
214
215         /* Get outer coordinate index */
216         inr              = iinr[iidx];
217         i_coord_offset   = DIM*inr;
218
219         /* Load i particle coords and add shift vector */
220         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
221                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
222
223         fix0             = _mm256_setzero_pd();
224         fiy0             = _mm256_setzero_pd();
225         fiz0             = _mm256_setzero_pd();
226         fix1             = _mm256_setzero_pd();
227         fiy1             = _mm256_setzero_pd();
228         fiz1             = _mm256_setzero_pd();
229         fix2             = _mm256_setzero_pd();
230         fiy2             = _mm256_setzero_pd();
231         fiz2             = _mm256_setzero_pd();
232
233         /* Reset potential sums */
234         velecsum         = _mm256_setzero_pd();
235         vvdwsum          = _mm256_setzero_pd();
236
237         /* Start inner kernel loop */
238         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
239         {
240
241             /* Get j neighbor index, and coordinate index */
242             jnrA             = jjnr[jidx];
243             jnrB             = jjnr[jidx+1];
244             jnrC             = jjnr[jidx+2];
245             jnrD             = jjnr[jidx+3];
246             j_coord_offsetA  = DIM*jnrA;
247             j_coord_offsetB  = DIM*jnrB;
248             j_coord_offsetC  = DIM*jnrC;
249             j_coord_offsetD  = DIM*jnrD;
250
251             /* load j atom coordinates */
252             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
253                                                  x+j_coord_offsetC,x+j_coord_offsetD,
254                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
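            /* The swizzled load gathers x,y,z of all three atoms from the four j-atom pointers
             * and transposes them, so each jx/jy/jz register holds the same component for
             * lanes A,B,C,D.
             */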
255
256             /* Calculate displacement vector */
257             dx00             = _mm256_sub_pd(ix0,jx0);
258             dy00             = _mm256_sub_pd(iy0,jy0);
259             dz00             = _mm256_sub_pd(iz0,jz0);
260             dx01             = _mm256_sub_pd(ix0,jx1);
261             dy01             = _mm256_sub_pd(iy0,jy1);
262             dz01             = _mm256_sub_pd(iz0,jz1);
263             dx02             = _mm256_sub_pd(ix0,jx2);
264             dy02             = _mm256_sub_pd(iy0,jy2);
265             dz02             = _mm256_sub_pd(iz0,jz2);
266             dx10             = _mm256_sub_pd(ix1,jx0);
267             dy10             = _mm256_sub_pd(iy1,jy0);
268             dz10             = _mm256_sub_pd(iz1,jz0);
269             dx11             = _mm256_sub_pd(ix1,jx1);
270             dy11             = _mm256_sub_pd(iy1,jy1);
271             dz11             = _mm256_sub_pd(iz1,jz1);
272             dx12             = _mm256_sub_pd(ix1,jx2);
273             dy12             = _mm256_sub_pd(iy1,jy2);
274             dz12             = _mm256_sub_pd(iz1,jz2);
275             dx20             = _mm256_sub_pd(ix2,jx0);
276             dy20             = _mm256_sub_pd(iy2,jy0);
277             dz20             = _mm256_sub_pd(iz2,jz0);
278             dx21             = _mm256_sub_pd(ix2,jx1);
279             dy21             = _mm256_sub_pd(iy2,jy1);
280             dz21             = _mm256_sub_pd(iz2,jz1);
281             dx22             = _mm256_sub_pd(ix2,jx2);
282             dy22             = _mm256_sub_pd(iy2,jy2);
283             dz22             = _mm256_sub_pd(iz2,jz2);
284
285             /* Calculate squared distance and things based on it */
286             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
287             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
288             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
289             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
290             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
291             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
292             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
293             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
294             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
295
296             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
297             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
298             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
299             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
300             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
301             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
302             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
303             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
304             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
305
306             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
307             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
308             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
309             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
310             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
311             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
312             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
313             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
314             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
315
316             fjx0             = _mm256_setzero_pd();
317             fjy0             = _mm256_setzero_pd();
318             fjz0             = _mm256_setzero_pd();
319             fjx1             = _mm256_setzero_pd();
320             fjy1             = _mm256_setzero_pd();
321             fjz1             = _mm256_setzero_pd();
322             fjx2             = _mm256_setzero_pd();
323             fjy2             = _mm256_setzero_pd();
324             fjz2             = _mm256_setzero_pd();
325
326             /**************************
327              * CALCULATE INTERACTIONS *
328              **************************/
329
330             if (gmx_mm256_any_lt(rsq00,rcutoff2))
331             {
332
333             r00              = _mm256_mul_pd(rsq00,rinv00);
334
335             /* EWALD ELECTROSTATICS */
336
337             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
338             ewrt             = _mm256_mul_pd(r00,ewtabscale);
339             ewitab           = _mm256_cvttpd_epi32(ewrt);
340             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
341             ewitab           = _mm_slli_epi32(ewitab,2);
342             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
343             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
344             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
345             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
346             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
347             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
348             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
349             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
350             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
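            /* Per-lane table lookup: each lane extracted its own stride-4 index above and loaded
             * the packed (F, D, V, 0) entry; the 4x4 transpose turns these per-lane rows into
             * per-field vectors. F + eps*D is the linearly interpolated force, the potential
             * gets the half-spacing quadratic correction, and the results are combined as
             * velec = qq*(1/r - Vtab) and felec = qq*rinv*(rinv^2 - Ftab).
             */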
351
352             /* LENNARD-JONES DISPERSION/REPULSION */
353
354             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
355             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
356             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
357             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
358             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
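            /* The c6/c12 parameters in fr->nbfp are stored pre-multiplied by 6 and 12 (the usual
             * GROMACS convention), hence the 1/6 and 1/12 factors in the potential, while the
             * scalar force is simply (vvdw12 - vvdw6)/r^2 = 12*C12/r^14 - 6*C6/r^8.
             */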
359
360             d                = _mm256_sub_pd(r00,rswitch);
361             d                = _mm256_max_pd(d,_mm256_setzero_pd());
362             d2               = _mm256_mul_pd(d,d);
363             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
364
365             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
366
367             /* Evaluate switch function */
368             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
369             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
370             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
371             velec            = _mm256_mul_pd(velec,sw);
372             vvdw             = _mm256_mul_pd(vvdw,sw);
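            /* Since the switched potential is V*S(r), the scalar force becomes
             * -(V*S)'/r = (F/r)*S - V*(dS/dr)/r, which is exactly the felec/fvdw updates above.
             */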
373             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
374
375             /* Update potential sum for this i atom from the interaction with this j atom. */
376             velec            = _mm256_and_pd(velec,cutoff_mask);
377             velecsum         = _mm256_add_pd(velecsum,velec);
378             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
379             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
380
381             fscal            = _mm256_add_pd(felec,fvdw);
382
383             fscal            = _mm256_and_pd(fscal,cutoff_mask);
384
385             /* Calculate temporary vectorial force */
386             tx               = _mm256_mul_pd(fscal,dx00);
387             ty               = _mm256_mul_pd(fscal,dy00);
388             tz               = _mm256_mul_pd(fscal,dz00);
389
390             /* Update vectorial force */
391             fix0             = _mm256_add_pd(fix0,tx);
392             fiy0             = _mm256_add_pd(fiy0,ty);
393             fiz0             = _mm256_add_pd(fiz0,tz);
394
395             fjx0             = _mm256_add_pd(fjx0,tx);
396             fjy0             = _mm256_add_pd(fjy0,ty);
397             fjz0             = _mm256_add_pd(fjz0,tz);
398
399             }
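            /* j-atom forces are accumulated with the same sign as the i-atom forces in
             * fjx/fjy/fjz and subtracted from f[] in a single swizzled decrement after all
             * nine pair interactions, implementing Newton's third law without per-pair stores.
             */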
400
401             /**************************
402              * CALCULATE INTERACTIONS *
403              **************************/
404
405             if (gmx_mm256_any_lt(rsq01,rcutoff2))
406             {
407
408             r01              = _mm256_mul_pd(rsq01,rinv01);
409
410             /* EWALD ELECTROSTATICS */
411
412             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
413             ewrt             = _mm256_mul_pd(r01,ewtabscale);
414             ewitab           = _mm256_cvttpd_epi32(ewrt);
415             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
416             ewitab           = _mm_slli_epi32(ewitab,2);
417             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
418             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
419             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
420             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
421             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
422             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
423             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
424             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
425             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
426
427             d                = _mm256_sub_pd(r01,rswitch);
428             d                = _mm256_max_pd(d,_mm256_setzero_pd());
429             d2               = _mm256_mul_pd(d,d);
430             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
431
432             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
433
434             /* Evaluate switch function */
435             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
436             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv01,_mm256_mul_pd(velec,dsw)) );
437             velec            = _mm256_mul_pd(velec,sw);
438             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
439
440             /* Update potential sum for this i atom from the interaction with this j atom. */
441             velec            = _mm256_and_pd(velec,cutoff_mask);
442             velecsum         = _mm256_add_pd(velecsum,velec);
443
444             fscal            = felec;
445
446             fscal            = _mm256_and_pd(fscal,cutoff_mask);
447
448             /* Calculate temporary vectorial force */
449             tx               = _mm256_mul_pd(fscal,dx01);
450             ty               = _mm256_mul_pd(fscal,dy01);
451             tz               = _mm256_mul_pd(fscal,dz01);
452
453             /* Update vectorial force */
454             fix0             = _mm256_add_pd(fix0,tx);
455             fiy0             = _mm256_add_pd(fiy0,ty);
456             fiz0             = _mm256_add_pd(fiz0,tz);
457
458             fjx1             = _mm256_add_pd(fjx1,tx);
459             fjy1             = _mm256_add_pd(fjy1,ty);
460             fjz1             = _mm256_add_pd(fjz1,tz);
461
462             }
463
464             /**************************
465              * CALCULATE INTERACTIONS *
466              **************************/
467
468             if (gmx_mm256_any_lt(rsq02,rcutoff2))
469             {
470
471             r02              = _mm256_mul_pd(rsq02,rinv02);
472
473             /* EWALD ELECTROSTATICS */
474
475             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
476             ewrt             = _mm256_mul_pd(r02,ewtabscale);
477             ewitab           = _mm256_cvttpd_epi32(ewrt);
478             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
479             ewitab           = _mm_slli_epi32(ewitab,2);
480             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
481             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
482             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
483             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
484             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
485             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
486             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
487             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
488             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
489
490             d                = _mm256_sub_pd(r02,rswitch);
491             d                = _mm256_max_pd(d,_mm256_setzero_pd());
492             d2               = _mm256_mul_pd(d,d);
493             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
494
495             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
496
497             /* Evaluate switch function */
498             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
499             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv02,_mm256_mul_pd(velec,dsw)) );
500             velec            = _mm256_mul_pd(velec,sw);
501             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
502
503             /* Update potential sum for this i atom from the interaction with this j atom. */
504             velec            = _mm256_and_pd(velec,cutoff_mask);
505             velecsum         = _mm256_add_pd(velecsum,velec);
506
507             fscal            = felec;
508
509             fscal            = _mm256_and_pd(fscal,cutoff_mask);
510
511             /* Calculate temporary vectorial force */
512             tx               = _mm256_mul_pd(fscal,dx02);
513             ty               = _mm256_mul_pd(fscal,dy02);
514             tz               = _mm256_mul_pd(fscal,dz02);
515
516             /* Update vectorial force */
517             fix0             = _mm256_add_pd(fix0,tx);
518             fiy0             = _mm256_add_pd(fiy0,ty);
519             fiz0             = _mm256_add_pd(fiz0,tz);
520
521             fjx2             = _mm256_add_pd(fjx2,tx);
522             fjy2             = _mm256_add_pd(fjy2,ty);
523             fjz2             = _mm256_add_pd(fjz2,tz);
524
525             }
526
527             /**************************
528              * CALCULATE INTERACTIONS *
529              **************************/
530
531             if (gmx_mm256_any_lt(rsq10,rcutoff2))
532             {
533
534             r10              = _mm256_mul_pd(rsq10,rinv10);
535
536             /* EWALD ELECTROSTATICS */
537
538             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
539             ewrt             = _mm256_mul_pd(r10,ewtabscale);
540             ewitab           = _mm256_cvttpd_epi32(ewrt);
541             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
542             ewitab           = _mm_slli_epi32(ewitab,2);
543             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
544             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
545             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
546             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
547             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
548             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
549             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
550             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
551             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
552
553             d                = _mm256_sub_pd(r10,rswitch);
554             d                = _mm256_max_pd(d,_mm256_setzero_pd());
555             d2               = _mm256_mul_pd(d,d);
556             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
557
558             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
559
560             /* Evaluate switch function */
561             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
562             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
563             velec            = _mm256_mul_pd(velec,sw);
564             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
565
566             /* Update potential sum for this i atom from the interaction with this j atom. */
567             velec            = _mm256_and_pd(velec,cutoff_mask);
568             velecsum         = _mm256_add_pd(velecsum,velec);
569
570             fscal            = felec;
571
572             fscal            = _mm256_and_pd(fscal,cutoff_mask);
573
574             /* Calculate temporary vectorial force */
575             tx               = _mm256_mul_pd(fscal,dx10);
576             ty               = _mm256_mul_pd(fscal,dy10);
577             tz               = _mm256_mul_pd(fscal,dz10);
578
579             /* Update vectorial force */
580             fix1             = _mm256_add_pd(fix1,tx);
581             fiy1             = _mm256_add_pd(fiy1,ty);
582             fiz1             = _mm256_add_pd(fiz1,tz);
583
584             fjx0             = _mm256_add_pd(fjx0,tx);
585             fjy0             = _mm256_add_pd(fjy0,ty);
586             fjz0             = _mm256_add_pd(fjz0,tz);
587
588             }
589
590             /**************************
591              * CALCULATE INTERACTIONS *
592              **************************/
593
594             if (gmx_mm256_any_lt(rsq11,rcutoff2))
595             {
596
597             r11              = _mm256_mul_pd(rsq11,rinv11);
598
599             /* EWALD ELECTROSTATICS */
600
601             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
602             ewrt             = _mm256_mul_pd(r11,ewtabscale);
603             ewitab           = _mm256_cvttpd_epi32(ewrt);
604             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
605             ewitab           = _mm_slli_epi32(ewitab,2);
606             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
607             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
608             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
609             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
610             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
611             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
612             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
613             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
614             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
615
616             d                = _mm256_sub_pd(r11,rswitch);
617             d                = _mm256_max_pd(d,_mm256_setzero_pd());
618             d2               = _mm256_mul_pd(d,d);
619             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
620
621             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
622
623             /* Evaluate switch function */
624             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
625             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
626             velec            = _mm256_mul_pd(velec,sw);
627             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
628
629             /* Update potential sum for this i atom from the interaction with this j atom. */
630             velec            = _mm256_and_pd(velec,cutoff_mask);
631             velecsum         = _mm256_add_pd(velecsum,velec);
632
633             fscal            = felec;
634
635             fscal            = _mm256_and_pd(fscal,cutoff_mask);
636
637             /* Calculate temporary vectorial force */
638             tx               = _mm256_mul_pd(fscal,dx11);
639             ty               = _mm256_mul_pd(fscal,dy11);
640             tz               = _mm256_mul_pd(fscal,dz11);
641
642             /* Update vectorial force */
643             fix1             = _mm256_add_pd(fix1,tx);
644             fiy1             = _mm256_add_pd(fiy1,ty);
645             fiz1             = _mm256_add_pd(fiz1,tz);
646
647             fjx1             = _mm256_add_pd(fjx1,tx);
648             fjy1             = _mm256_add_pd(fjy1,ty);
649             fjz1             = _mm256_add_pd(fjz1,tz);
650
651             }
652
653             /**************************
654              * CALCULATE INTERACTIONS *
655              **************************/
656
657             if (gmx_mm256_any_lt(rsq12,rcutoff2))
658             {
659
660             r12              = _mm256_mul_pd(rsq12,rinv12);
661
662             /* EWALD ELECTROSTATICS */
663
664             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
665             ewrt             = _mm256_mul_pd(r12,ewtabscale);
666             ewitab           = _mm256_cvttpd_epi32(ewrt);
667             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
668             ewitab           = _mm_slli_epi32(ewitab,2);
669             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
670             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
671             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
672             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
673             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
674             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
675             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
676             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
677             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
678
679             d                = _mm256_sub_pd(r12,rswitch);
680             d                = _mm256_max_pd(d,_mm256_setzero_pd());
681             d2               = _mm256_mul_pd(d,d);
682             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
683
684             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
685
686             /* Evaluate switch function */
687             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
688             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
689             velec            = _mm256_mul_pd(velec,sw);
690             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
691
692             /* Update potential sum for this i atom from the interaction with this j atom. */
693             velec            = _mm256_and_pd(velec,cutoff_mask);
694             velecsum         = _mm256_add_pd(velecsum,velec);
695
696             fscal            = felec;
697
698             fscal            = _mm256_and_pd(fscal,cutoff_mask);
699
700             /* Calculate temporary vectorial force */
701             tx               = _mm256_mul_pd(fscal,dx12);
702             ty               = _mm256_mul_pd(fscal,dy12);
703             tz               = _mm256_mul_pd(fscal,dz12);
704
705             /* Update vectorial force */
706             fix1             = _mm256_add_pd(fix1,tx);
707             fiy1             = _mm256_add_pd(fiy1,ty);
708             fiz1             = _mm256_add_pd(fiz1,tz);
709
710             fjx2             = _mm256_add_pd(fjx2,tx);
711             fjy2             = _mm256_add_pd(fjy2,ty);
712             fjz2             = _mm256_add_pd(fjz2,tz);
713
714             }
715
716             /**************************
717              * CALCULATE INTERACTIONS *
718              **************************/
719
720             if (gmx_mm256_any_lt(rsq20,rcutoff2))
721             {
722
723             r20              = _mm256_mul_pd(rsq20,rinv20);
724
725             /* EWALD ELECTROSTATICS */
726
727             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
728             ewrt             = _mm256_mul_pd(r20,ewtabscale);
729             ewitab           = _mm256_cvttpd_epi32(ewrt);
730             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
731             ewitab           = _mm_slli_epi32(ewitab,2);
732             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
733             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
734             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
735             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
736             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
737             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
738             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
739             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
740             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
741
742             d                = _mm256_sub_pd(r20,rswitch);
743             d                = _mm256_max_pd(d,_mm256_setzero_pd());
744             d2               = _mm256_mul_pd(d,d);
745             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
746
747             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
748
749             /* Evaluate switch function */
750             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
751             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
752             velec            = _mm256_mul_pd(velec,sw);
753             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
754
755             /* Update potential sum for this i atom from the interaction with this j atom. */
756             velec            = _mm256_and_pd(velec,cutoff_mask);
757             velecsum         = _mm256_add_pd(velecsum,velec);
758
759             fscal            = felec;
760
761             fscal            = _mm256_and_pd(fscal,cutoff_mask);
762
763             /* Calculate temporary vectorial force */
764             tx               = _mm256_mul_pd(fscal,dx20);
765             ty               = _mm256_mul_pd(fscal,dy20);
766             tz               = _mm256_mul_pd(fscal,dz20);
767
768             /* Update vectorial force */
769             fix2             = _mm256_add_pd(fix2,tx);
770             fiy2             = _mm256_add_pd(fiy2,ty);
771             fiz2             = _mm256_add_pd(fiz2,tz);
772
773             fjx0             = _mm256_add_pd(fjx0,tx);
774             fjy0             = _mm256_add_pd(fjy0,ty);
775             fjz0             = _mm256_add_pd(fjz0,tz);
776
777             }
778
779             /**************************
780              * CALCULATE INTERACTIONS *
781              **************************/
782
783             if (gmx_mm256_any_lt(rsq21,rcutoff2))
784             {
785
786             r21              = _mm256_mul_pd(rsq21,rinv21);
787
788             /* EWALD ELECTROSTATICS */
789
790             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
791             ewrt             = _mm256_mul_pd(r21,ewtabscale);
792             ewitab           = _mm256_cvttpd_epi32(ewrt);
793             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
794             ewitab           = _mm_slli_epi32(ewitab,2);
795             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
796             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
797             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
798             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
799             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
800             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
801             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
802             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
803             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
804
805             d                = _mm256_sub_pd(r21,rswitch);
806             d                = _mm256_max_pd(d,_mm256_setzero_pd());
807             d2               = _mm256_mul_pd(d,d);
808             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
809
810             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
811
812             /* Evaluate switch function */
813             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
814             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
815             velec            = _mm256_mul_pd(velec,sw);
816             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
817
818             /* Update potential sum for this i atom from the interaction with this j atom. */
819             velec            = _mm256_and_pd(velec,cutoff_mask);
820             velecsum         = _mm256_add_pd(velecsum,velec);
821
822             fscal            = felec;
823
824             fscal            = _mm256_and_pd(fscal,cutoff_mask);
825
826             /* Calculate temporary vectorial force */
827             tx               = _mm256_mul_pd(fscal,dx21);
828             ty               = _mm256_mul_pd(fscal,dy21);
829             tz               = _mm256_mul_pd(fscal,dz21);
830
831             /* Update vectorial force */
832             fix2             = _mm256_add_pd(fix2,tx);
833             fiy2             = _mm256_add_pd(fiy2,ty);
834             fiz2             = _mm256_add_pd(fiz2,tz);
835
836             fjx1             = _mm256_add_pd(fjx1,tx);
837             fjy1             = _mm256_add_pd(fjy1,ty);
838             fjz1             = _mm256_add_pd(fjz1,tz);
839
840             }
841
842             /**************************
843              * CALCULATE INTERACTIONS *
844              **************************/
845
846             if (gmx_mm256_any_lt(rsq22,rcutoff2))
847             {
848
849             r22              = _mm256_mul_pd(rsq22,rinv22);
850
851             /* EWALD ELECTROSTATICS */
852
853             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
854             ewrt             = _mm256_mul_pd(r22,ewtabscale);
855             ewitab           = _mm256_cvttpd_epi32(ewrt);
856             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
857             ewitab           = _mm_slli_epi32(ewitab,2);
858             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
859             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
860             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
861             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
862             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
863             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
864             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
865             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
866             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
867
868             d                = _mm256_sub_pd(r22,rswitch);
869             d                = _mm256_max_pd(d,_mm256_setzero_pd());
870             d2               = _mm256_mul_pd(d,d);
871             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
872
873             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
874
875             /* Evaluate switch function */
876             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
877             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
878             velec            = _mm256_mul_pd(velec,sw);
879             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
880
881             /* Update potential sum for this i atom from the interaction with this j atom. */
882             velec            = _mm256_and_pd(velec,cutoff_mask);
883             velecsum         = _mm256_add_pd(velecsum,velec);
884
885             fscal            = felec;
886
887             fscal            = _mm256_and_pd(fscal,cutoff_mask);
888
889             /* Calculate temporary vectorial force */
890             tx               = _mm256_mul_pd(fscal,dx22);
891             ty               = _mm256_mul_pd(fscal,dy22);
892             tz               = _mm256_mul_pd(fscal,dz22);
893
894             /* Update vectorial force */
895             fix2             = _mm256_add_pd(fix2,tx);
896             fiy2             = _mm256_add_pd(fiy2,ty);
897             fiz2             = _mm256_add_pd(fiz2,tz);
898
899             fjx2             = _mm256_add_pd(fjx2,tx);
900             fjy2             = _mm256_add_pd(fjy2,ty);
901             fjz2             = _mm256_add_pd(fjz2,tz);
902
903             }
904
905             fjptrA             = f+j_coord_offsetA;
906             fjptrB             = f+j_coord_offsetB;
907             fjptrC             = f+j_coord_offsetC;
908             fjptrD             = f+j_coord_offsetD;
909
910             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
911                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
912
913             /* Inner loop uses 603 flops */
914         }
915
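        /* Epilogue for the last 1-3 entries of the j list: the list is padded to a multiple
         * of four with negative indices, so a dummy mask is built to zero every contribution
         * from the padded lanes while the arithmetic runs on safely clamped indices.
         */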
916         if(jidx<j_index_end)
917         {
918
919             /* Get j neighbor index, and coordinate index */
920             jnrlistA         = jjnr[jidx];
921             jnrlistB         = jjnr[jidx+1];
922             jnrlistC         = jjnr[jidx+2];
923             jnrlistD         = jjnr[jidx+3];
 924             /* Neighbor-list entries are negative for padded (non-real) atoms, so this mask
 925              * is all ones for dummy entries and all zeros for real ones; it is applied as
 926              * val = _mm256_andnot_pd(mask,val) below to clear the dummy contributions.
 927              */
928             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
929
930             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
931             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
932             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
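            /* Widen the 4x32-bit integer comparison mask to a 4x64-bit double mask: the two
             * permutes duplicate each 32-bit element into both halves of a 64-bit lane, and the
             * halves are then stacked into one __m256d.
             */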
933
934             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
935             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
936             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
937             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
938             j_coord_offsetA  = DIM*jnrA;
939             j_coord_offsetB  = DIM*jnrB;
940             j_coord_offsetC  = DIM*jnrC;
941             j_coord_offsetD  = DIM*jnrD;
942
943             /* load j atom coordinates */
944             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
945                                                  x+j_coord_offsetC,x+j_coord_offsetD,
946                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
947
948             /* Calculate displacement vector */
949             dx00             = _mm256_sub_pd(ix0,jx0);
950             dy00             = _mm256_sub_pd(iy0,jy0);
951             dz00             = _mm256_sub_pd(iz0,jz0);
952             dx01             = _mm256_sub_pd(ix0,jx1);
953             dy01             = _mm256_sub_pd(iy0,jy1);
954             dz01             = _mm256_sub_pd(iz0,jz1);
955             dx02             = _mm256_sub_pd(ix0,jx2);
956             dy02             = _mm256_sub_pd(iy0,jy2);
957             dz02             = _mm256_sub_pd(iz0,jz2);
958             dx10             = _mm256_sub_pd(ix1,jx0);
959             dy10             = _mm256_sub_pd(iy1,jy0);
960             dz10             = _mm256_sub_pd(iz1,jz0);
961             dx11             = _mm256_sub_pd(ix1,jx1);
962             dy11             = _mm256_sub_pd(iy1,jy1);
963             dz11             = _mm256_sub_pd(iz1,jz1);
964             dx12             = _mm256_sub_pd(ix1,jx2);
965             dy12             = _mm256_sub_pd(iy1,jy2);
966             dz12             = _mm256_sub_pd(iz1,jz2);
967             dx20             = _mm256_sub_pd(ix2,jx0);
968             dy20             = _mm256_sub_pd(iy2,jy0);
969             dz20             = _mm256_sub_pd(iz2,jz0);
970             dx21             = _mm256_sub_pd(ix2,jx1);
971             dy21             = _mm256_sub_pd(iy2,jy1);
972             dz21             = _mm256_sub_pd(iz2,jz1);
973             dx22             = _mm256_sub_pd(ix2,jx2);
974             dy22             = _mm256_sub_pd(iy2,jy2);
975             dz22             = _mm256_sub_pd(iz2,jz2);
976
977             /* Calculate squared distance and things based on it */
978             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
979             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
980             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
981             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
982             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
983             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
984             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
985             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
986             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
987
988             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
989             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
990             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
991             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
992             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
993             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
994             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
995             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
996             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
997
998             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
999             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
1000             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
1001             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1002             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1003             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1004             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1005             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1006             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1007
1008             fjx0             = _mm256_setzero_pd();
1009             fjy0             = _mm256_setzero_pd();
1010             fjz0             = _mm256_setzero_pd();
1011             fjx1             = _mm256_setzero_pd();
1012             fjy1             = _mm256_setzero_pd();
1013             fjz1             = _mm256_setzero_pd();
1014             fjx2             = _mm256_setzero_pd();
1015             fjy2             = _mm256_setzero_pd();
1016             fjz2             = _mm256_setzero_pd();
1017
1018             /**************************
1019              * CALCULATE INTERACTIONS *
1020              **************************/
1021
1022             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1023             {
1024
1025             r00              = _mm256_mul_pd(rsq00,rinv00);
1026             r00              = _mm256_andnot_pd(dummy_mask,r00);
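            /* r is cleared in the dummy lanes before the table lookup so padded entries cannot
             * generate an out-of-range Ewald table index; their energies and forces are masked
             * out again below.
             */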
1027
1028             /* EWALD ELECTROSTATICS */
1029
1030             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1031             ewrt             = _mm256_mul_pd(r00,ewtabscale);
1032             ewitab           = _mm256_cvttpd_epi32(ewrt);
1033             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1034             ewitab           = _mm_slli_epi32(ewitab,2);
1035             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1036             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1037             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1038             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1039             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1040             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1041             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1042             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
1043             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
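
            /* The lookup above uses the FDV0-format Ewald correction table:
             * each table point holds four doubles (F, D, V, 0), hence the
             * index shift by 2 bits before the four per-lane loads and the
             * transpose.  felec is interpolated as ewtabF + eweps*ewtabD and
             * the potential is rebuilt as ewtabV - ewtabhalfspace*eweps*
             * (ewtabF + felec) before combining with the 1/r term and qq00.
             */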
1044
1045             /* LENNARD-JONES DISPERSION/REPULSION */
1046
1047             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1048             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
1049             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1050             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1051             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
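
            /* Note: the c6/c12 parameters from fr->nbfp are assumed here to
             * be premultiplied by 6 and 12 (group-kernel convention), so
             * vvdw = vvdw12/12 - vvdw6/6 is the plain LJ potential and
             * fvdw = (vvdw12 - vvdw6)/r^2 is the corresponding F/r. */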
1052
1053             d                = _mm256_sub_pd(r00,rswitch);
1054             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1055             d2               = _mm256_mul_pd(d,d);
1056             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1057
1058             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
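
            /* sw is the quintic switching function in d = max(r-rswitch,0):
             *   sw(d)  = 1 + d^3*(swV3 + swV4*d + swV5*d^2)
             *   dsw(d) =     d^2*(swF2 + swF3*d + swF4*d^2) = d(sw)/dr
             * so sw decays smoothly from 1 at rswitch to 0 at rcutoff. */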
1059
1060             /* Evaluate switch function */
1061             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1062             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
1063             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1064             velec            = _mm256_mul_pd(velec,sw);
1065             vvdw             = _mm256_mul_pd(vvdw,sw);
1066             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1067
1068             /* Update potential sum for this i atom from the interaction with this j atom. */
1069             velec            = _mm256_and_pd(velec,cutoff_mask);
1070             velec            = _mm256_andnot_pd(dummy_mask,velec);
1071             velecsum         = _mm256_add_pd(velecsum,velec);
1072             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
1073             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
1074             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
1075
1076             fscal            = _mm256_add_pd(felec,fvdw);
1077
1078             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1079
1080             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1081
1082             /* Calculate temporary vectorial force */
1083             tx               = _mm256_mul_pd(fscal,dx00);
1084             ty               = _mm256_mul_pd(fscal,dy00);
1085             tz               = _mm256_mul_pd(fscal,dz00);
1086
1087             /* Update vectorial force */
1088             fix0             = _mm256_add_pd(fix0,tx);
1089             fiy0             = _mm256_add_pd(fiy0,ty);
1090             fiz0             = _mm256_add_pd(fiz0,tz);
1091
1092             fjx0             = _mm256_add_pd(fjx0,tx);
1093             fjy0             = _mm256_add_pd(fjy0,ty);
1094             fjz0             = _mm256_add_pd(fjz0,tz);
1095
1096             }
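
            /* The eight interaction blocks below repeat the same switched
             * Ewald electrostatics for the remaining i/j site pairs of the
             * two waters; only the O-O (00) pair above also carries the
             * Lennard-Jones term. */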
1097
1098             /**************************
1099              * CALCULATE INTERACTIONS *
1100              **************************/
1101
1102             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1103             {
1104
1105             r01              = _mm256_mul_pd(rsq01,rinv01);
1106             r01              = _mm256_andnot_pd(dummy_mask,r01);
1107
1108             /* EWALD ELECTROSTATICS */
1109
1110             /* Calculate the Ewald table index by multiplying r with the table scale and truncating to an integer */
1111             ewrt             = _mm256_mul_pd(r01,ewtabscale);
1112             ewitab           = _mm256_cvttpd_epi32(ewrt);
1113             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1114             ewitab           = _mm_slli_epi32(ewitab,2);
1115             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1116             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1117             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1118             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1119             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1120             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1121             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1122             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
1123             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1124
1125             d                = _mm256_sub_pd(r01,rswitch);
1126             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1127             d2               = _mm256_mul_pd(d,d);
1128             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1129
1130             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1131
1132             /* Evaluate switch function */
1133             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1134             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv01,_mm256_mul_pd(velec,dsw)) );
1135             velec            = _mm256_mul_pd(velec,sw);
1136             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1137
1138             /* Update potential sum for this i atom from the interaction with this j atom. */
1139             velec            = _mm256_and_pd(velec,cutoff_mask);
1140             velec            = _mm256_andnot_pd(dummy_mask,velec);
1141             velecsum         = _mm256_add_pd(velecsum,velec);
1142
1143             fscal            = felec;
1144
1145             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1146
1147             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1148
1149             /* Calculate temporary vectorial force */
1150             tx               = _mm256_mul_pd(fscal,dx01);
1151             ty               = _mm256_mul_pd(fscal,dy01);
1152             tz               = _mm256_mul_pd(fscal,dz01);
1153
1154             /* Update vectorial force */
1155             fix0             = _mm256_add_pd(fix0,tx);
1156             fiy0             = _mm256_add_pd(fiy0,ty);
1157             fiz0             = _mm256_add_pd(fiz0,tz);
1158
1159             fjx1             = _mm256_add_pd(fjx1,tx);
1160             fjy1             = _mm256_add_pd(fjy1,ty);
1161             fjz1             = _mm256_add_pd(fjz1,tz);
1162
1163             }
1164
1165             /**************************
1166              * CALCULATE INTERACTIONS *
1167              **************************/
1168
1169             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1170             {
1171
1172             r02              = _mm256_mul_pd(rsq02,rinv02);
1173             r02              = _mm256_andnot_pd(dummy_mask,r02);
1174
1175             /* EWALD ELECTROSTATICS */
1176
1177             /* Calculate the Ewald table index by multiplying r with the table scale and truncating to an integer */
1178             ewrt             = _mm256_mul_pd(r02,ewtabscale);
1179             ewitab           = _mm256_cvttpd_epi32(ewrt);
1180             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1181             ewitab           = _mm_slli_epi32(ewitab,2);
1182             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1183             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1184             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1185             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1186             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1187             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1188             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1189             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
1190             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1191
1192             d                = _mm256_sub_pd(r02,rswitch);
1193             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1194             d2               = _mm256_mul_pd(d,d);
1195             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1196
1197             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1198
1199             /* Evaluate switch function */
1200             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1201             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv02,_mm256_mul_pd(velec,dsw)) );
1202             velec            = _mm256_mul_pd(velec,sw);
1203             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1204
1205             /* Update potential sum for this i atom from the interaction with this j atom. */
1206             velec            = _mm256_and_pd(velec,cutoff_mask);
1207             velec            = _mm256_andnot_pd(dummy_mask,velec);
1208             velecsum         = _mm256_add_pd(velecsum,velec);
1209
1210             fscal            = felec;
1211
1212             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1213
1214             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1215
1216             /* Calculate temporary vectorial force */
1217             tx               = _mm256_mul_pd(fscal,dx02);
1218             ty               = _mm256_mul_pd(fscal,dy02);
1219             tz               = _mm256_mul_pd(fscal,dz02);
1220
1221             /* Update vectorial force */
1222             fix0             = _mm256_add_pd(fix0,tx);
1223             fiy0             = _mm256_add_pd(fiy0,ty);
1224             fiz0             = _mm256_add_pd(fiz0,tz);
1225
1226             fjx2             = _mm256_add_pd(fjx2,tx);
1227             fjy2             = _mm256_add_pd(fjy2,ty);
1228             fjz2             = _mm256_add_pd(fjz2,tz);
1229
1230             }
1231
1232             /**************************
1233              * CALCULATE INTERACTIONS *
1234              **************************/
1235
1236             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1237             {
1238
1239             r10              = _mm256_mul_pd(rsq10,rinv10);
1240             r10              = _mm256_andnot_pd(dummy_mask,r10);
1241
1242             /* EWALD ELECTROSTATICS */
1243
1244             /* Calculate the Ewald table index by multiplying r with the table scale and truncating to an integer */
1245             ewrt             = _mm256_mul_pd(r10,ewtabscale);
1246             ewitab           = _mm256_cvttpd_epi32(ewrt);
1247             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1248             ewitab           = _mm_slli_epi32(ewitab,2);
1249             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1250             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1251             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1252             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1253             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1254             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1255             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1256             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
1257             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1258
1259             d                = _mm256_sub_pd(r10,rswitch);
1260             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1261             d2               = _mm256_mul_pd(d,d);
1262             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1263
1264             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1265
1266             /* Evaluate switch function */
1267             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1268             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
1269             velec            = _mm256_mul_pd(velec,sw);
1270             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1271
1272             /* Update potential sum for this i atom from the interaction with this j atom. */
1273             velec            = _mm256_and_pd(velec,cutoff_mask);
1274             velec            = _mm256_andnot_pd(dummy_mask,velec);
1275             velecsum         = _mm256_add_pd(velecsum,velec);
1276
1277             fscal            = felec;
1278
1279             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1280
1281             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1282
1283             /* Calculate temporary vectorial force */
1284             tx               = _mm256_mul_pd(fscal,dx10);
1285             ty               = _mm256_mul_pd(fscal,dy10);
1286             tz               = _mm256_mul_pd(fscal,dz10);
1287
1288             /* Update vectorial force */
1289             fix1             = _mm256_add_pd(fix1,tx);
1290             fiy1             = _mm256_add_pd(fiy1,ty);
1291             fiz1             = _mm256_add_pd(fiz1,tz);
1292
1293             fjx0             = _mm256_add_pd(fjx0,tx);
1294             fjy0             = _mm256_add_pd(fjy0,ty);
1295             fjz0             = _mm256_add_pd(fjz0,tz);
1296
1297             }
1298
1299             /**************************
1300              * CALCULATE INTERACTIONS *
1301              **************************/
1302
1303             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1304             {
1305
1306             r11              = _mm256_mul_pd(rsq11,rinv11);
1307             r11              = _mm256_andnot_pd(dummy_mask,r11);
1308
1309             /* EWALD ELECTROSTATICS */
1310
1311             /* Calculate the Ewald table index by multiplying r with the table scale and truncating to an integer */
1312             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1313             ewitab           = _mm256_cvttpd_epi32(ewrt);
1314             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1315             ewitab           = _mm_slli_epi32(ewitab,2);
1316             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1317             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1318             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1319             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1320             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1321             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1322             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1323             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1324             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1325
1326             d                = _mm256_sub_pd(r11,rswitch);
1327             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1328             d2               = _mm256_mul_pd(d,d);
1329             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1330
1331             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1332
1333             /* Evaluate switch function */
1334             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1335             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
1336             velec            = _mm256_mul_pd(velec,sw);
1337             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1338
1339             /* Update potential sum for this i atom from the interaction with this j atom. */
1340             velec            = _mm256_and_pd(velec,cutoff_mask);
1341             velec            = _mm256_andnot_pd(dummy_mask,velec);
1342             velecsum         = _mm256_add_pd(velecsum,velec);
1343
1344             fscal            = felec;
1345
1346             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1347
1348             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1349
1350             /* Calculate temporary vectorial force */
1351             tx               = _mm256_mul_pd(fscal,dx11);
1352             ty               = _mm256_mul_pd(fscal,dy11);
1353             tz               = _mm256_mul_pd(fscal,dz11);
1354
1355             /* Update vectorial force */
1356             fix1             = _mm256_add_pd(fix1,tx);
1357             fiy1             = _mm256_add_pd(fiy1,ty);
1358             fiz1             = _mm256_add_pd(fiz1,tz);
1359
1360             fjx1             = _mm256_add_pd(fjx1,tx);
1361             fjy1             = _mm256_add_pd(fjy1,ty);
1362             fjz1             = _mm256_add_pd(fjz1,tz);
1363
1364             }
1365
1366             /**************************
1367              * CALCULATE INTERACTIONS *
1368              **************************/
1369
1370             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1371             {
1372
1373             r12              = _mm256_mul_pd(rsq12,rinv12);
1374             r12              = _mm256_andnot_pd(dummy_mask,r12);
1375
1376             /* EWALD ELECTROSTATICS */
1377
1378             /* Calculate the Ewald table index by multiplying r with the table scale and truncating to an integer */
1379             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1380             ewitab           = _mm256_cvttpd_epi32(ewrt);
1381             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1382             ewitab           = _mm_slli_epi32(ewitab,2);
1383             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1384             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1385             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1386             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1387             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1388             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1389             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1390             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1391             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1392
1393             d                = _mm256_sub_pd(r12,rswitch);
1394             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1395             d2               = _mm256_mul_pd(d,d);
1396             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1397
1398             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1399
1400             /* Evaluate switch function */
1401             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1402             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
1403             velec            = _mm256_mul_pd(velec,sw);
1404             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1405
1406             /* Update potential sum for this i atom from the interaction with this j atom. */
1407             velec            = _mm256_and_pd(velec,cutoff_mask);
1408             velec            = _mm256_andnot_pd(dummy_mask,velec);
1409             velecsum         = _mm256_add_pd(velecsum,velec);
1410
1411             fscal            = felec;
1412
1413             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1414
1415             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1416
1417             /* Calculate temporary vectorial force */
1418             tx               = _mm256_mul_pd(fscal,dx12);
1419             ty               = _mm256_mul_pd(fscal,dy12);
1420             tz               = _mm256_mul_pd(fscal,dz12);
1421
1422             /* Update vectorial force */
1423             fix1             = _mm256_add_pd(fix1,tx);
1424             fiy1             = _mm256_add_pd(fiy1,ty);
1425             fiz1             = _mm256_add_pd(fiz1,tz);
1426
1427             fjx2             = _mm256_add_pd(fjx2,tx);
1428             fjy2             = _mm256_add_pd(fjy2,ty);
1429             fjz2             = _mm256_add_pd(fjz2,tz);
1430
1431             }
1432
1433             /**************************
1434              * CALCULATE INTERACTIONS *
1435              **************************/
1436
1437             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1438             {
1439
1440             r20              = _mm256_mul_pd(rsq20,rinv20);
1441             r20              = _mm256_andnot_pd(dummy_mask,r20);
1442
1443             /* EWALD ELECTROSTATICS */
1444
1445             /* Calculate the Ewald table index by multiplying r with the table scale and truncating to an integer */
1446             ewrt             = _mm256_mul_pd(r20,ewtabscale);
1447             ewitab           = _mm256_cvttpd_epi32(ewrt);
1448             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1449             ewitab           = _mm_slli_epi32(ewitab,2);
1450             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1451             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1452             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1453             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1454             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1455             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1456             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1457             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1458             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1459
1460             d                = _mm256_sub_pd(r20,rswitch);
1461             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1462             d2               = _mm256_mul_pd(d,d);
1463             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1464
1465             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1466
1467             /* Evaluate switch function */
1468             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1469             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
1470             velec            = _mm256_mul_pd(velec,sw);
1471             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1472
1473             /* Update potential sum for this i atom from the interaction with this j atom. */
1474             velec            = _mm256_and_pd(velec,cutoff_mask);
1475             velec            = _mm256_andnot_pd(dummy_mask,velec);
1476             velecsum         = _mm256_add_pd(velecsum,velec);
1477
1478             fscal            = felec;
1479
1480             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1481
1482             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1483
1484             /* Calculate temporary vectorial force */
1485             tx               = _mm256_mul_pd(fscal,dx20);
1486             ty               = _mm256_mul_pd(fscal,dy20);
1487             tz               = _mm256_mul_pd(fscal,dz20);
1488
1489             /* Update vectorial force */
1490             fix2             = _mm256_add_pd(fix2,tx);
1491             fiy2             = _mm256_add_pd(fiy2,ty);
1492             fiz2             = _mm256_add_pd(fiz2,tz);
1493
1494             fjx0             = _mm256_add_pd(fjx0,tx);
1495             fjy0             = _mm256_add_pd(fjy0,ty);
1496             fjz0             = _mm256_add_pd(fjz0,tz);
1497
1498             }
1499
1500             /**************************
1501              * CALCULATE INTERACTIONS *
1502              **************************/
1503
1504             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1505             {
1506
1507             r21              = _mm256_mul_pd(rsq21,rinv21);
1508             r21              = _mm256_andnot_pd(dummy_mask,r21);
1509
1510             /* EWALD ELECTROSTATICS */
1511
1512             /* Calculate the Ewald table index by multiplying r with the table scale and truncating to an integer */
1513             ewrt             = _mm256_mul_pd(r21,ewtabscale);
1514             ewitab           = _mm256_cvttpd_epi32(ewrt);
1515             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1516             ewitab           = _mm_slli_epi32(ewitab,2);
1517             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1518             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1519             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1520             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1521             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1522             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1523             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1524             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1525             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1526
1527             d                = _mm256_sub_pd(r21,rswitch);
1528             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1529             d2               = _mm256_mul_pd(d,d);
1530             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1531
1532             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1533
1534             /* Evaluate switch function */
1535             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1536             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
1537             velec            = _mm256_mul_pd(velec,sw);
1538             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1539
1540             /* Update potential sum for this i atom from the interaction with this j atom. */
1541             velec            = _mm256_and_pd(velec,cutoff_mask);
1542             velec            = _mm256_andnot_pd(dummy_mask,velec);
1543             velecsum         = _mm256_add_pd(velecsum,velec);
1544
1545             fscal            = felec;
1546
1547             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1548
1549             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1550
1551             /* Calculate temporary vectorial force */
1552             tx               = _mm256_mul_pd(fscal,dx21);
1553             ty               = _mm256_mul_pd(fscal,dy21);
1554             tz               = _mm256_mul_pd(fscal,dz21);
1555
1556             /* Update vectorial force */
1557             fix2             = _mm256_add_pd(fix2,tx);
1558             fiy2             = _mm256_add_pd(fiy2,ty);
1559             fiz2             = _mm256_add_pd(fiz2,tz);
1560
1561             fjx1             = _mm256_add_pd(fjx1,tx);
1562             fjy1             = _mm256_add_pd(fjy1,ty);
1563             fjz1             = _mm256_add_pd(fjz1,tz);
1564
1565             }
1566
1567             /**************************
1568              * CALCULATE INTERACTIONS *
1569              **************************/
1570
1571             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1572             {
1573
1574             r22              = _mm256_mul_pd(rsq22,rinv22);
1575             r22              = _mm256_andnot_pd(dummy_mask,r22);
1576
1577             /* EWALD ELECTROSTATICS */
1578
1579             /* Calculate the Ewald table index by multiplying r with the table scale and truncating to an integer */
1580             ewrt             = _mm256_mul_pd(r22,ewtabscale);
1581             ewitab           = _mm256_cvttpd_epi32(ewrt);
1582             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1583             ewitab           = _mm_slli_epi32(ewitab,2);
1584             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1585             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1586             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1587             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1588             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1589             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1590             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1591             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1592             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1593
1594             d                = _mm256_sub_pd(r22,rswitch);
1595             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1596             d2               = _mm256_mul_pd(d,d);
1597             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1598
1599             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1600
1601             /* Evaluate switch function */
1602             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1603             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
1604             velec            = _mm256_mul_pd(velec,sw);
1605             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1606
1607             /* Update potential sum for this i atom from the interaction with this j atom. */
1608             velec            = _mm256_and_pd(velec,cutoff_mask);
1609             velec            = _mm256_andnot_pd(dummy_mask,velec);
1610             velecsum         = _mm256_add_pd(velecsum,velec);
1611
1612             fscal            = felec;
1613
1614             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1615
1616             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1617
1618             /* Calculate temporary vectorial force */
1619             tx               = _mm256_mul_pd(fscal,dx22);
1620             ty               = _mm256_mul_pd(fscal,dy22);
1621             tz               = _mm256_mul_pd(fscal,dz22);
1622
1623             /* Update vectorial force */
1624             fix2             = _mm256_add_pd(fix2,tx);
1625             fiy2             = _mm256_add_pd(fiy2,ty);
1626             fiz2             = _mm256_add_pd(fiz2,tz);
1627
1628             fjx2             = _mm256_add_pd(fjx2,tx);
1629             fjy2             = _mm256_add_pd(fjy2,ty);
1630             fjz2             = _mm256_add_pd(fjz2,tz);
1631
1632             }
1633
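            /* Lanes whose j index was a padding entry (jnrlist < 0) are
             * redirected to the local scratch buffer, so the forces
             * accumulated for them are discarded rather than written to f. */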
1634             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1635             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1636             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1637             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1638
1639             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1640                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1641
1642             /* Inner loop uses 612 flops */
1643         }
1644
1645         /* End of innermost loop */
1646
1647         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1648                                                  f+i_coord_offset,fshift+i_shift_offset);
1649
1650         ggid                        = gid[iidx];
1651         /* Update potential energies */
1652         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1653         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1654
1655         /* Increment number of inner iterations */
1656         inneriter                  += j_index_end - j_index_start;
1657
1658         /* Outer loop uses 20 flops */
1659     }
1660
1661     /* Increment number of outer iterations */
1662     outeriter        += nri;
1663
1664     /* Update outer/inner flops */
1665
1666     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*612);
1667 }
1668 /*
1669  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_double
1670  * Electrostatics interaction: Ewald
1671  * VdW interaction:            LennardJones
1672  * Geometry:                   Water3-Water3
1673  * Calculate force/pot:        Force
1674  */
1675 void
1676 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_double
1677                     (t_nblist                    * gmx_restrict       nlist,
1678                      rvec                        * gmx_restrict          xx,
1679                      rvec                        * gmx_restrict          ff,
1680                      t_forcerec                  * gmx_restrict          fr,
1681                      t_mdatoms                   * gmx_restrict     mdatoms,
1682                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1683                      t_nrnb                      * gmx_restrict        nrnb)
1684 {
1685     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1686      * just 0 for non-waters.
1687      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1688      * jnr indices corresponding to data put in the four positions in the SIMD register.
1689      */
1690     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1691     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1692     int              jnrA,jnrB,jnrC,jnrD;
1693     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1694     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1695     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1696     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1697     real             rcutoff_scalar;
1698     real             *shiftvec,*fshift,*x,*f;
1699     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1700     real             scratch[4*DIM];
1701     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1702     real *           vdwioffsetptr0;
1703     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1704     real *           vdwioffsetptr1;
1705     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1706     real *           vdwioffsetptr2;
1707     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1708     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1709     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1710     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1711     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1712     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1713     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1714     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1715     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1716     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1717     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1718     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1719     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1720     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1721     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1722     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1723     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1724     real             *charge;
1725     int              nvdwtype;
1726     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1727     int              *vdwtype;
1728     real             *vdwparam;
1729     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1730     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1731     __m128i          ewitab;
1732     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1733     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1734     real             *ewtab;
1735     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1736     real             rswitch_scalar,d_scalar;
1737     __m256d          dummy_mask,cutoff_mask;
1738     __m128           tmpmask0,tmpmask1;
1739     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1740     __m256d          one     = _mm256_set1_pd(1.0);
1741     __m256d          two     = _mm256_set1_pd(2.0);
1742     x                = xx[0];
1743     f                = ff[0];
1744
1745     nri              = nlist->nri;
1746     iinr             = nlist->iinr;
1747     jindex           = nlist->jindex;
1748     jjnr             = nlist->jjnr;
1749     shiftidx         = nlist->shift;
1750     gid              = nlist->gid;
1751     shiftvec         = fr->shift_vec[0];
1752     fshift           = fr->fshift[0];
1753     facel            = _mm256_set1_pd(fr->epsfac);
1754     charge           = mdatoms->chargeA;
1755     nvdwtype         = fr->ntype;
1756     vdwparam         = fr->nbfp;
1757     vdwtype          = mdatoms->typeA;
1758
1759     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
1760     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1761     beta2            = _mm256_mul_pd(beta,beta);
1762     beta3            = _mm256_mul_pd(beta,beta2);
1763
1764     ewtab            = fr->ic->tabq_coul_FDV0;
1765     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
1766     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
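
    /* tabq_coul_FDV0 holds the Ewald correction table as (F, D, V, 0)
     * quadruplets per point; ewtabhalfspace = 0.5/tabq_scale is the factor
     * used to rebuild the potential from the force-table entries. */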
1767
1768     /* Setup water-specific parameters */
1769     inr              = nlist->iinr[0];
1770     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1771     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1772     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1773     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1774
1775     jq0              = _mm256_set1_pd(charge[inr+0]);
1776     jq1              = _mm256_set1_pd(charge[inr+1]);
1777     jq2              = _mm256_set1_pd(charge[inr+2]);
1778     vdwjidx0A        = 2*vdwtype[inr+0];
1779     qq00             = _mm256_mul_pd(iq0,jq0);
1780     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1781     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1782     qq01             = _mm256_mul_pd(iq0,jq1);
1783     qq02             = _mm256_mul_pd(iq0,jq2);
1784     qq10             = _mm256_mul_pd(iq1,jq0);
1785     qq11             = _mm256_mul_pd(iq1,jq1);
1786     qq12             = _mm256_mul_pd(iq1,jq2);
1787     qq20             = _mm256_mul_pd(iq2,jq0);
1788     qq21             = _mm256_mul_pd(iq2,jq1);
1789     qq22             = _mm256_mul_pd(iq2,jq2);
1790
1791     /* When we use explicit cutoffs, the value must be identical for elec and VdW, so we use the elec cutoff as the (arbitrary) choice */
1792     rcutoff_scalar   = fr->rcoulomb;
1793     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1794     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1795
1796     rswitch_scalar   = fr->rcoulomb_switch;
1797     rswitch          = _mm256_set1_pd(rswitch_scalar);
1798     /* Setup switch parameters */
1799     d_scalar         = rcutoff_scalar-rswitch_scalar;
1800     d                = _mm256_set1_pd(d_scalar);
1801     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1802     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1803     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1804     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1805     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1806     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
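
    /* With d_scalar = rcutoff - rswitch these are the coefficients of the
     * standard quintic potential switch S(x) = 1 - 10x^3 + 15x^4 - 6x^5,
     * x = (r - rswitch)/d_scalar, which equals 1 at rswitch and reaches 0 at
     * rcutoff with vanishing first and second derivatives at both ends;
     * swF2..swF4 are the matching coefficients of dS/dr. */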
1807
1808     /* Avoid stupid compiler warnings */
1809     jnrA = jnrB = jnrC = jnrD = 0;
1810     j_coord_offsetA = 0;
1811     j_coord_offsetB = 0;
1812     j_coord_offsetC = 0;
1813     j_coord_offsetD = 0;
1814
1815     outeriter        = 0;
1816     inneriter        = 0;
1817
1818     for(iidx=0;iidx<4*DIM;iidx++)
1819     {
1820         scratch[iidx] = 0.0;
1821     }
1822
1823     /* Start outer loop over neighborlists */
1824     for(iidx=0; iidx<nri; iidx++)
1825     {
1826         /* Load shift vector for this list */
1827         i_shift_offset   = DIM*shiftidx[iidx];
1828
1829         /* Load limits for loop over neighbors */
1830         j_index_start    = jindex[iidx];
1831         j_index_end      = jindex[iidx+1];
1832
1833         /* Get outer coordinate index */
1834         inr              = iinr[iidx];
1835         i_coord_offset   = DIM*inr;
1836
1837         /* Load i particle coords and add shift vector */
1838         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1839                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1840
1841         fix0             = _mm256_setzero_pd();
1842         fiy0             = _mm256_setzero_pd();
1843         fiz0             = _mm256_setzero_pd();
1844         fix1             = _mm256_setzero_pd();
1845         fiy1             = _mm256_setzero_pd();
1846         fiz1             = _mm256_setzero_pd();
1847         fix2             = _mm256_setzero_pd();
1848         fiy2             = _mm256_setzero_pd();
1849         fiz2             = _mm256_setzero_pd();
1850
1851         /* Start inner kernel loop */
1852         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1853         {
1854
1855             /* Get j neighbor index, and coordinate index */
1856             jnrA             = jjnr[jidx];
1857             jnrB             = jjnr[jidx+1];
1858             jnrC             = jjnr[jidx+2];
1859             jnrD             = jjnr[jidx+3];
1860             j_coord_offsetA  = DIM*jnrA;
1861             j_coord_offsetB  = DIM*jnrB;
1862             j_coord_offsetC  = DIM*jnrC;
1863             j_coord_offsetD  = DIM*jnrD;
1864
1865             /* load j atom coordinates */
1866             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1867                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1868                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1869
1870             /* Calculate displacement vector */
1871             dx00             = _mm256_sub_pd(ix0,jx0);
1872             dy00             = _mm256_sub_pd(iy0,jy0);
1873             dz00             = _mm256_sub_pd(iz0,jz0);
1874             dx01             = _mm256_sub_pd(ix0,jx1);
1875             dy01             = _mm256_sub_pd(iy0,jy1);
1876             dz01             = _mm256_sub_pd(iz0,jz1);
1877             dx02             = _mm256_sub_pd(ix0,jx2);
1878             dy02             = _mm256_sub_pd(iy0,jy2);
1879             dz02             = _mm256_sub_pd(iz0,jz2);
1880             dx10             = _mm256_sub_pd(ix1,jx0);
1881             dy10             = _mm256_sub_pd(iy1,jy0);
1882             dz10             = _mm256_sub_pd(iz1,jz0);
1883             dx11             = _mm256_sub_pd(ix1,jx1);
1884             dy11             = _mm256_sub_pd(iy1,jy1);
1885             dz11             = _mm256_sub_pd(iz1,jz1);
1886             dx12             = _mm256_sub_pd(ix1,jx2);
1887             dy12             = _mm256_sub_pd(iy1,jy2);
1888             dz12             = _mm256_sub_pd(iz1,jz2);
1889             dx20             = _mm256_sub_pd(ix2,jx0);
1890             dy20             = _mm256_sub_pd(iy2,jy0);
1891             dz20             = _mm256_sub_pd(iz2,jz0);
1892             dx21             = _mm256_sub_pd(ix2,jx1);
1893             dy21             = _mm256_sub_pd(iy2,jy1);
1894             dz21             = _mm256_sub_pd(iz2,jz1);
1895             dx22             = _mm256_sub_pd(ix2,jx2);
1896             dy22             = _mm256_sub_pd(iy2,jy2);
1897             dz22             = _mm256_sub_pd(iz2,jz2);
1898
1899             /* Calculate squared distance and things based on it */
1900             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1901             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1902             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1903             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1904             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1905             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1906             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1907             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1908             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1909
1910             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1911             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
1912             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
1913             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
1914             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1915             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1916             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
1917             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1918             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1919
1920             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1921             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
1922             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
1923             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1924             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1925             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1926             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1927             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1928             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1929
1930             fjx0             = _mm256_setzero_pd();
1931             fjy0             = _mm256_setzero_pd();
1932             fjz0             = _mm256_setzero_pd();
1933             fjx1             = _mm256_setzero_pd();
1934             fjy1             = _mm256_setzero_pd();
1935             fjz1             = _mm256_setzero_pd();
1936             fjx2             = _mm256_setzero_pd();
1937             fjy2             = _mm256_setzero_pd();
1938             fjz2             = _mm256_setzero_pd();
1939
1940             /**************************
1941              * CALCULATE INTERACTIONS *
1942              **************************/
1943
1944             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1945             {
1946
1947             r00              = _mm256_mul_pd(rsq00,rinv00);
1948
1949             /* EWALD ELECTROSTATICS */
1950
1951             /* Calculate the Ewald table index by multiplying r with the table scale and truncating to an integer */
1952             ewrt             = _mm256_mul_pd(r00,ewtabscale);
1953             ewitab           = _mm256_cvttpd_epi32(ewrt);
1954             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1955             ewitab           = _mm_slli_epi32(ewitab,2);
1956             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1957             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1958             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1959             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1960             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1961             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1962             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1963             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
1964             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1965
1966             /* LENNARD-JONES DISPERSION/REPULSION */
1967
1968             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1969             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
1970             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1971             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1972             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1973
1974             d                = _mm256_sub_pd(r00,rswitch);
1975             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1976             d2               = _mm256_mul_pd(d,d);
1977             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1978
1979             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1980
1981             /* Evaluate switch function */
1982             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1983             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
1984             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1985             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1986
1987             fscal            = _mm256_add_pd(felec,fvdw);
1988
1989             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1990
1991             /* Calculate temporary vectorial force */
1992             tx               = _mm256_mul_pd(fscal,dx00);
1993             ty               = _mm256_mul_pd(fscal,dy00);
1994             tz               = _mm256_mul_pd(fscal,dz00);
1995
1996             /* Update vectorial force */
1997             fix0             = _mm256_add_pd(fix0,tx);
1998             fiy0             = _mm256_add_pd(fiy0,ty);
1999             fiz0             = _mm256_add_pd(fiz0,tz);
2000
2001             fjx0             = _mm256_add_pd(fjx0,tx);
2002             fjy0             = _mm256_add_pd(fjy0,ty);
2003             fjz0             = _mm256_add_pd(fjz0,tz);
2004
2005             }
2006
2007             /**************************
2008              * CALCULATE INTERACTIONS *
2009              **************************/
2010
2011             if (gmx_mm256_any_lt(rsq01,rcutoff2))
2012             {
2013
2014             r01              = _mm256_mul_pd(rsq01,rinv01);
2015
2016             /* EWALD ELECTROSTATICS */
2017
2018             /* Calculate the Ewald table index by multiplying r with the table scale and truncating to an integer */
2019             ewrt             = _mm256_mul_pd(r01,ewtabscale);
2020             ewitab           = _mm256_cvttpd_epi32(ewrt);
2021             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2022             ewitab           = _mm_slli_epi32(ewitab,2);
2023             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2024             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2025             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2026             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2027             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2028             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2029             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2030             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
2031             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2032
2033             d                = _mm256_sub_pd(r01,rswitch);
2034             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2035             d2               = _mm256_mul_pd(d,d);
2036             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2037
2038             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2039
2040             /* Evaluate switch function */
2041             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2042             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv01,_mm256_mul_pd(velec,dsw)) );
2043             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
2044
2045             fscal            = felec;
2046
2047             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2048
2049             /* Calculate temporary vectorial force */
2050             tx               = _mm256_mul_pd(fscal,dx01);
2051             ty               = _mm256_mul_pd(fscal,dy01);
2052             tz               = _mm256_mul_pd(fscal,dz01);
2053
2054             /* Update vectorial force */
2055             fix0             = _mm256_add_pd(fix0,tx);
2056             fiy0             = _mm256_add_pd(fiy0,ty);
2057             fiz0             = _mm256_add_pd(fiz0,tz);
2058
2059             fjx1             = _mm256_add_pd(fjx1,tx);
2060             fjy1             = _mm256_add_pd(fjy1,ty);
2061             fjz1             = _mm256_add_pd(fjz1,tz);
2062
2063             }
2064
2065             /**************************
2066              * CALCULATE INTERACTIONS *
2067              **************************/
2068
2069             if (gmx_mm256_any_lt(rsq02,rcutoff2))
2070             {
2071
2072             r02              = _mm256_mul_pd(rsq02,rinv02);
2073
2074             /* EWALD ELECTROSTATICS */
2075
2076             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2077             ewrt             = _mm256_mul_pd(r02,ewtabscale);
2078             ewitab           = _mm256_cvttpd_epi32(ewrt);
2079             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2080             ewitab           = _mm_slli_epi32(ewitab,2);
2081             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2082             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2083             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2084             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2085             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2086             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2087             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2088             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
2089             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2090
2091             d                = _mm256_sub_pd(r02,rswitch);
2092             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2093             d2               = _mm256_mul_pd(d,d);
2094             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2095
2096             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2097
2098             /* Evaluate switch function */
2099             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2100             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv02,_mm256_mul_pd(velec,dsw)) );
2101             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
2102
2103             fscal            = felec;
2104
2105             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2106
2107             /* Calculate temporary vectorial force */
2108             tx               = _mm256_mul_pd(fscal,dx02);
2109             ty               = _mm256_mul_pd(fscal,dy02);
2110             tz               = _mm256_mul_pd(fscal,dz02);
2111
2112             /* Update vectorial force */
2113             fix0             = _mm256_add_pd(fix0,tx);
2114             fiy0             = _mm256_add_pd(fiy0,ty);
2115             fiz0             = _mm256_add_pd(fiz0,tz);
2116
2117             fjx2             = _mm256_add_pd(fjx2,tx);
2118             fjy2             = _mm256_add_pd(fjy2,ty);
2119             fjz2             = _mm256_add_pd(fjz2,tz);
2120
2121             }
2122
2123             /**************************
2124              * CALCULATE INTERACTIONS *
2125              **************************/
2126
2127             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2128             {
2129
2130             r10              = _mm256_mul_pd(rsq10,rinv10);
2131
2132             /* EWALD ELECTROSTATICS */
2133
2134             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2135             ewrt             = _mm256_mul_pd(r10,ewtabscale);
2136             ewitab           = _mm256_cvttpd_epi32(ewrt);
2137             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2138             ewitab           = _mm_slli_epi32(ewitab,2);
2139             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2140             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2141             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2142             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2143             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2144             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2145             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2146             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
2147             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2148
2149             d                = _mm256_sub_pd(r10,rswitch);
2150             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2151             d2               = _mm256_mul_pd(d,d);
2152             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2153
2154             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2155
2156             /* Evaluate switch function */
2157             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2158             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
2159             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2160
2161             fscal            = felec;
2162
2163             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2164
2165             /* Calculate temporary vectorial force */
2166             tx               = _mm256_mul_pd(fscal,dx10);
2167             ty               = _mm256_mul_pd(fscal,dy10);
2168             tz               = _mm256_mul_pd(fscal,dz10);
2169
2170             /* Update vectorial force */
2171             fix1             = _mm256_add_pd(fix1,tx);
2172             fiy1             = _mm256_add_pd(fiy1,ty);
2173             fiz1             = _mm256_add_pd(fiz1,tz);
2174
2175             fjx0             = _mm256_add_pd(fjx0,tx);
2176             fjy0             = _mm256_add_pd(fjy0,ty);
2177             fjz0             = _mm256_add_pd(fjz0,tz);
2178
2179             }
2180
2181             /**************************
2182              * CALCULATE INTERACTIONS *
2183              **************************/
2184
2185             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2186             {
2187
2188             r11              = _mm256_mul_pd(rsq11,rinv11);
2189
2190             /* EWALD ELECTROSTATICS */
2191
2192             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2193             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2194             ewitab           = _mm256_cvttpd_epi32(ewrt);
2195             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2196             ewitab           = _mm_slli_epi32(ewitab,2);
2197             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2198             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2199             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2200             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2201             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2202             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2203             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2204             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2205             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2206
2207             d                = _mm256_sub_pd(r11,rswitch);
2208             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2209             d2               = _mm256_mul_pd(d,d);
2210             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2211
2212             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2213
2214             /* Evaluate switch function */
2215             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2216             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2217             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2218
2219             fscal            = felec;
2220
2221             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2222
2223             /* Calculate temporary vectorial force */
2224             tx               = _mm256_mul_pd(fscal,dx11);
2225             ty               = _mm256_mul_pd(fscal,dy11);
2226             tz               = _mm256_mul_pd(fscal,dz11);
2227
2228             /* Update vectorial force */
2229             fix1             = _mm256_add_pd(fix1,tx);
2230             fiy1             = _mm256_add_pd(fiy1,ty);
2231             fiz1             = _mm256_add_pd(fiz1,tz);
2232
2233             fjx1             = _mm256_add_pd(fjx1,tx);
2234             fjy1             = _mm256_add_pd(fjy1,ty);
2235             fjz1             = _mm256_add_pd(fjz1,tz);
2236
2237             }
2238
2239             /**************************
2240              * CALCULATE INTERACTIONS *
2241              **************************/
2242
2243             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2244             {
2245
2246             r12              = _mm256_mul_pd(rsq12,rinv12);
2247
2248             /* EWALD ELECTROSTATICS */
2249
2250             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2251             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2252             ewitab           = _mm256_cvttpd_epi32(ewrt);
2253             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2254             ewitab           = _mm_slli_epi32(ewitab,2);
2255             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2256             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2257             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2258             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2259             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2260             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2261             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2262             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2263             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2264
2265             d                = _mm256_sub_pd(r12,rswitch);
2266             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2267             d2               = _mm256_mul_pd(d,d);
2268             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2269
2270             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2271
2272             /* Evaluate switch function */
2273             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2274             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2275             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2276
2277             fscal            = felec;
2278
2279             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2280
2281             /* Calculate temporary vectorial force */
2282             tx               = _mm256_mul_pd(fscal,dx12);
2283             ty               = _mm256_mul_pd(fscal,dy12);
2284             tz               = _mm256_mul_pd(fscal,dz12);
2285
2286             /* Update vectorial force */
2287             fix1             = _mm256_add_pd(fix1,tx);
2288             fiy1             = _mm256_add_pd(fiy1,ty);
2289             fiz1             = _mm256_add_pd(fiz1,tz);
2290
2291             fjx2             = _mm256_add_pd(fjx2,tx);
2292             fjy2             = _mm256_add_pd(fjy2,ty);
2293             fjz2             = _mm256_add_pd(fjz2,tz);
2294
2295             }
2296
2297             /**************************
2298              * CALCULATE INTERACTIONS *
2299              **************************/
2300
2301             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2302             {
2303
2304             r20              = _mm256_mul_pd(rsq20,rinv20);
2305
2306             /* EWALD ELECTROSTATICS */
2307
2308             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2309             ewrt             = _mm256_mul_pd(r20,ewtabscale);
2310             ewitab           = _mm256_cvttpd_epi32(ewrt);
2311             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2312             ewitab           = _mm_slli_epi32(ewitab,2);
2313             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2314             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2315             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2316             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2317             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2318             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2319             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2320             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
2321             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2322
2323             d                = _mm256_sub_pd(r20,rswitch);
2324             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2325             d2               = _mm256_mul_pd(d,d);
2326             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2327
2328             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2329
2330             /* Evaluate switch function */
2331             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2332             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
2333             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2334
2335             fscal            = felec;
2336
2337             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2338
2339             /* Calculate temporary vectorial force */
2340             tx               = _mm256_mul_pd(fscal,dx20);
2341             ty               = _mm256_mul_pd(fscal,dy20);
2342             tz               = _mm256_mul_pd(fscal,dz20);
2343
2344             /* Update vectorial force */
2345             fix2             = _mm256_add_pd(fix2,tx);
2346             fiy2             = _mm256_add_pd(fiy2,ty);
2347             fiz2             = _mm256_add_pd(fiz2,tz);
2348
2349             fjx0             = _mm256_add_pd(fjx0,tx);
2350             fjy0             = _mm256_add_pd(fjy0,ty);
2351             fjz0             = _mm256_add_pd(fjz0,tz);
2352
2353             }
2354
2355             /**************************
2356              * CALCULATE INTERACTIONS *
2357              **************************/
2358
2359             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2360             {
2361
2362             r21              = _mm256_mul_pd(rsq21,rinv21);
2363
2364             /* EWALD ELECTROSTATICS */
2365
2366             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2367             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2368             ewitab           = _mm256_cvttpd_epi32(ewrt);
2369             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2370             ewitab           = _mm_slli_epi32(ewitab,2);
2371             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2372             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2373             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2374             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2375             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2376             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2377             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2378             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
2379             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2380
2381             d                = _mm256_sub_pd(r21,rswitch);
2382             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2383             d2               = _mm256_mul_pd(d,d);
2384             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2385
2386             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2387
2388             /* Evaluate switch function */
2389             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2390             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
2391             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2392
2393             fscal            = felec;
2394
2395             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2396
2397             /* Calculate temporary vectorial force */
2398             tx               = _mm256_mul_pd(fscal,dx21);
2399             ty               = _mm256_mul_pd(fscal,dy21);
2400             tz               = _mm256_mul_pd(fscal,dz21);
2401
2402             /* Update vectorial force */
2403             fix2             = _mm256_add_pd(fix2,tx);
2404             fiy2             = _mm256_add_pd(fiy2,ty);
2405             fiz2             = _mm256_add_pd(fiz2,tz);
2406
2407             fjx1             = _mm256_add_pd(fjx1,tx);
2408             fjy1             = _mm256_add_pd(fjy1,ty);
2409             fjz1             = _mm256_add_pd(fjz1,tz);
2410
2411             }
2412
2413             /**************************
2414              * CALCULATE INTERACTIONS *
2415              **************************/
2416
2417             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2418             {
2419
2420             r22              = _mm256_mul_pd(rsq22,rinv22);
2421
2422             /* EWALD ELECTROSTATICS */
2423
2424             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2425             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2426             ewitab           = _mm256_cvttpd_epi32(ewrt);
2427             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2428             ewitab           = _mm_slli_epi32(ewitab,2);
2429             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2430             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2431             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2432             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2433             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2434             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2435             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2436             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
2437             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2438
2439             d                = _mm256_sub_pd(r22,rswitch);
2440             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2441             d2               = _mm256_mul_pd(d,d);
2442             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2443
2444             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2445
2446             /* Evaluate switch function */
2447             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2448             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
2449             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2450
2451             fscal            = felec;
2452
2453             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2454
2455             /* Calculate temporary vectorial force */
2456             tx               = _mm256_mul_pd(fscal,dx22);
2457             ty               = _mm256_mul_pd(fscal,dy22);
2458             tz               = _mm256_mul_pd(fscal,dz22);
2459
2460             /* Update vectorial force */
2461             fix2             = _mm256_add_pd(fix2,tx);
2462             fiy2             = _mm256_add_pd(fiy2,ty);
2463             fiz2             = _mm256_add_pd(fiz2,tz);
2464
2465             fjx2             = _mm256_add_pd(fjx2,tx);
2466             fjy2             = _mm256_add_pd(fjy2,ty);
2467             fjz2             = _mm256_add_pd(fjz2,tz);
2468
2469             }
2470
2471             fjptrA             = f+j_coord_offsetA;
2472             fjptrB             = f+j_coord_offsetB;
2473             fjptrC             = f+j_coord_offsetC;
2474             fjptrD             = f+j_coord_offsetD;
2475
2476             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2477                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
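            /* Scatter the accumulated j forces back to memory: for each of the four j waters the
             * nine fj* components are swizzled back to per-atom layout and subtracted from f[],
             * which applies Newton's third law for all i-j pairs handled in this iteration.
             */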
2478
2479             /* Inner loop uses 573 flops */
2480         }
2481
2482         if(jidx<j_index_end)
2483         {
2484
2485             /* Get j neighbor index, and coordinate index */
2486             jnrlistA         = jjnr[jidx];
2487             jnrlistB         = jjnr[jidx+1];
2488             jnrlistC         = jjnr[jidx+2];
2489             jnrlistD         = jjnr[jidx+3];
2490             /* Each jjnr entry is negative for non-real (padding) atoms.
2491              * The mask below is therefore all-ones for dummy entries and all-zeros for real
2492              * ones, and is used as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
2493              */
2494             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2495
2496             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2497             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2498             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2499
2500             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2501             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2502             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2503             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2504             j_coord_offsetA  = DIM*jnrA;
2505             j_coord_offsetB  = DIM*jnrB;
2506             j_coord_offsetC  = DIM*jnrC;
2507             j_coord_offsetD  = DIM*jnrD;
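            /* Padding lanes (negative jjnr entries) are redirected to atom 0 above so that the
             * coordinate loads below stay within bounds; their spurious contributions are removed
             * later by clearing r and fscal with _mm256_andnot_pd(dummy_mask,...), so nothing from
             * these lanes reaches the force accumulators.
             */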
2508
2509             /* load j atom coordinates */
2510             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2511                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2512                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2513
2514             /* Calculate displacement vector */
2515             dx00             = _mm256_sub_pd(ix0,jx0);
2516             dy00             = _mm256_sub_pd(iy0,jy0);
2517             dz00             = _mm256_sub_pd(iz0,jz0);
2518             dx01             = _mm256_sub_pd(ix0,jx1);
2519             dy01             = _mm256_sub_pd(iy0,jy1);
2520             dz01             = _mm256_sub_pd(iz0,jz1);
2521             dx02             = _mm256_sub_pd(ix0,jx2);
2522             dy02             = _mm256_sub_pd(iy0,jy2);
2523             dz02             = _mm256_sub_pd(iz0,jz2);
2524             dx10             = _mm256_sub_pd(ix1,jx0);
2525             dy10             = _mm256_sub_pd(iy1,jy0);
2526             dz10             = _mm256_sub_pd(iz1,jz0);
2527             dx11             = _mm256_sub_pd(ix1,jx1);
2528             dy11             = _mm256_sub_pd(iy1,jy1);
2529             dz11             = _mm256_sub_pd(iz1,jz1);
2530             dx12             = _mm256_sub_pd(ix1,jx2);
2531             dy12             = _mm256_sub_pd(iy1,jy2);
2532             dz12             = _mm256_sub_pd(iz1,jz2);
2533             dx20             = _mm256_sub_pd(ix2,jx0);
2534             dy20             = _mm256_sub_pd(iy2,jy0);
2535             dz20             = _mm256_sub_pd(iz2,jz0);
2536             dx21             = _mm256_sub_pd(ix2,jx1);
2537             dy21             = _mm256_sub_pd(iy2,jy1);
2538             dz21             = _mm256_sub_pd(iz2,jz1);
2539             dx22             = _mm256_sub_pd(ix2,jx2);
2540             dy22             = _mm256_sub_pd(iy2,jy2);
2541             dz22             = _mm256_sub_pd(iz2,jz2);
2542
2543             /* Calculate squared distance and things based on it */
2544             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2545             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
2546             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
2547             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
2548             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2549             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2550             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
2551             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2552             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2553
2554             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
2555             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
2556             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
2557             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
2558             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
2559             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
2560             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
2561             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
2562             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
2563
2564             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
2565             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
2566             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
2567             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
2568             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
2569             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
2570             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
2571             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
2572             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
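            /* rsq.. = dx*dx+dy*dy+dz*dz and rinv.. = 1/sqrt(rsq..) for the nine i-j atom pairs
             * (gmx_mm256_invsqrt_pd presumably refines a hardware rsqrt estimate to double
             * precision); rinvsq.. feeds the Ewald force for every pair and, for the 0-0 pair,
             * the Lennard-Jones terms below.
             */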
2573
2574             fjx0             = _mm256_setzero_pd();
2575             fjy0             = _mm256_setzero_pd();
2576             fjz0             = _mm256_setzero_pd();
2577             fjx1             = _mm256_setzero_pd();
2578             fjy1             = _mm256_setzero_pd();
2579             fjz1             = _mm256_setzero_pd();
2580             fjx2             = _mm256_setzero_pd();
2581             fjy2             = _mm256_setzero_pd();
2582             fjz2             = _mm256_setzero_pd();
2583
2584             /**************************
2585              * CALCULATE INTERACTIONS *
2586              **************************/
2587
2588             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2589             {
2590
2591             r00              = _mm256_mul_pd(rsq00,rinv00);
2592             r00              = _mm256_andnot_pd(dummy_mask,r00);
2593
2594             /* EWALD ELECTROSTATICS */
2595
2596             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2597             ewrt             = _mm256_mul_pd(r00,ewtabscale);
2598             ewitab           = _mm256_cvttpd_epi32(ewrt);
2599             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2600             ewitab           = _mm_slli_epi32(ewitab,2);
2601             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2602             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2603             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2604             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2605             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2606             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2607             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2608             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
2609             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2610
2611             /* LENNARD-JONES DISPERSION/REPULSION */
2612
2613             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2614             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
2615             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2616             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2617             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
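            /* In scalar form vvdw = c12_00*rinv^12/12 - c6_00*rinv^6/6, and
             * fvdw = (c12_00*rinv^12 - c6_00*rinv^6)*rinv^2 = -(1/r)*d(vvdw)/dr, i.e. the scalar
             * Lennard-Jones force divided by r, matching the fscal convention used below.
             */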
2618
2619             d                = _mm256_sub_pd(r00,rswitch);
2620             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2621             d2               = _mm256_mul_pd(d,d);
2622             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2623
2624             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2625
2626             /* Evaluate switch function */
2627             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2628             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
2629             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
2630             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2631
2632             fscal            = _mm256_add_pd(felec,fvdw);
2633
2634             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2635
2636             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
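            /* Clear the force on padding lanes so that the j-force decrement at the end of this
             * iteration does not perturb atom 0, to which those lanes were clamped.
             */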
2637
2638             /* Calculate temporary vectorial force */
2639             tx               = _mm256_mul_pd(fscal,dx00);
2640             ty               = _mm256_mul_pd(fscal,dy00);
2641             tz               = _mm256_mul_pd(fscal,dz00);
2642
2643             /* Update vectorial force */
2644             fix0             = _mm256_add_pd(fix0,tx);
2645             fiy0             = _mm256_add_pd(fiy0,ty);
2646             fiz0             = _mm256_add_pd(fiz0,tz);
2647
2648             fjx0             = _mm256_add_pd(fjx0,tx);
2649             fjy0             = _mm256_add_pd(fjy0,ty);
2650             fjz0             = _mm256_add_pd(fjz0,tz);
2651
2652             }
2653
2654             /**************************
2655              * CALCULATE INTERACTIONS *
2656              **************************/
2657
2658             if (gmx_mm256_any_lt(rsq01,rcutoff2))
2659             {
2660
2661             r01              = _mm256_mul_pd(rsq01,rinv01);
2662             r01              = _mm256_andnot_pd(dummy_mask,r01);
2663
2664             /* EWALD ELECTROSTATICS */
2665
2666             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2667             ewrt             = _mm256_mul_pd(r01,ewtabscale);
2668             ewitab           = _mm256_cvttpd_epi32(ewrt);
2669             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2670             ewitab           = _mm_slli_epi32(ewitab,2);
2671             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2672             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2673             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2674             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2675             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2676             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2677             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2678             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
2679             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2680
2681             d                = _mm256_sub_pd(r01,rswitch);
2682             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2683             d2               = _mm256_mul_pd(d,d);
2684             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2685
2686             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2687
2688             /* Evaluate switch function */
2689             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2690             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv01,_mm256_mul_pd(velec,dsw)) );
2691             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
2692
2693             fscal            = felec;
2694
2695             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2696
2697             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2698
2699             /* Calculate temporary vectorial force */
2700             tx               = _mm256_mul_pd(fscal,dx01);
2701             ty               = _mm256_mul_pd(fscal,dy01);
2702             tz               = _mm256_mul_pd(fscal,dz01);
2703
2704             /* Update vectorial force */
2705             fix0             = _mm256_add_pd(fix0,tx);
2706             fiy0             = _mm256_add_pd(fiy0,ty);
2707             fiz0             = _mm256_add_pd(fiz0,tz);
2708
2709             fjx1             = _mm256_add_pd(fjx1,tx);
2710             fjy1             = _mm256_add_pd(fjy1,ty);
2711             fjz1             = _mm256_add_pd(fjz1,tz);
2712
2713             }
2714
2715             /**************************
2716              * CALCULATE INTERACTIONS *
2717              **************************/
2718
2719             if (gmx_mm256_any_lt(rsq02,rcutoff2))
2720             {
2721
2722             r02              = _mm256_mul_pd(rsq02,rinv02);
2723             r02              = _mm256_andnot_pd(dummy_mask,r02);
2724
2725             /* EWALD ELECTROSTATICS */
2726
2727             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2728             ewrt             = _mm256_mul_pd(r02,ewtabscale);
2729             ewitab           = _mm256_cvttpd_epi32(ewrt);
2730             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2731             ewitab           = _mm_slli_epi32(ewitab,2);
2732             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2733             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2734             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2735             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2736             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2737             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2738             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2739             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
2740             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2741
2742             d                = _mm256_sub_pd(r02,rswitch);
2743             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2744             d2               = _mm256_mul_pd(d,d);
2745             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2746
2747             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2748
2749             /* Evaluate switch function */
2750             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2751             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv02,_mm256_mul_pd(velec,dsw)) );
2752             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
2753
2754             fscal            = felec;
2755
2756             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2757
2758             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2759
2760             /* Calculate temporary vectorial force */
2761             tx               = _mm256_mul_pd(fscal,dx02);
2762             ty               = _mm256_mul_pd(fscal,dy02);
2763             tz               = _mm256_mul_pd(fscal,dz02);
2764
2765             /* Update vectorial force */
2766             fix0             = _mm256_add_pd(fix0,tx);
2767             fiy0             = _mm256_add_pd(fiy0,ty);
2768             fiz0             = _mm256_add_pd(fiz0,tz);
2769
2770             fjx2             = _mm256_add_pd(fjx2,tx);
2771             fjy2             = _mm256_add_pd(fjy2,ty);
2772             fjz2             = _mm256_add_pd(fjz2,tz);
2773
2774             }
2775
2776             /**************************
2777              * CALCULATE INTERACTIONS *
2778              **************************/
2779
2780             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2781             {
2782
2783             r10              = _mm256_mul_pd(rsq10,rinv10);
2784             r10              = _mm256_andnot_pd(dummy_mask,r10);
2785
2786             /* EWALD ELECTROSTATICS */
2787
2788             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2789             ewrt             = _mm256_mul_pd(r10,ewtabscale);
2790             ewitab           = _mm256_cvttpd_epi32(ewrt);
2791             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2792             ewitab           = _mm_slli_epi32(ewitab,2);
2793             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2794             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2795             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2796             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2797             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2798             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2799             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2800             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
2801             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2802
2803             d                = _mm256_sub_pd(r10,rswitch);
2804             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2805             d2               = _mm256_mul_pd(d,d);
2806             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2807
2808             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2809
2810             /* Evaluate switch function */
2811             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2812             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
2813             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2814
2815             fscal            = felec;
2816
2817             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2818
2819             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2820
2821             /* Calculate temporary vectorial force */
2822             tx               = _mm256_mul_pd(fscal,dx10);
2823             ty               = _mm256_mul_pd(fscal,dy10);
2824             tz               = _mm256_mul_pd(fscal,dz10);
2825
2826             /* Update vectorial force */
2827             fix1             = _mm256_add_pd(fix1,tx);
2828             fiy1             = _mm256_add_pd(fiy1,ty);
2829             fiz1             = _mm256_add_pd(fiz1,tz);
2830
2831             fjx0             = _mm256_add_pd(fjx0,tx);
2832             fjy0             = _mm256_add_pd(fjy0,ty);
2833             fjz0             = _mm256_add_pd(fjz0,tz);
2834
2835             }
2836
2837             /**************************
2838              * CALCULATE INTERACTIONS *
2839              **************************/
2840
2841             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2842             {
2843
2844             r11              = _mm256_mul_pd(rsq11,rinv11);
2845             r11              = _mm256_andnot_pd(dummy_mask,r11);
2846
2847             /* EWALD ELECTROSTATICS */
2848
2849             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2850             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2851             ewitab           = _mm256_cvttpd_epi32(ewrt);
2852             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2853             ewitab           = _mm_slli_epi32(ewitab,2);
2854             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2855             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2856             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2857             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2858             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2859             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2860             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2861             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2862             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2863
2864             d                = _mm256_sub_pd(r11,rswitch);
2865             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2866             d2               = _mm256_mul_pd(d,d);
2867             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2868
2869             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2870
2871             /* Evaluate switch function */
2872             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2873             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2874             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2875
2876             fscal            = felec;
2877
2878             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2879
2880             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2881
2882             /* Calculate temporary vectorial force */
2883             tx               = _mm256_mul_pd(fscal,dx11);
2884             ty               = _mm256_mul_pd(fscal,dy11);
2885             tz               = _mm256_mul_pd(fscal,dz11);
2886
2887             /* Update vectorial force */
2888             fix1             = _mm256_add_pd(fix1,tx);
2889             fiy1             = _mm256_add_pd(fiy1,ty);
2890             fiz1             = _mm256_add_pd(fiz1,tz);
2891
2892             fjx1             = _mm256_add_pd(fjx1,tx);
2893             fjy1             = _mm256_add_pd(fjy1,ty);
2894             fjz1             = _mm256_add_pd(fjz1,tz);
2895
2896             }
2897
2898             /**************************
2899              * CALCULATE INTERACTIONS *
2900              **************************/
2901
2902             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2903             {
2904
2905             r12              = _mm256_mul_pd(rsq12,rinv12);
2906             r12              = _mm256_andnot_pd(dummy_mask,r12);
2907
2908             /* EWALD ELECTROSTATICS */
2909
2910             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2911             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2912             ewitab           = _mm256_cvttpd_epi32(ewrt);
2913             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2914             ewitab           = _mm_slli_epi32(ewitab,2);
2915             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2916             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2917             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2918             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2919             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2920             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2921             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2922             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2923             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2924
2925             d                = _mm256_sub_pd(r12,rswitch);
2926             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2927             d2               = _mm256_mul_pd(d,d);
2928             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2929
2930             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2931
2932             /* Evaluate switch function */
2933             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2934             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2935             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2936
2937             fscal            = felec;
2938
2939             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2940
2941             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2942
2943             /* Calculate temporary vectorial force */
2944             tx               = _mm256_mul_pd(fscal,dx12);
2945             ty               = _mm256_mul_pd(fscal,dy12);
2946             tz               = _mm256_mul_pd(fscal,dz12);
2947
2948             /* Update vectorial force */
2949             fix1             = _mm256_add_pd(fix1,tx);
2950             fiy1             = _mm256_add_pd(fiy1,ty);
2951             fiz1             = _mm256_add_pd(fiz1,tz);
2952
2953             fjx2             = _mm256_add_pd(fjx2,tx);
2954             fjy2             = _mm256_add_pd(fjy2,ty);
2955             fjz2             = _mm256_add_pd(fjz2,tz);
2956
2957             }
2958
2959             /**************************
2960              * CALCULATE INTERACTIONS *
2961              **************************/
2962
2963             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2964             {
2965
2966             r20              = _mm256_mul_pd(rsq20,rinv20);
2967             r20              = _mm256_andnot_pd(dummy_mask,r20);
2968
2969             /* EWALD ELECTROSTATICS */
2970
2971             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
2972             ewrt             = _mm256_mul_pd(r20,ewtabscale);
2973             ewitab           = _mm256_cvttpd_epi32(ewrt);
2974             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2975             ewitab           = _mm_slli_epi32(ewitab,2);
2976             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2977             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2978             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2979             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2980             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2981             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2982             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2983             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
2984             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2985
2986             d                = _mm256_sub_pd(r20,rswitch);
2987             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2988             d2               = _mm256_mul_pd(d,d);
2989             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2990
2991             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2992
2993             /* Evaluate switch function */
2994             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2995             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
2996             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2997
2998             fscal            = felec;
2999
3000             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3001
3002             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3003
3004             /* Calculate temporary vectorial force */
3005             tx               = _mm256_mul_pd(fscal,dx20);
3006             ty               = _mm256_mul_pd(fscal,dy20);
3007             tz               = _mm256_mul_pd(fscal,dz20);
3008
3009             /* Update vectorial force */
3010             fix2             = _mm256_add_pd(fix2,tx);
3011             fiy2             = _mm256_add_pd(fiy2,ty);
3012             fiz2             = _mm256_add_pd(fiz2,tz);
3013
3014             fjx0             = _mm256_add_pd(fjx0,tx);
3015             fjy0             = _mm256_add_pd(fjy0,ty);
3016             fjz0             = _mm256_add_pd(fjz0,tz);
3017
3018             }
3019
3020             /**************************
3021              * CALCULATE INTERACTIONS *
3022              **************************/
3023
3024             if (gmx_mm256_any_lt(rsq21,rcutoff2))
3025             {
3026
3027             r21              = _mm256_mul_pd(rsq21,rinv21);
3028             r21              = _mm256_andnot_pd(dummy_mask,r21);
3029
3030             /* EWALD ELECTROSTATICS */
3031
3032             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
3033             ewrt             = _mm256_mul_pd(r21,ewtabscale);
3034             ewitab           = _mm256_cvttpd_epi32(ewrt);
3035             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3036             ewitab           = _mm_slli_epi32(ewitab,2);
3037             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3038             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3039             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3040             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3041             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3042             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3043             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3044             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
3045             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
3046
3047             d                = _mm256_sub_pd(r21,rswitch);
3048             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3049             d2               = _mm256_mul_pd(d,d);
3050             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3051
3052             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3053
3054             /* Evaluate switch function */
3055             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3056             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
3057             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
3058
3059             fscal            = felec;
3060
3061             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3062
3063             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3064
3065             /* Calculate temporary vectorial force */
3066             tx               = _mm256_mul_pd(fscal,dx21);
3067             ty               = _mm256_mul_pd(fscal,dy21);
3068             tz               = _mm256_mul_pd(fscal,dz21);
3069
3070             /* Update vectorial force */
3071             fix2             = _mm256_add_pd(fix2,tx);
3072             fiy2             = _mm256_add_pd(fiy2,ty);
3073             fiz2             = _mm256_add_pd(fiz2,tz);
3074
3075             fjx1             = _mm256_add_pd(fjx1,tx);
3076             fjy1             = _mm256_add_pd(fjy1,ty);
3077             fjz1             = _mm256_add_pd(fjz1,tz);
3078
3079             }
3080
3081             /**************************
3082              * CALCULATE INTERACTIONS *
3083              **************************/
3084
3085             if (gmx_mm256_any_lt(rsq22,rcutoff2))
3086             {
3087
3088             r22              = _mm256_mul_pd(rsq22,rinv22);
3089             r22              = _mm256_andnot_pd(dummy_mask,r22);
3090
3091             /* EWALD ELECTROSTATICS */
3092
3093             /* Calculate Ewald table index by multiplying r with scale and truncating to integer */
3094             ewrt             = _mm256_mul_pd(r22,ewtabscale);
3095             ewitab           = _mm256_cvttpd_epi32(ewrt);
3096             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3097             ewitab           = _mm_slli_epi32(ewitab,2);
3098             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3099             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3100             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3101             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3102             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3103             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3104             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3105             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
3106             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
3107
3108             d                = _mm256_sub_pd(r22,rswitch);
3109             d                = _mm256_max_pd(d,_mm256_setzero_pd());
3110             d2               = _mm256_mul_pd(d,d);
3111             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3112
3113             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3114
3115             /* Evaluate switch function */
3116             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3117             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
3118             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
3119
3120             fscal            = felec;
3121
3122             fscal            = _mm256_and_pd(fscal,cutoff_mask);
3123
3124             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
3125
3126             /* Calculate temporary vectorial force */
3127             tx               = _mm256_mul_pd(fscal,dx22);
3128             ty               = _mm256_mul_pd(fscal,dy22);
3129             tz               = _mm256_mul_pd(fscal,dz22);
3130
3131             /* Update vectorial force */
3132             fix2             = _mm256_add_pd(fix2,tx);
3133             fiy2             = _mm256_add_pd(fiy2,ty);
3134             fiz2             = _mm256_add_pd(fiz2,tz);
3135
3136             fjx2             = _mm256_add_pd(fjx2,tx);
3137             fjy2             = _mm256_add_pd(fjy2,ty);
3138             fjz2             = _mm256_add_pd(fjz2,tz);
3139
3140             }
3141
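                 /* Lanes whose jnrlist entry is negative are padding in this last, partially
                  * filled j quadruplet; their force output is redirected to a scratch buffer
                  * rather than the real force array.
                  */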
3142             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3143             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3144             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3145             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3146
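                 /* Transpose the SIMD-accumulated fj vectors back to per-atom layout and
                  * subtract them from the three atoms of each j water (Newton's third law
                  * counterpart of the additions to fix/fiy/fiz above).
                  */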
3147             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
3148                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3149
3150             /* Inner loop uses 582 flops */
3151         }
3152
3153         /* End of innermost loop */
3154
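             /* Reduce the four SIMD lanes of the accumulated i-water forces, add them to
              * the force array at this water's offset, and accumulate the shift force.
              */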
3155         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3156                                                  f+i_coord_offset,fshift+i_shift_offset);
3157
3158         /* Increment number of inner iterations */
3159         inneriter                  += j_index_end - j_index_start;
3160
3161         /* Outer loop uses 18 flops */
3162     }
3163
3164     /* Increment number of outer iterations */
3165     outeriter        += nri;
3166
3167     /* Update outer/inner flops */
3168
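         /* Register the work under the water-water force-only kernel counter; these nrnb
          * counts feed the flop accounting that GROMACS reports at the end of the run.
          */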
3169     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*582);
3170 }