b92a3d2484d6831d941d9fc948e6b1e296f2350f
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_double
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82     real             scratch[4*DIM];
83     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84     real *           vdwioffsetptr0;
85     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86     real *           vdwioffsetptr1;
87     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88     real *           vdwioffsetptr2;
89     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
98     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
99     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
100     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
103     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
106     real             *charge;
107     int              nvdwtype;
108     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109     int              *vdwtype;
110     real             *vdwparam;
111     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
112     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
113     __m128i          ewitab;
114     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
115     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
116     real             *ewtab;
117     __m256d          dummy_mask,cutoff_mask;
118     __m128           tmpmask0,tmpmask1;
119     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
120     __m256d          one     = _mm256_set1_pd(1.0);
121     __m256d          two     = _mm256_set1_pd(2.0);
122     x                = xx[0];
123     f                = ff[0];
124
125     nri              = nlist->nri;
126     iinr             = nlist->iinr;
127     jindex           = nlist->jindex;
128     jjnr             = nlist->jjnr;
129     shiftidx         = nlist->shift;
130     gid              = nlist->gid;
131     shiftvec         = fr->shift_vec[0];
132     fshift           = fr->fshift[0];
133     facel            = _mm256_set1_pd(fr->epsfac);
134     charge           = mdatoms->chargeA;
135     nvdwtype         = fr->ntype;
136     vdwparam         = fr->nbfp;
137     vdwtype          = mdatoms->typeA;
138
139     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
140     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
141     beta2            = _mm256_mul_pd(beta,beta);
142     beta3            = _mm256_mul_pd(beta,beta2);
143
144     ewtab            = fr->ic->tabq_coul_FDV0;
145     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
146     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
147
148     /* Setup water-specific parameters */
149     inr              = nlist->iinr[0];
150     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
151     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
152     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
153     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
154
155     jq0              = _mm256_set1_pd(charge[inr+0]);
156     jq1              = _mm256_set1_pd(charge[inr+1]);
157     jq2              = _mm256_set1_pd(charge[inr+2]);
158     vdwjidx0A        = 2*vdwtype[inr+0];
159     qq00             = _mm256_mul_pd(iq0,jq0);
160     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
161     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
162     qq01             = _mm256_mul_pd(iq0,jq1);
163     qq02             = _mm256_mul_pd(iq0,jq2);
164     qq10             = _mm256_mul_pd(iq1,jq0);
165     qq11             = _mm256_mul_pd(iq1,jq1);
166     qq12             = _mm256_mul_pd(iq1,jq2);
167     qq20             = _mm256_mul_pd(iq2,jq0);
168     qq21             = _mm256_mul_pd(iq2,jq1);
169     qq22             = _mm256_mul_pd(iq2,jq2);
170
171     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
172     rcutoff_scalar   = fr->rcoulomb;
173     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
174     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
175
176     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
177     rvdw             = _mm256_set1_pd(fr->rvdw);
178
179     /* Avoid stupid compiler warnings */
180     jnrA = jnrB = jnrC = jnrD = 0;
181     j_coord_offsetA = 0;
182     j_coord_offsetB = 0;
183     j_coord_offsetC = 0;
184     j_coord_offsetD = 0;
185
186     outeriter        = 0;
187     inneriter        = 0;
188
189     for(iidx=0;iidx<4*DIM;iidx++)
190     {
191         scratch[iidx] = 0.0;
192     }
193
194     /* Start outer loop over neighborlists */
195     for(iidx=0; iidx<nri; iidx++)
196     {
197         /* Load shift vector for this list */
198         i_shift_offset   = DIM*shiftidx[iidx];
199
200         /* Load limits for loop over neighbors */
201         j_index_start    = jindex[iidx];
202         j_index_end      = jindex[iidx+1];
203
204         /* Get outer coordinate index */
205         inr              = iinr[iidx];
206         i_coord_offset   = DIM*inr;
207
208         /* Load i particle coords and add shift vector */
209         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
210                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
211
212         fix0             = _mm256_setzero_pd();
213         fiy0             = _mm256_setzero_pd();
214         fiz0             = _mm256_setzero_pd();
215         fix1             = _mm256_setzero_pd();
216         fiy1             = _mm256_setzero_pd();
217         fiz1             = _mm256_setzero_pd();
218         fix2             = _mm256_setzero_pd();
219         fiy2             = _mm256_setzero_pd();
220         fiz2             = _mm256_setzero_pd();
221
222         /* Reset potential sums */
223         velecsum         = _mm256_setzero_pd();
224         vvdwsum          = _mm256_setzero_pd();
225
226         /* Start inner kernel loop */
227         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
228         {
229
230             /* Get j neighbor index, and coordinate index */
231             jnrA             = jjnr[jidx];
232             jnrB             = jjnr[jidx+1];
233             jnrC             = jjnr[jidx+2];
234             jnrD             = jjnr[jidx+3];
235             j_coord_offsetA  = DIM*jnrA;
236             j_coord_offsetB  = DIM*jnrB;
237             j_coord_offsetC  = DIM*jnrC;
238             j_coord_offsetD  = DIM*jnrD;
239
240             /* load j atom coordinates */
241             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
242                                                  x+j_coord_offsetC,x+j_coord_offsetD,
243                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
244
245             /* Calculate displacement vector */
246             dx00             = _mm256_sub_pd(ix0,jx0);
247             dy00             = _mm256_sub_pd(iy0,jy0);
248             dz00             = _mm256_sub_pd(iz0,jz0);
249             dx01             = _mm256_sub_pd(ix0,jx1);
250             dy01             = _mm256_sub_pd(iy0,jy1);
251             dz01             = _mm256_sub_pd(iz0,jz1);
252             dx02             = _mm256_sub_pd(ix0,jx2);
253             dy02             = _mm256_sub_pd(iy0,jy2);
254             dz02             = _mm256_sub_pd(iz0,jz2);
255             dx10             = _mm256_sub_pd(ix1,jx0);
256             dy10             = _mm256_sub_pd(iy1,jy0);
257             dz10             = _mm256_sub_pd(iz1,jz0);
258             dx11             = _mm256_sub_pd(ix1,jx1);
259             dy11             = _mm256_sub_pd(iy1,jy1);
260             dz11             = _mm256_sub_pd(iz1,jz1);
261             dx12             = _mm256_sub_pd(ix1,jx2);
262             dy12             = _mm256_sub_pd(iy1,jy2);
263             dz12             = _mm256_sub_pd(iz1,jz2);
264             dx20             = _mm256_sub_pd(ix2,jx0);
265             dy20             = _mm256_sub_pd(iy2,jy0);
266             dz20             = _mm256_sub_pd(iz2,jz0);
267             dx21             = _mm256_sub_pd(ix2,jx1);
268             dy21             = _mm256_sub_pd(iy2,jy1);
269             dz21             = _mm256_sub_pd(iz2,jz1);
270             dx22             = _mm256_sub_pd(ix2,jx2);
271             dy22             = _mm256_sub_pd(iy2,jy2);
272             dz22             = _mm256_sub_pd(iz2,jz2);
273
274             /* Calculate squared distance and things based on it */
275             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
276             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
277             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
278             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
279             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
280             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
281             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
282             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
283             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
284
285             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
286             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
287             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
288             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
289             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
290             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
291             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
292             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
293             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
294
295             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
296             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
297             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
298             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
299             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
300             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
301             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
302             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
303             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
304
305             fjx0             = _mm256_setzero_pd();
306             fjy0             = _mm256_setzero_pd();
307             fjz0             = _mm256_setzero_pd();
308             fjx1             = _mm256_setzero_pd();
309             fjy1             = _mm256_setzero_pd();
310             fjz1             = _mm256_setzero_pd();
311             fjx2             = _mm256_setzero_pd();
312             fjy2             = _mm256_setzero_pd();
313             fjz2             = _mm256_setzero_pd();
314
315             /**************************
316              * CALCULATE INTERACTIONS *
317              **************************/
318
319             if (gmx_mm256_any_lt(rsq00,rcutoff2))
320             {
321
322             r00              = _mm256_mul_pd(rsq00,rinv00);
323
324             /* EWALD ELECTROSTATICS */
325
326             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
327             ewrt             = _mm256_mul_pd(r00,ewtabscale);
328             ewitab           = _mm256_cvttpd_epi32(ewrt);
329             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
330             ewitab           = _mm_slli_epi32(ewitab,2);
331             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
332             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
333             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
334             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
335             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
336             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
337             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
338             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
339             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
340
341             /* LENNARD-JONES DISPERSION/REPULSION */
342
343             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
344             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
345             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
346             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
347                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
348             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
349
350             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
351
352             /* Update potential sum for this i atom from the interaction with this j atom. */
353             velec            = _mm256_and_pd(velec,cutoff_mask);
354             velecsum         = _mm256_add_pd(velecsum,velec);
355             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
356             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
357
358             fscal            = _mm256_add_pd(felec,fvdw);
359
360             fscal            = _mm256_and_pd(fscal,cutoff_mask);
361
362             /* Calculate temporary vectorial force */
363             tx               = _mm256_mul_pd(fscal,dx00);
364             ty               = _mm256_mul_pd(fscal,dy00);
365             tz               = _mm256_mul_pd(fscal,dz00);
366
367             /* Update vectorial force */
368             fix0             = _mm256_add_pd(fix0,tx);
369             fiy0             = _mm256_add_pd(fiy0,ty);
370             fiz0             = _mm256_add_pd(fiz0,tz);
371
372             fjx0             = _mm256_add_pd(fjx0,tx);
373             fjy0             = _mm256_add_pd(fjy0,ty);
374             fjz0             = _mm256_add_pd(fjz0,tz);
375
376             }
377
378             /**************************
379              * CALCULATE INTERACTIONS *
380              **************************/
381
382             if (gmx_mm256_any_lt(rsq01,rcutoff2))
383             {
384
385             r01              = _mm256_mul_pd(rsq01,rinv01);
386
387             /* EWALD ELECTROSTATICS */
388
389             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
390             ewrt             = _mm256_mul_pd(r01,ewtabscale);
391             ewitab           = _mm256_cvttpd_epi32(ewrt);
392             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
393             ewitab           = _mm_slli_epi32(ewitab,2);
394             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
395             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
396             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
397             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
398             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
399             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
400             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
401             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
402             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
403
404             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
405
406             /* Update potential sum for this i atom from the interaction with this j atom. */
407             velec            = _mm256_and_pd(velec,cutoff_mask);
408             velecsum         = _mm256_add_pd(velecsum,velec);
409
410             fscal            = felec;
411
412             fscal            = _mm256_and_pd(fscal,cutoff_mask);
413
414             /* Calculate temporary vectorial force */
415             tx               = _mm256_mul_pd(fscal,dx01);
416             ty               = _mm256_mul_pd(fscal,dy01);
417             tz               = _mm256_mul_pd(fscal,dz01);
418
419             /* Update vectorial force */
420             fix0             = _mm256_add_pd(fix0,tx);
421             fiy0             = _mm256_add_pd(fiy0,ty);
422             fiz0             = _mm256_add_pd(fiz0,tz);
423
424             fjx1             = _mm256_add_pd(fjx1,tx);
425             fjy1             = _mm256_add_pd(fjy1,ty);
426             fjz1             = _mm256_add_pd(fjz1,tz);
427
428             }
429
430             /**************************
431              * CALCULATE INTERACTIONS *
432              **************************/
433
434             if (gmx_mm256_any_lt(rsq02,rcutoff2))
435             {
436
437             r02              = _mm256_mul_pd(rsq02,rinv02);
438
439             /* EWALD ELECTROSTATICS */
440
441             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
442             ewrt             = _mm256_mul_pd(r02,ewtabscale);
443             ewitab           = _mm256_cvttpd_epi32(ewrt);
444             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
445             ewitab           = _mm_slli_epi32(ewitab,2);
446             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
447             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
448             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
449             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
450             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
451             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
452             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
453             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
454             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
455
456             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
457
458             /* Update potential sum for this i atom from the interaction with this j atom. */
459             velec            = _mm256_and_pd(velec,cutoff_mask);
460             velecsum         = _mm256_add_pd(velecsum,velec);
461
462             fscal            = felec;
463
464             fscal            = _mm256_and_pd(fscal,cutoff_mask);
465
466             /* Calculate temporary vectorial force */
467             tx               = _mm256_mul_pd(fscal,dx02);
468             ty               = _mm256_mul_pd(fscal,dy02);
469             tz               = _mm256_mul_pd(fscal,dz02);
470
471             /* Update vectorial force */
472             fix0             = _mm256_add_pd(fix0,tx);
473             fiy0             = _mm256_add_pd(fiy0,ty);
474             fiz0             = _mm256_add_pd(fiz0,tz);
475
476             fjx2             = _mm256_add_pd(fjx2,tx);
477             fjy2             = _mm256_add_pd(fjy2,ty);
478             fjz2             = _mm256_add_pd(fjz2,tz);
479
480             }
481
482             /**************************
483              * CALCULATE INTERACTIONS *
484              **************************/
485
486             if (gmx_mm256_any_lt(rsq10,rcutoff2))
487             {
488
489             r10              = _mm256_mul_pd(rsq10,rinv10);
490
491             /* EWALD ELECTROSTATICS */
492
493             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
494             ewrt             = _mm256_mul_pd(r10,ewtabscale);
495             ewitab           = _mm256_cvttpd_epi32(ewrt);
496             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
497             ewitab           = _mm_slli_epi32(ewitab,2);
498             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
499             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
500             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
501             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
502             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
503             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
504             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
505             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
506             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
507
508             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
509
510             /* Update potential sum for this i atom from the interaction with this j atom. */
511             velec            = _mm256_and_pd(velec,cutoff_mask);
512             velecsum         = _mm256_add_pd(velecsum,velec);
513
514             fscal            = felec;
515
516             fscal            = _mm256_and_pd(fscal,cutoff_mask);
517
518             /* Calculate temporary vectorial force */
519             tx               = _mm256_mul_pd(fscal,dx10);
520             ty               = _mm256_mul_pd(fscal,dy10);
521             tz               = _mm256_mul_pd(fscal,dz10);
522
523             /* Update vectorial force */
524             fix1             = _mm256_add_pd(fix1,tx);
525             fiy1             = _mm256_add_pd(fiy1,ty);
526             fiz1             = _mm256_add_pd(fiz1,tz);
527
528             fjx0             = _mm256_add_pd(fjx0,tx);
529             fjy0             = _mm256_add_pd(fjy0,ty);
530             fjz0             = _mm256_add_pd(fjz0,tz);
531
532             }
533
534             /**************************
535              * CALCULATE INTERACTIONS *
536              **************************/
537
538             if (gmx_mm256_any_lt(rsq11,rcutoff2))
539             {
540
541             r11              = _mm256_mul_pd(rsq11,rinv11);
542
543             /* EWALD ELECTROSTATICS */
544
545             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
546             ewrt             = _mm256_mul_pd(r11,ewtabscale);
547             ewitab           = _mm256_cvttpd_epi32(ewrt);
548             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
549             ewitab           = _mm_slli_epi32(ewitab,2);
550             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
551             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
552             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
553             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
554             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
555             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
556             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
557             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
558             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
559
560             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
561
562             /* Update potential sum for this i atom from the interaction with this j atom. */
563             velec            = _mm256_and_pd(velec,cutoff_mask);
564             velecsum         = _mm256_add_pd(velecsum,velec);
565
566             fscal            = felec;
567
568             fscal            = _mm256_and_pd(fscal,cutoff_mask);
569
570             /* Calculate temporary vectorial force */
571             tx               = _mm256_mul_pd(fscal,dx11);
572             ty               = _mm256_mul_pd(fscal,dy11);
573             tz               = _mm256_mul_pd(fscal,dz11);
574
575             /* Update vectorial force */
576             fix1             = _mm256_add_pd(fix1,tx);
577             fiy1             = _mm256_add_pd(fiy1,ty);
578             fiz1             = _mm256_add_pd(fiz1,tz);
579
580             fjx1             = _mm256_add_pd(fjx1,tx);
581             fjy1             = _mm256_add_pd(fjy1,ty);
582             fjz1             = _mm256_add_pd(fjz1,tz);
583
584             }
585
586             /**************************
587              * CALCULATE INTERACTIONS *
588              **************************/
589
590             if (gmx_mm256_any_lt(rsq12,rcutoff2))
591             {
592
593             r12              = _mm256_mul_pd(rsq12,rinv12);
594
595             /* EWALD ELECTROSTATICS */
596
597             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
598             ewrt             = _mm256_mul_pd(r12,ewtabscale);
599             ewitab           = _mm256_cvttpd_epi32(ewrt);
600             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
601             ewitab           = _mm_slli_epi32(ewitab,2);
602             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
603             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
604             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
605             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
606             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
607             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
608             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
609             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
610             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
611
612             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
613
614             /* Update potential sum for this i atom from the interaction with this j atom. */
615             velec            = _mm256_and_pd(velec,cutoff_mask);
616             velecsum         = _mm256_add_pd(velecsum,velec);
617
618             fscal            = felec;
619
620             fscal            = _mm256_and_pd(fscal,cutoff_mask);
621
622             /* Calculate temporary vectorial force */
623             tx               = _mm256_mul_pd(fscal,dx12);
624             ty               = _mm256_mul_pd(fscal,dy12);
625             tz               = _mm256_mul_pd(fscal,dz12);
626
627             /* Update vectorial force */
628             fix1             = _mm256_add_pd(fix1,tx);
629             fiy1             = _mm256_add_pd(fiy1,ty);
630             fiz1             = _mm256_add_pd(fiz1,tz);
631
632             fjx2             = _mm256_add_pd(fjx2,tx);
633             fjy2             = _mm256_add_pd(fjy2,ty);
634             fjz2             = _mm256_add_pd(fjz2,tz);
635
636             }
637
638             /**************************
639              * CALCULATE INTERACTIONS *
640              **************************/
641
642             if (gmx_mm256_any_lt(rsq20,rcutoff2))
643             {
644
645             r20              = _mm256_mul_pd(rsq20,rinv20);
646
647             /* EWALD ELECTROSTATICS */
648
649             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
650             ewrt             = _mm256_mul_pd(r20,ewtabscale);
651             ewitab           = _mm256_cvttpd_epi32(ewrt);
652             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
653             ewitab           = _mm_slli_epi32(ewitab,2);
654             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
655             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
656             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
657             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
658             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
659             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
660             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
661             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
662             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
663
664             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
665
666             /* Update potential sum for this i atom from the interaction with this j atom. */
667             velec            = _mm256_and_pd(velec,cutoff_mask);
668             velecsum         = _mm256_add_pd(velecsum,velec);
669
670             fscal            = felec;
671
672             fscal            = _mm256_and_pd(fscal,cutoff_mask);
673
674             /* Calculate temporary vectorial force */
675             tx               = _mm256_mul_pd(fscal,dx20);
676             ty               = _mm256_mul_pd(fscal,dy20);
677             tz               = _mm256_mul_pd(fscal,dz20);
678
679             /* Update vectorial force */
680             fix2             = _mm256_add_pd(fix2,tx);
681             fiy2             = _mm256_add_pd(fiy2,ty);
682             fiz2             = _mm256_add_pd(fiz2,tz);
683
684             fjx0             = _mm256_add_pd(fjx0,tx);
685             fjy0             = _mm256_add_pd(fjy0,ty);
686             fjz0             = _mm256_add_pd(fjz0,tz);
687
688             }
689
690             /**************************
691              * CALCULATE INTERACTIONS *
692              **************************/
693
694             if (gmx_mm256_any_lt(rsq21,rcutoff2))
695             {
696
697             r21              = _mm256_mul_pd(rsq21,rinv21);
698
699             /* EWALD ELECTROSTATICS */
700
701             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
702             ewrt             = _mm256_mul_pd(r21,ewtabscale);
703             ewitab           = _mm256_cvttpd_epi32(ewrt);
704             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
705             ewitab           = _mm_slli_epi32(ewitab,2);
706             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
707             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
708             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
709             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
710             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
711             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
712             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
713             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
714             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
715
716             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
717
718             /* Update potential sum for this i atom from the interaction with this j atom. */
719             velec            = _mm256_and_pd(velec,cutoff_mask);
720             velecsum         = _mm256_add_pd(velecsum,velec);
721
722             fscal            = felec;
723
724             fscal            = _mm256_and_pd(fscal,cutoff_mask);
725
726             /* Calculate temporary vectorial force */
727             tx               = _mm256_mul_pd(fscal,dx21);
728             ty               = _mm256_mul_pd(fscal,dy21);
729             tz               = _mm256_mul_pd(fscal,dz21);
730
731             /* Update vectorial force */
732             fix2             = _mm256_add_pd(fix2,tx);
733             fiy2             = _mm256_add_pd(fiy2,ty);
734             fiz2             = _mm256_add_pd(fiz2,tz);
735
736             fjx1             = _mm256_add_pd(fjx1,tx);
737             fjy1             = _mm256_add_pd(fjy1,ty);
738             fjz1             = _mm256_add_pd(fjz1,tz);
739
740             }
741
742             /**************************
743              * CALCULATE INTERACTIONS *
744              **************************/
745
746             if (gmx_mm256_any_lt(rsq22,rcutoff2))
747             {
748
749             r22              = _mm256_mul_pd(rsq22,rinv22);
750
751             /* EWALD ELECTROSTATICS */
752
753             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
754             ewrt             = _mm256_mul_pd(r22,ewtabscale);
755             ewitab           = _mm256_cvttpd_epi32(ewrt);
756             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
757             ewitab           = _mm_slli_epi32(ewitab,2);
758             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
759             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
760             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
761             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
762             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
763             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
764             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
765             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
766             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
767
768             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
769
770             /* Update potential sum for this i atom from the interaction with this j atom. */
771             velec            = _mm256_and_pd(velec,cutoff_mask);
772             velecsum         = _mm256_add_pd(velecsum,velec);
773
774             fscal            = felec;
775
776             fscal            = _mm256_and_pd(fscal,cutoff_mask);
777
778             /* Calculate temporary vectorial force */
779             tx               = _mm256_mul_pd(fscal,dx22);
780             ty               = _mm256_mul_pd(fscal,dy22);
781             tz               = _mm256_mul_pd(fscal,dz22);
782
783             /* Update vectorial force */
784             fix2             = _mm256_add_pd(fix2,tx);
785             fiy2             = _mm256_add_pd(fiy2,ty);
786             fiz2             = _mm256_add_pd(fiz2,tz);
787
788             fjx2             = _mm256_add_pd(fjx2,tx);
789             fjy2             = _mm256_add_pd(fjy2,ty);
790             fjz2             = _mm256_add_pd(fjz2,tz);
791
792             }
793
794             fjptrA             = f+j_coord_offsetA;
795             fjptrB             = f+j_coord_offsetB;
796             fjptrC             = f+j_coord_offsetC;
797             fjptrD             = f+j_coord_offsetD;
798
799             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
800                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
801
802             /* Inner loop uses 432 flops */
803         }
804
805         if(jidx<j_index_end)
806         {
807
808             /* Get j neighbor index, and coordinate index */
809             jnrlistA         = jjnr[jidx];
810             jnrlistB         = jjnr[jidx+1];
811             jnrlistC         = jjnr[jidx+2];
812             jnrlistD         = jjnr[jidx+3];
813             /* Sign of each element will be negative for non-real atoms.
814              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
815              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
816              */
817             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
818
819             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
820             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
821             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
822
823             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
824             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
825             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
826             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
827             j_coord_offsetA  = DIM*jnrA;
828             j_coord_offsetB  = DIM*jnrB;
829             j_coord_offsetC  = DIM*jnrC;
830             j_coord_offsetD  = DIM*jnrD;
831
832             /* load j atom coordinates */
833             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
834                                                  x+j_coord_offsetC,x+j_coord_offsetD,
835                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
836
837             /* Calculate displacement vector */
838             dx00             = _mm256_sub_pd(ix0,jx0);
839             dy00             = _mm256_sub_pd(iy0,jy0);
840             dz00             = _mm256_sub_pd(iz0,jz0);
841             dx01             = _mm256_sub_pd(ix0,jx1);
842             dy01             = _mm256_sub_pd(iy0,jy1);
843             dz01             = _mm256_sub_pd(iz0,jz1);
844             dx02             = _mm256_sub_pd(ix0,jx2);
845             dy02             = _mm256_sub_pd(iy0,jy2);
846             dz02             = _mm256_sub_pd(iz0,jz2);
847             dx10             = _mm256_sub_pd(ix1,jx0);
848             dy10             = _mm256_sub_pd(iy1,jy0);
849             dz10             = _mm256_sub_pd(iz1,jz0);
850             dx11             = _mm256_sub_pd(ix1,jx1);
851             dy11             = _mm256_sub_pd(iy1,jy1);
852             dz11             = _mm256_sub_pd(iz1,jz1);
853             dx12             = _mm256_sub_pd(ix1,jx2);
854             dy12             = _mm256_sub_pd(iy1,jy2);
855             dz12             = _mm256_sub_pd(iz1,jz2);
856             dx20             = _mm256_sub_pd(ix2,jx0);
857             dy20             = _mm256_sub_pd(iy2,jy0);
858             dz20             = _mm256_sub_pd(iz2,jz0);
859             dx21             = _mm256_sub_pd(ix2,jx1);
860             dy21             = _mm256_sub_pd(iy2,jy1);
861             dz21             = _mm256_sub_pd(iz2,jz1);
862             dx22             = _mm256_sub_pd(ix2,jx2);
863             dy22             = _mm256_sub_pd(iy2,jy2);
864             dz22             = _mm256_sub_pd(iz2,jz2);
865
866             /* Calculate squared distance and things based on it */
867             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
868             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
869             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
870             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
871             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
872             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
873             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
874             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
875             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
876
877             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
878             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
879             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
880             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
881             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
882             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
883             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
884             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
885             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
886
887             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
888             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
889             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
890             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
891             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
892             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
893             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
894             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
895             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
896
897             fjx0             = _mm256_setzero_pd();
898             fjy0             = _mm256_setzero_pd();
899             fjz0             = _mm256_setzero_pd();
900             fjx1             = _mm256_setzero_pd();
901             fjy1             = _mm256_setzero_pd();
902             fjz1             = _mm256_setzero_pd();
903             fjx2             = _mm256_setzero_pd();
904             fjy2             = _mm256_setzero_pd();
905             fjz2             = _mm256_setzero_pd();
906
907             /**************************
908              * CALCULATE INTERACTIONS *
909              **************************/
910
911             if (gmx_mm256_any_lt(rsq00,rcutoff2))
912             {
913
914             r00              = _mm256_mul_pd(rsq00,rinv00);
915             r00              = _mm256_andnot_pd(dummy_mask,r00);
916
917             /* EWALD ELECTROSTATICS */
918
919             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
920             ewrt             = _mm256_mul_pd(r00,ewtabscale);
921             ewitab           = _mm256_cvttpd_epi32(ewrt);
922             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
923             ewitab           = _mm_slli_epi32(ewitab,2);
924             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
925             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
926             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
927             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
928             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
929             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
930             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
931             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
932             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
933
934             /* LENNARD-JONES DISPERSION/REPULSION */
935
936             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
937             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
938             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
939             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
940                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
941             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
942
943             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
944
945             /* Update potential sum for this i atom from the interaction with this j atom. */
946             velec            = _mm256_and_pd(velec,cutoff_mask);
947             velec            = _mm256_andnot_pd(dummy_mask,velec);
948             velecsum         = _mm256_add_pd(velecsum,velec);
949             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
950             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
951             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
952
953             fscal            = _mm256_add_pd(felec,fvdw);
954
955             fscal            = _mm256_and_pd(fscal,cutoff_mask);
956
957             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
958
959             /* Calculate temporary vectorial force */
960             tx               = _mm256_mul_pd(fscal,dx00);
961             ty               = _mm256_mul_pd(fscal,dy00);
962             tz               = _mm256_mul_pd(fscal,dz00);
963
964             /* Update vectorial force */
965             fix0             = _mm256_add_pd(fix0,tx);
966             fiy0             = _mm256_add_pd(fiy0,ty);
967             fiz0             = _mm256_add_pd(fiz0,tz);
968
969             fjx0             = _mm256_add_pd(fjx0,tx);
970             fjy0             = _mm256_add_pd(fjy0,ty);
971             fjz0             = _mm256_add_pd(fjz0,tz);
972
973             }
974
975             /**************************
976              * CALCULATE INTERACTIONS *
977              **************************/
978
979             if (gmx_mm256_any_lt(rsq01,rcutoff2))
980             {
981
982             r01              = _mm256_mul_pd(rsq01,rinv01);
983             r01              = _mm256_andnot_pd(dummy_mask,r01);
984
985             /* EWALD ELECTROSTATICS */
986
987             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
988             ewrt             = _mm256_mul_pd(r01,ewtabscale);
989             ewitab           = _mm256_cvttpd_epi32(ewrt);
990             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
991             ewitab           = _mm_slli_epi32(ewitab,2);
992             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
993             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
994             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
995             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
996             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
997             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
998             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
999             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
1000             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1001
1002             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1003
1004             /* Update potential sum for this i atom from the interaction with this j atom. */
1005             velec            = _mm256_and_pd(velec,cutoff_mask);
1006             velec            = _mm256_andnot_pd(dummy_mask,velec);
1007             velecsum         = _mm256_add_pd(velecsum,velec);
1008
1009             fscal            = felec;
1010
1011             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1012
1013             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1014
1015             /* Calculate temporary vectorial force */
1016             tx               = _mm256_mul_pd(fscal,dx01);
1017             ty               = _mm256_mul_pd(fscal,dy01);
1018             tz               = _mm256_mul_pd(fscal,dz01);
1019
1020             /* Update vectorial force */
1021             fix0             = _mm256_add_pd(fix0,tx);
1022             fiy0             = _mm256_add_pd(fiy0,ty);
1023             fiz0             = _mm256_add_pd(fiz0,tz);
1024
1025             fjx1             = _mm256_add_pd(fjx1,tx);
1026             fjy1             = _mm256_add_pd(fjy1,ty);
1027             fjz1             = _mm256_add_pd(fjz1,tz);
1028
1029             }
1030
1031             /**************************
1032              * CALCULATE INTERACTIONS *
1033              **************************/
1034
1035             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1036             {
1037
1038             r02              = _mm256_mul_pd(rsq02,rinv02);
1039             r02              = _mm256_andnot_pd(dummy_mask,r02);
1040
1041             /* EWALD ELECTROSTATICS */
1042
1043             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1044             ewrt             = _mm256_mul_pd(r02,ewtabscale);
1045             ewitab           = _mm256_cvttpd_epi32(ewrt);
1046             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1047             ewitab           = _mm_slli_epi32(ewitab,2);
1048             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1049             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1050             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1051             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1052             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1053             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1054             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1055             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
1056             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1057
1058             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1059
1060             /* Update potential sum for this i atom from the interaction with this j atom. */
1061             velec            = _mm256_and_pd(velec,cutoff_mask);
1062             velec            = _mm256_andnot_pd(dummy_mask,velec);
1063             velecsum         = _mm256_add_pd(velecsum,velec);
1064
1065             fscal            = felec;
1066
1067             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1068
1069             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1070
1071             /* Calculate temporary vectorial force */
1072             tx               = _mm256_mul_pd(fscal,dx02);
1073             ty               = _mm256_mul_pd(fscal,dy02);
1074             tz               = _mm256_mul_pd(fscal,dz02);
1075
1076             /* Update vectorial force */
1077             fix0             = _mm256_add_pd(fix0,tx);
1078             fiy0             = _mm256_add_pd(fiy0,ty);
1079             fiz0             = _mm256_add_pd(fiz0,tz);
1080
1081             fjx2             = _mm256_add_pd(fjx2,tx);
1082             fjy2             = _mm256_add_pd(fjy2,ty);
1083             fjz2             = _mm256_add_pd(fjz2,tz);
1084
1085             }
1086
1087             /**************************
1088              * CALCULATE INTERACTIONS *
1089              **************************/
1090
1091             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1092             {
1093
1094             r10              = _mm256_mul_pd(rsq10,rinv10);
1095             r10              = _mm256_andnot_pd(dummy_mask,r10);
1096
1097             /* EWALD ELECTROSTATICS */
1098
1099             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1100             ewrt             = _mm256_mul_pd(r10,ewtabscale);
1101             ewitab           = _mm256_cvttpd_epi32(ewrt);
1102             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1103             ewitab           = _mm_slli_epi32(ewitab,2);
1104             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1105             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1106             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1107             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1108             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1109             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1110             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1111             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
1112             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1113
1114             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1115
1116             /* Update potential sum for this i atom from the interaction with this j atom. */
1117             velec            = _mm256_and_pd(velec,cutoff_mask);
1118             velec            = _mm256_andnot_pd(dummy_mask,velec);
1119             velecsum         = _mm256_add_pd(velecsum,velec);
1120
1121             fscal            = felec;
1122
1123             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1124
1125             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1126
1127             /* Calculate temporary vectorial force */
1128             tx               = _mm256_mul_pd(fscal,dx10);
1129             ty               = _mm256_mul_pd(fscal,dy10);
1130             tz               = _mm256_mul_pd(fscal,dz10);
1131
1132             /* Update vectorial force */
1133             fix1             = _mm256_add_pd(fix1,tx);
1134             fiy1             = _mm256_add_pd(fiy1,ty);
1135             fiz1             = _mm256_add_pd(fiz1,tz);
1136
1137             fjx0             = _mm256_add_pd(fjx0,tx);
1138             fjy0             = _mm256_add_pd(fjy0,ty);
1139             fjz0             = _mm256_add_pd(fjz0,tz);
1140
1141             }
1142
1143             /**************************
1144              * CALCULATE INTERACTIONS *
1145              **************************/
1146
1147             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1148             {
1149
1150             r11              = _mm256_mul_pd(rsq11,rinv11);
1151             r11              = _mm256_andnot_pd(dummy_mask,r11);
1152
1153             /* EWALD ELECTROSTATICS */
1154
1155             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1156             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1157             ewitab           = _mm256_cvttpd_epi32(ewrt);
1158             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1159             ewitab           = _mm_slli_epi32(ewitab,2);
1160             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1161             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1162             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1163             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1164             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1165             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1166             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1167             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
1168             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1169
1170             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1171
1172             /* Update potential sum for this i atom from the interaction with this j atom. */
1173             velec            = _mm256_and_pd(velec,cutoff_mask);
1174             velec            = _mm256_andnot_pd(dummy_mask,velec);
1175             velecsum         = _mm256_add_pd(velecsum,velec);
1176
1177             fscal            = felec;
1178
1179             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1180
1181             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1182
1183             /* Calculate temporary vectorial force */
1184             tx               = _mm256_mul_pd(fscal,dx11);
1185             ty               = _mm256_mul_pd(fscal,dy11);
1186             tz               = _mm256_mul_pd(fscal,dz11);
1187
1188             /* Update vectorial force */
1189             fix1             = _mm256_add_pd(fix1,tx);
1190             fiy1             = _mm256_add_pd(fiy1,ty);
1191             fiz1             = _mm256_add_pd(fiz1,tz);
1192
1193             fjx1             = _mm256_add_pd(fjx1,tx);
1194             fjy1             = _mm256_add_pd(fjy1,ty);
1195             fjz1             = _mm256_add_pd(fjz1,tz);
1196
1197             }
1198
1199             /**************************
1200              * CALCULATE INTERACTIONS *
1201              **************************/
1202
1203             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1204             {
1205
1206             r12              = _mm256_mul_pd(rsq12,rinv12);
1207             r12              = _mm256_andnot_pd(dummy_mask,r12);
1208
1209             /* EWALD ELECTROSTATICS */
1210
1211             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1212             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1213             ewitab           = _mm256_cvttpd_epi32(ewrt);
1214             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1215             ewitab           = _mm_slli_epi32(ewitab,2);
1216             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1217             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1218             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1219             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1220             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1221             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1222             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1223             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
1224             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1225
1226             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1227
1228             /* Update potential sum for this i atom from the interaction with this j atom. */
1229             velec            = _mm256_and_pd(velec,cutoff_mask);
1230             velec            = _mm256_andnot_pd(dummy_mask,velec);
1231             velecsum         = _mm256_add_pd(velecsum,velec);
1232
1233             fscal            = felec;
1234
1235             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1236
1237             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1238
1239             /* Calculate temporary vectorial force */
1240             tx               = _mm256_mul_pd(fscal,dx12);
1241             ty               = _mm256_mul_pd(fscal,dy12);
1242             tz               = _mm256_mul_pd(fscal,dz12);
1243
1244             /* Update vectorial force */
1245             fix1             = _mm256_add_pd(fix1,tx);
1246             fiy1             = _mm256_add_pd(fiy1,ty);
1247             fiz1             = _mm256_add_pd(fiz1,tz);
1248
1249             fjx2             = _mm256_add_pd(fjx2,tx);
1250             fjy2             = _mm256_add_pd(fjy2,ty);
1251             fjz2             = _mm256_add_pd(fjz2,tz);
1252
1253             }
1254
1255             /**************************
1256              * CALCULATE INTERACTIONS *
1257              **************************/
1258
1259             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1260             {
1261
1262             r20              = _mm256_mul_pd(rsq20,rinv20);
1263             r20              = _mm256_andnot_pd(dummy_mask,r20);
1264
1265             /* EWALD ELECTROSTATICS */
1266
1267             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1268             ewrt             = _mm256_mul_pd(r20,ewtabscale);
1269             ewitab           = _mm256_cvttpd_epi32(ewrt);
1270             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1271             ewitab           = _mm_slli_epi32(ewitab,2);
1272             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1273             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1274             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1275             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1276             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1277             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1278             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1279             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
1280             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1281
1282             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1283
1284             /* Update potential sum for this i atom from the interaction with this j atom. */
1285             velec            = _mm256_and_pd(velec,cutoff_mask);
1286             velec            = _mm256_andnot_pd(dummy_mask,velec);
1287             velecsum         = _mm256_add_pd(velecsum,velec);
1288
1289             fscal            = felec;
1290
1291             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1292
1293             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1294
1295             /* Calculate temporary vectorial force */
1296             tx               = _mm256_mul_pd(fscal,dx20);
1297             ty               = _mm256_mul_pd(fscal,dy20);
1298             tz               = _mm256_mul_pd(fscal,dz20);
1299
1300             /* Update vectorial force */
1301             fix2             = _mm256_add_pd(fix2,tx);
1302             fiy2             = _mm256_add_pd(fiy2,ty);
1303             fiz2             = _mm256_add_pd(fiz2,tz);
1304
1305             fjx0             = _mm256_add_pd(fjx0,tx);
1306             fjy0             = _mm256_add_pd(fjy0,ty);
1307             fjz0             = _mm256_add_pd(fjz0,tz);
1308
1309             }
1310
1311             /**************************
1312              * CALCULATE INTERACTIONS *
1313              **************************/
1314
1315             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1316             {
1317
1318             r21              = _mm256_mul_pd(rsq21,rinv21);
1319             r21              = _mm256_andnot_pd(dummy_mask,r21);
1320
1321             /* EWALD ELECTROSTATICS */
1322
1323             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1324             ewrt             = _mm256_mul_pd(r21,ewtabscale);
1325             ewitab           = _mm256_cvttpd_epi32(ewrt);
1326             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1327             ewitab           = _mm_slli_epi32(ewitab,2);
1328             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1329             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1330             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1331             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1332             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1333             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1334             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1335             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
1336             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1337
1338             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1339
1340             /* Update potential sum for this i atom from the interaction with this j atom. */
1341             velec            = _mm256_and_pd(velec,cutoff_mask);
1342             velec            = _mm256_andnot_pd(dummy_mask,velec);
1343             velecsum         = _mm256_add_pd(velecsum,velec);
1344
1345             fscal            = felec;
1346
1347             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1348
1349             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1350
1351             /* Calculate temporary vectorial force */
1352             tx               = _mm256_mul_pd(fscal,dx21);
1353             ty               = _mm256_mul_pd(fscal,dy21);
1354             tz               = _mm256_mul_pd(fscal,dz21);
1355
1356             /* Update vectorial force */
1357             fix2             = _mm256_add_pd(fix2,tx);
1358             fiy2             = _mm256_add_pd(fiy2,ty);
1359             fiz2             = _mm256_add_pd(fiz2,tz);
1360
1361             fjx1             = _mm256_add_pd(fjx1,tx);
1362             fjy1             = _mm256_add_pd(fjy1,ty);
1363             fjz1             = _mm256_add_pd(fjz1,tz);
1364
1365             }
1366
1367             /**************************
1368              * CALCULATE INTERACTIONS *
1369              **************************/
1370
1371             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1372             {
1373
1374             r22              = _mm256_mul_pd(rsq22,rinv22);
1375             r22              = _mm256_andnot_pd(dummy_mask,r22);
1376
1377             /* EWALD ELECTROSTATICS */
1378
1379             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1380             ewrt             = _mm256_mul_pd(r22,ewtabscale);
1381             ewitab           = _mm256_cvttpd_epi32(ewrt);
1382             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1383             ewitab           = _mm_slli_epi32(ewitab,2);
1384             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1385             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1386             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1387             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1388             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1389             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1390             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1391             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
1392             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1393
1394             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1395
1396             /* Update potential sum for this i atom from the interaction with this j atom. */
1397             velec            = _mm256_and_pd(velec,cutoff_mask);
1398             velec            = _mm256_andnot_pd(dummy_mask,velec);
1399             velecsum         = _mm256_add_pd(velecsum,velec);
1400
1401             fscal            = felec;
1402
1403             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1404
1405             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1406
1407             /* Calculate temporary vectorial force */
1408             tx               = _mm256_mul_pd(fscal,dx22);
1409             ty               = _mm256_mul_pd(fscal,dy22);
1410             tz               = _mm256_mul_pd(fscal,dz22);
1411
1412             /* Update vectorial force */
1413             fix2             = _mm256_add_pd(fix2,tx);
1414             fiy2             = _mm256_add_pd(fiy2,ty);
1415             fiz2             = _mm256_add_pd(fiz2,tz);
1416
1417             fjx2             = _mm256_add_pd(fjx2,tx);
1418             fjy2             = _mm256_add_pd(fjy2,ty);
1419             fjz2             = _mm256_add_pd(fjz2,tz);
1420
1421             }
1422
1423             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1424             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1425             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1426             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1427
1428             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1429                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1430
1431             /* Inner loop uses 441 flops */
1432         }
1433
1434         /* End of innermost loop */
1435
1436         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1437                                                  f+i_coord_offset,fshift+i_shift_offset);
1438
1439         ggid                        = gid[iidx];
1440         /* Update potential energies */
1441         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1442         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1443
1444         /* Increment number of inner iterations */
1445         inneriter                  += j_index_end - j_index_start;
1446
1447         /* Outer loop uses 20 flops */
1448     }
1449
1450     /* Increment number of outer iterations */
1451     outeriter        += nri;
1452
1453     /* Update outer/inner flops */
1454
1455     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*441);
1456 }
1457 /*
1458  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_double
1459  * Electrostatics interaction: Ewald
1460  * VdW interaction:            LennardJones
1461  * Geometry:                   Water3-Water3
1462  * Calculate force/pot:        Force
1463  */
1464 void
1465 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_double
1466                     (t_nblist                    * gmx_restrict       nlist,
1467                      rvec                        * gmx_restrict          xx,
1468                      rvec                        * gmx_restrict          ff,
1469                      t_forcerec                  * gmx_restrict          fr,
1470                      t_mdatoms                   * gmx_restrict     mdatoms,
1471                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1472                      t_nrnb                      * gmx_restrict        nrnb)
1473 {
1474     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1475      * just 0 for non-waters.
1476      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1477      * jnr indices corresponding to data put in the four positions in the SIMD register.
1478      */
1479     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1480     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1481     int              jnrA,jnrB,jnrC,jnrD;
1482     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1483     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1484     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1485     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1486     real             rcutoff_scalar;
1487     real             *shiftvec,*fshift,*x,*f;
1488     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1489     real             scratch[4*DIM];
1490     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1491     real *           vdwioffsetptr0;
1492     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1493     real *           vdwioffsetptr1;
1494     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1495     real *           vdwioffsetptr2;
1496     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1497     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1498     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1499     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1500     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1501     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1502     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1503     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1504     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1505     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1506     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1507     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1508     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1509     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1510     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1511     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1512     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1513     real             *charge;
1514     int              nvdwtype;
1515     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1516     int              *vdwtype;
1517     real             *vdwparam;
1518     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1519     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1520     __m128i          ewitab;
1521     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1522     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1523     real             *ewtab;
1524     __m256d          dummy_mask,cutoff_mask;
1525     __m128           tmpmask0,tmpmask1;
1526     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1527     __m256d          one     = _mm256_set1_pd(1.0);
1528     __m256d          two     = _mm256_set1_pd(2.0);
1529     x                = xx[0];
1530     f                = ff[0];
1531
1532     nri              = nlist->nri;
1533     iinr             = nlist->iinr;
1534     jindex           = nlist->jindex;
1535     jjnr             = nlist->jjnr;
1536     shiftidx         = nlist->shift;
1537     gid              = nlist->gid;
1538     shiftvec         = fr->shift_vec[0];
1539     fshift           = fr->fshift[0];
1540     facel            = _mm256_set1_pd(fr->epsfac);
1541     charge           = mdatoms->chargeA;
1542     nvdwtype         = fr->ntype;
1543     vdwparam         = fr->nbfp;
1544     vdwtype          = mdatoms->typeA;
1545
1546     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
1547     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1548     beta2            = _mm256_mul_pd(beta,beta);
1549     beta3            = _mm256_mul_pd(beta,beta2);
1550
1551     ewtab            = fr->ic->tabq_coul_F;
1552     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
1553     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1554
1555     /* Setup water-specific parameters */
1556     inr              = nlist->iinr[0];
1557     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1558     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1559     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1560     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1561
1562     jq0              = _mm256_set1_pd(charge[inr+0]);
1563     jq1              = _mm256_set1_pd(charge[inr+1]);
1564     jq2              = _mm256_set1_pd(charge[inr+2]);
1565     vdwjidx0A        = 2*vdwtype[inr+0];
1566     qq00             = _mm256_mul_pd(iq0,jq0);
1567     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1568     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1569     qq01             = _mm256_mul_pd(iq0,jq1);
1570     qq02             = _mm256_mul_pd(iq0,jq2);
1571     qq10             = _mm256_mul_pd(iq1,jq0);
1572     qq11             = _mm256_mul_pd(iq1,jq1);
1573     qq12             = _mm256_mul_pd(iq1,jq2);
1574     qq20             = _mm256_mul_pd(iq2,jq0);
1575     qq21             = _mm256_mul_pd(iq2,jq1);
1576     qq22             = _mm256_mul_pd(iq2,jq2);
1577
1578     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1579     rcutoff_scalar   = fr->rcoulomb;
1580     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1581     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1582
1583     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
1584     rvdw             = _mm256_set1_pd(fr->rvdw);
1585
1586     /* Avoid stupid compiler warnings */
1587     jnrA = jnrB = jnrC = jnrD = 0;
1588     j_coord_offsetA = 0;
1589     j_coord_offsetB = 0;
1590     j_coord_offsetC = 0;
1591     j_coord_offsetD = 0;
1592
1593     outeriter        = 0;
1594     inneriter        = 0;
1595
1596     for(iidx=0;iidx<4*DIM;iidx++)
1597     {
1598         scratch[iidx] = 0.0;
1599     }
1600
1601     /* Start outer loop over neighborlists */
1602     for(iidx=0; iidx<nri; iidx++)
1603     {
1604         /* Load shift vector for this list */
1605         i_shift_offset   = DIM*shiftidx[iidx];
1606
1607         /* Load limits for loop over neighbors */
1608         j_index_start    = jindex[iidx];
1609         j_index_end      = jindex[iidx+1];
1610
1611         /* Get outer coordinate index */
1612         inr              = iinr[iidx];
1613         i_coord_offset   = DIM*inr;
1614
1615         /* Load i particle coords and add shift vector */
1616         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1617                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1618
1619         fix0             = _mm256_setzero_pd();
1620         fiy0             = _mm256_setzero_pd();
1621         fiz0             = _mm256_setzero_pd();
1622         fix1             = _mm256_setzero_pd();
1623         fiy1             = _mm256_setzero_pd();
1624         fiz1             = _mm256_setzero_pd();
1625         fix2             = _mm256_setzero_pd();
1626         fiy2             = _mm256_setzero_pd();
1627         fiz2             = _mm256_setzero_pd();
1628
1629         /* Start inner kernel loop */
1630         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1631         {
1632
1633             /* Get j neighbor index, and coordinate index */
1634             jnrA             = jjnr[jidx];
1635             jnrB             = jjnr[jidx+1];
1636             jnrC             = jjnr[jidx+2];
1637             jnrD             = jjnr[jidx+3];
1638             j_coord_offsetA  = DIM*jnrA;
1639             j_coord_offsetB  = DIM*jnrB;
1640             j_coord_offsetC  = DIM*jnrC;
1641             j_coord_offsetD  = DIM*jnrD;
1642
1643             /* load j atom coordinates */
1644             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1645                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1646                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1647
1648             /* Calculate displacement vector */
1649             dx00             = _mm256_sub_pd(ix0,jx0);
1650             dy00             = _mm256_sub_pd(iy0,jy0);
1651             dz00             = _mm256_sub_pd(iz0,jz0);
1652             dx01             = _mm256_sub_pd(ix0,jx1);
1653             dy01             = _mm256_sub_pd(iy0,jy1);
1654             dz01             = _mm256_sub_pd(iz0,jz1);
1655             dx02             = _mm256_sub_pd(ix0,jx2);
1656             dy02             = _mm256_sub_pd(iy0,jy2);
1657             dz02             = _mm256_sub_pd(iz0,jz2);
1658             dx10             = _mm256_sub_pd(ix1,jx0);
1659             dy10             = _mm256_sub_pd(iy1,jy0);
1660             dz10             = _mm256_sub_pd(iz1,jz0);
1661             dx11             = _mm256_sub_pd(ix1,jx1);
1662             dy11             = _mm256_sub_pd(iy1,jy1);
1663             dz11             = _mm256_sub_pd(iz1,jz1);
1664             dx12             = _mm256_sub_pd(ix1,jx2);
1665             dy12             = _mm256_sub_pd(iy1,jy2);
1666             dz12             = _mm256_sub_pd(iz1,jz2);
1667             dx20             = _mm256_sub_pd(ix2,jx0);
1668             dy20             = _mm256_sub_pd(iy2,jy0);
1669             dz20             = _mm256_sub_pd(iz2,jz0);
1670             dx21             = _mm256_sub_pd(ix2,jx1);
1671             dy21             = _mm256_sub_pd(iy2,jy1);
1672             dz21             = _mm256_sub_pd(iz2,jz1);
1673             dx22             = _mm256_sub_pd(ix2,jx2);
1674             dy22             = _mm256_sub_pd(iy2,jy2);
1675             dz22             = _mm256_sub_pd(iz2,jz2);
1676
1677             /* Calculate squared distance and things based on it */
1678             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1679             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1680             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1681             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1682             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1683             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1684             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1685             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1686             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1687
1688             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1689             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
1690             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
1691             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
1692             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1693             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1694             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
1695             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1696             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1697
1698             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1699             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
1700             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
1701             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1702             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1703             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1704             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1705             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1706             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1707
1708             fjx0             = _mm256_setzero_pd();
1709             fjy0             = _mm256_setzero_pd();
1710             fjz0             = _mm256_setzero_pd();
1711             fjx1             = _mm256_setzero_pd();
1712             fjy1             = _mm256_setzero_pd();
1713             fjz1             = _mm256_setzero_pd();
1714             fjx2             = _mm256_setzero_pd();
1715             fjy2             = _mm256_setzero_pd();
1716             fjz2             = _mm256_setzero_pd();
1717
1718             /**************************
1719              * CALCULATE INTERACTIONS *
1720              **************************/
1721
1722             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1723             {
1724
1725             r00              = _mm256_mul_pd(rsq00,rinv00);
1726
1727             /* EWALD ELECTROSTATICS */
1728
1729             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1730             ewrt             = _mm256_mul_pd(r00,ewtabscale);
1731             ewitab           = _mm256_cvttpd_epi32(ewrt);
1732             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1733             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1734                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1735                                             &ewtabF,&ewtabFn);
1736             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1737             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1738
1739             /* LENNARD-JONES DISPERSION/REPULSION */
1740
1741             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1742             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1743
1744             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1745
1746             fscal            = _mm256_add_pd(felec,fvdw);
1747
1748             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1749
1750             /* Calculate temporary vectorial force */
1751             tx               = _mm256_mul_pd(fscal,dx00);
1752             ty               = _mm256_mul_pd(fscal,dy00);
1753             tz               = _mm256_mul_pd(fscal,dz00);
1754
1755             /* Update vectorial force */
1756             fix0             = _mm256_add_pd(fix0,tx);
1757             fiy0             = _mm256_add_pd(fiy0,ty);
1758             fiz0             = _mm256_add_pd(fiz0,tz);
1759
1760             fjx0             = _mm256_add_pd(fjx0,tx);
1761             fjy0             = _mm256_add_pd(fjy0,ty);
1762             fjz0             = _mm256_add_pd(fjz0,tz);
1763
1764             }
1765
1766             /**************************
1767              * CALCULATE INTERACTIONS *
1768              **************************/
1769
1770             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1771             {
1772
1773             r01              = _mm256_mul_pd(rsq01,rinv01);
1774
1775             /* EWALD ELECTROSTATICS */
1776
1777             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1778             ewrt             = _mm256_mul_pd(r01,ewtabscale);
1779             ewitab           = _mm256_cvttpd_epi32(ewrt);
1780             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1781             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1782                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1783                                             &ewtabF,&ewtabFn);
1784             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1785             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1786
1787             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1788
1789             fscal            = felec;
1790
1791             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1792
1793             /* Calculate temporary vectorial force */
1794             tx               = _mm256_mul_pd(fscal,dx01);
1795             ty               = _mm256_mul_pd(fscal,dy01);
1796             tz               = _mm256_mul_pd(fscal,dz01);
1797
1798             /* Update vectorial force */
1799             fix0             = _mm256_add_pd(fix0,tx);
1800             fiy0             = _mm256_add_pd(fiy0,ty);
1801             fiz0             = _mm256_add_pd(fiz0,tz);
1802
1803             fjx1             = _mm256_add_pd(fjx1,tx);
1804             fjy1             = _mm256_add_pd(fjy1,ty);
1805             fjz1             = _mm256_add_pd(fjz1,tz);
1806
1807             }
1808
1809             /**************************
1810              * CALCULATE INTERACTIONS *
1811              **************************/
1812
1813             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1814             {
1815
1816             r02              = _mm256_mul_pd(rsq02,rinv02);
1817
1818             /* EWALD ELECTROSTATICS */
1819
1820             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1821             ewrt             = _mm256_mul_pd(r02,ewtabscale);
1822             ewitab           = _mm256_cvttpd_epi32(ewrt);
1823             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1824             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1825                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1826                                             &ewtabF,&ewtabFn);
1827             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1828             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1829
1830             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1831
1832             fscal            = felec;
1833
1834             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1835
1836             /* Calculate temporary vectorial force */
1837             tx               = _mm256_mul_pd(fscal,dx02);
1838             ty               = _mm256_mul_pd(fscal,dy02);
1839             tz               = _mm256_mul_pd(fscal,dz02);
1840
1841             /* Update vectorial force */
1842             fix0             = _mm256_add_pd(fix0,tx);
1843             fiy0             = _mm256_add_pd(fiy0,ty);
1844             fiz0             = _mm256_add_pd(fiz0,tz);
1845
1846             fjx2             = _mm256_add_pd(fjx2,tx);
1847             fjy2             = _mm256_add_pd(fjy2,ty);
1848             fjz2             = _mm256_add_pd(fjz2,tz);
1849
1850             }
1851
1852             /**************************
1853              * CALCULATE INTERACTIONS *
1854              **************************/
1855
1856             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1857             {
1858
1859             r10              = _mm256_mul_pd(rsq10,rinv10);
1860
1861             /* EWALD ELECTROSTATICS */
1862
1863             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1864             ewrt             = _mm256_mul_pd(r10,ewtabscale);
1865             ewitab           = _mm256_cvttpd_epi32(ewrt);
1866             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1867             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1868                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1869                                             &ewtabF,&ewtabFn);
1870             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1871             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1872
1873             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1874
1875             fscal            = felec;
1876
1877             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1878
1879             /* Calculate temporary vectorial force */
1880             tx               = _mm256_mul_pd(fscal,dx10);
1881             ty               = _mm256_mul_pd(fscal,dy10);
1882             tz               = _mm256_mul_pd(fscal,dz10);
1883
1884             /* Update vectorial force */
1885             fix1             = _mm256_add_pd(fix1,tx);
1886             fiy1             = _mm256_add_pd(fiy1,ty);
1887             fiz1             = _mm256_add_pd(fiz1,tz);
1888
1889             fjx0             = _mm256_add_pd(fjx0,tx);
1890             fjy0             = _mm256_add_pd(fjy0,ty);
1891             fjz0             = _mm256_add_pd(fjz0,tz);
1892
1893             }
1894
1895             /**************************
1896              * CALCULATE INTERACTIONS *
1897              **************************/
1898
1899             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1900             {
1901
1902             r11              = _mm256_mul_pd(rsq11,rinv11);
1903
1904             /* EWALD ELECTROSTATICS */
1905
1906             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1907             ewrt             = _mm256_mul_pd(r11,ewtabscale);
1908             ewitab           = _mm256_cvttpd_epi32(ewrt);
1909             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1910             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1911                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1912                                             &ewtabF,&ewtabFn);
1913             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1914             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1915
1916             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1917
1918             fscal            = felec;
1919
1920             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1921
1922             /* Calculate temporary vectorial force */
1923             tx               = _mm256_mul_pd(fscal,dx11);
1924             ty               = _mm256_mul_pd(fscal,dy11);
1925             tz               = _mm256_mul_pd(fscal,dz11);
1926
1927             /* Update vectorial force */
1928             fix1             = _mm256_add_pd(fix1,tx);
1929             fiy1             = _mm256_add_pd(fiy1,ty);
1930             fiz1             = _mm256_add_pd(fiz1,tz);
1931
1932             fjx1             = _mm256_add_pd(fjx1,tx);
1933             fjy1             = _mm256_add_pd(fjy1,ty);
1934             fjz1             = _mm256_add_pd(fjz1,tz);
1935
1936             }
1937
1938             /**************************
1939              * CALCULATE INTERACTIONS *
1940              **************************/
1941
1942             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1943             {
1944
1945             r12              = _mm256_mul_pd(rsq12,rinv12);
1946
1947             /* EWALD ELECTROSTATICS */
1948
1949             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1950             ewrt             = _mm256_mul_pd(r12,ewtabscale);
1951             ewitab           = _mm256_cvttpd_epi32(ewrt);
1952             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1953             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1954                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1955                                             &ewtabF,&ewtabFn);
1956             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1957             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1958
1959             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1960
1961             fscal            = felec;
1962
1963             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1964
1965             /* Calculate temporary vectorial force */
1966             tx               = _mm256_mul_pd(fscal,dx12);
1967             ty               = _mm256_mul_pd(fscal,dy12);
1968             tz               = _mm256_mul_pd(fscal,dz12);
1969
1970             /* Update vectorial force */
1971             fix1             = _mm256_add_pd(fix1,tx);
1972             fiy1             = _mm256_add_pd(fiy1,ty);
1973             fiz1             = _mm256_add_pd(fiz1,tz);
1974
1975             fjx2             = _mm256_add_pd(fjx2,tx);
1976             fjy2             = _mm256_add_pd(fjy2,ty);
1977             fjz2             = _mm256_add_pd(fjz2,tz);
1978
1979             }
1980
1981             /**************************
1982              * CALCULATE INTERACTIONS *
1983              **************************/
1984
1985             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1986             {
1987
1988             r20              = _mm256_mul_pd(rsq20,rinv20);
1989
1990             /* EWALD ELECTROSTATICS */
1991
1992             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1993             ewrt             = _mm256_mul_pd(r20,ewtabscale);
1994             ewitab           = _mm256_cvttpd_epi32(ewrt);
1995             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1996             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1997                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1998                                             &ewtabF,&ewtabFn);
1999             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2000             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2001
2002             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2003
2004             fscal            = felec;
2005
2006             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2007
2008             /* Calculate temporary vectorial force */
2009             tx               = _mm256_mul_pd(fscal,dx20);
2010             ty               = _mm256_mul_pd(fscal,dy20);
2011             tz               = _mm256_mul_pd(fscal,dz20);
2012
2013             /* Update vectorial force */
2014             fix2             = _mm256_add_pd(fix2,tx);
2015             fiy2             = _mm256_add_pd(fiy2,ty);
2016             fiz2             = _mm256_add_pd(fiz2,tz);
2017
2018             fjx0             = _mm256_add_pd(fjx0,tx);
2019             fjy0             = _mm256_add_pd(fjy0,ty);
2020             fjz0             = _mm256_add_pd(fjz0,tz);
2021
2022             }
2023
2024             /**************************
2025              * CALCULATE INTERACTIONS *
2026              **************************/
2027
2028             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2029             {
2030
2031             r21              = _mm256_mul_pd(rsq21,rinv21);
2032
2033             /* EWALD ELECTROSTATICS */
2034
2035             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2036             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2037             ewitab           = _mm256_cvttpd_epi32(ewrt);
2038             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2039             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2040                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2041                                             &ewtabF,&ewtabFn);
2042             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2043             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2044
2045             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2046
2047             fscal            = felec;
2048
2049             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2050
2051             /* Calculate temporary vectorial force */
2052             tx               = _mm256_mul_pd(fscal,dx21);
2053             ty               = _mm256_mul_pd(fscal,dy21);
2054             tz               = _mm256_mul_pd(fscal,dz21);
2055
2056             /* Update vectorial force */
2057             fix2             = _mm256_add_pd(fix2,tx);
2058             fiy2             = _mm256_add_pd(fiy2,ty);
2059             fiz2             = _mm256_add_pd(fiz2,tz);
2060
2061             fjx1             = _mm256_add_pd(fjx1,tx);
2062             fjy1             = _mm256_add_pd(fjy1,ty);
2063             fjz1             = _mm256_add_pd(fjz1,tz);
2064
2065             }
2066
2067             /**************************
2068              * CALCULATE INTERACTIONS *
2069              **************************/
2070
2071             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2072             {
2073
2074             r22              = _mm256_mul_pd(rsq22,rinv22);
2075
2076             /* EWALD ELECTROSTATICS */
2077
2078             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2079             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2080             ewitab           = _mm256_cvttpd_epi32(ewrt);
2081             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2082             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2083                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2084                                             &ewtabF,&ewtabFn);
2085             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2086             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2087
2088             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2089
2090             fscal            = felec;
2091
2092             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2093
2094             /* Calculate temporary vectorial force */
2095             tx               = _mm256_mul_pd(fscal,dx22);
2096             ty               = _mm256_mul_pd(fscal,dy22);
2097             tz               = _mm256_mul_pd(fscal,dz22);
2098
2099             /* Update vectorial force */
2100             fix2             = _mm256_add_pd(fix2,tx);
2101             fiy2             = _mm256_add_pd(fiy2,ty);
2102             fiz2             = _mm256_add_pd(fiz2,tz);
2103
2104             fjx2             = _mm256_add_pd(fjx2,tx);
2105             fjy2             = _mm256_add_pd(fjy2,ty);
2106             fjz2             = _mm256_add_pd(fjz2,tz);
2107
2108             }
2109
2110             fjptrA             = f+j_coord_offsetA;
2111             fjptrB             = f+j_coord_offsetB;
2112             fjptrC             = f+j_coord_offsetC;
2113             fjptrD             = f+j_coord_offsetD;
2114
2115             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2116                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2117
2118             /* Inner loop uses 358 flops */
2119         }
2120
2121         if(jidx<j_index_end)
2122         {
2123
2124             /* Get j neighbor index, and coordinate index */
2125             jnrlistA         = jjnr[jidx];
2126             jnrlistB         = jjnr[jidx+1];
2127             jnrlistC         = jjnr[jidx+2];
2128             jnrlistD         = jjnr[jidx+3];
2129             /* Sign of each element will be negative for non-real atoms.
2130              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2131              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2132              */
2133             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2134
2135             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2136             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2137             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2138
2139             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2140             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2141             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2142             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2143             j_coord_offsetA  = DIM*jnrA;
2144             j_coord_offsetB  = DIM*jnrB;
2145             j_coord_offsetC  = DIM*jnrC;
2146             j_coord_offsetD  = DIM*jnrD;
2147
2148             /* load j atom coordinates */
2149             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2150                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2151                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2152
2153             /* Calculate displacement vector */
2154             dx00             = _mm256_sub_pd(ix0,jx0);
2155             dy00             = _mm256_sub_pd(iy0,jy0);
2156             dz00             = _mm256_sub_pd(iz0,jz0);
2157             dx01             = _mm256_sub_pd(ix0,jx1);
2158             dy01             = _mm256_sub_pd(iy0,jy1);
2159             dz01             = _mm256_sub_pd(iz0,jz1);
2160             dx02             = _mm256_sub_pd(ix0,jx2);
2161             dy02             = _mm256_sub_pd(iy0,jy2);
2162             dz02             = _mm256_sub_pd(iz0,jz2);
2163             dx10             = _mm256_sub_pd(ix1,jx0);
2164             dy10             = _mm256_sub_pd(iy1,jy0);
2165             dz10             = _mm256_sub_pd(iz1,jz0);
2166             dx11             = _mm256_sub_pd(ix1,jx1);
2167             dy11             = _mm256_sub_pd(iy1,jy1);
2168             dz11             = _mm256_sub_pd(iz1,jz1);
2169             dx12             = _mm256_sub_pd(ix1,jx2);
2170             dy12             = _mm256_sub_pd(iy1,jy2);
2171             dz12             = _mm256_sub_pd(iz1,jz2);
2172             dx20             = _mm256_sub_pd(ix2,jx0);
2173             dy20             = _mm256_sub_pd(iy2,jy0);
2174             dz20             = _mm256_sub_pd(iz2,jz0);
2175             dx21             = _mm256_sub_pd(ix2,jx1);
2176             dy21             = _mm256_sub_pd(iy2,jy1);
2177             dz21             = _mm256_sub_pd(iz2,jz1);
2178             dx22             = _mm256_sub_pd(ix2,jx2);
2179             dy22             = _mm256_sub_pd(iy2,jy2);
2180             dz22             = _mm256_sub_pd(iz2,jz2);
2181
2182             /* Calculate squared distance and things based on it */
2183             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2184             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
2185             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
2186             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
2187             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2188             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2189             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
2190             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2191             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2192
2193             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
2194             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
2195             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
2196             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
2197             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
2198             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
2199             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
2200             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
2201             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
2202
2203             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
2204             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
2205             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
2206             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
2207             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
2208             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
2209             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
2210             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
2211             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
2212
2213             fjx0             = _mm256_setzero_pd();
2214             fjy0             = _mm256_setzero_pd();
2215             fjz0             = _mm256_setzero_pd();
2216             fjx1             = _mm256_setzero_pd();
2217             fjy1             = _mm256_setzero_pd();
2218             fjz1             = _mm256_setzero_pd();
2219             fjx2             = _mm256_setzero_pd();
2220             fjy2             = _mm256_setzero_pd();
2221             fjz2             = _mm256_setzero_pd();
2222
2223             /**************************
2224              * CALCULATE INTERACTIONS *
2225              **************************/
2226
2227             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2228             {
2229
2230             r00              = _mm256_mul_pd(rsq00,rinv00);
2231             r00              = _mm256_andnot_pd(dummy_mask,r00);
2232
2233             /* EWALD ELECTROSTATICS */
2234
2235             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2236             ewrt             = _mm256_mul_pd(r00,ewtabscale);
2237             ewitab           = _mm256_cvttpd_epi32(ewrt);
2238             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2239             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2240                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2241                                             &ewtabF,&ewtabFn);
2242             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2243             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2244
2245             /* LENNARD-JONES DISPERSION/REPULSION */
2246
2247             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2248             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
2249
2250             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2251
2252             fscal            = _mm256_add_pd(felec,fvdw);
2253
2254             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2255
2256             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2257
2258             /* Calculate temporary vectorial force */
2259             tx               = _mm256_mul_pd(fscal,dx00);
2260             ty               = _mm256_mul_pd(fscal,dy00);
2261             tz               = _mm256_mul_pd(fscal,dz00);
2262
2263             /* Update vectorial force */
2264             fix0             = _mm256_add_pd(fix0,tx);
2265             fiy0             = _mm256_add_pd(fiy0,ty);
2266             fiz0             = _mm256_add_pd(fiz0,tz);
2267
2268             fjx0             = _mm256_add_pd(fjx0,tx);
2269             fjy0             = _mm256_add_pd(fjy0,ty);
2270             fjz0             = _mm256_add_pd(fjz0,tz);
2271
2272             }
2273
2274             /**************************
2275              * CALCULATE INTERACTIONS *
2276              **************************/
2277
2278             if (gmx_mm256_any_lt(rsq01,rcutoff2))
2279             {
2280
2281             r01              = _mm256_mul_pd(rsq01,rinv01);
2282             r01              = _mm256_andnot_pd(dummy_mask,r01);
2283
2284             /* EWALD ELECTROSTATICS */
2285
2286             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2287             ewrt             = _mm256_mul_pd(r01,ewtabscale);
2288             ewitab           = _mm256_cvttpd_epi32(ewrt);
2289             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2290             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2291                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2292                                             &ewtabF,&ewtabFn);
2293             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2294             felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2295
2296             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
2297
2298             fscal            = felec;
2299
2300             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2301
2302             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2303
2304             /* Calculate temporary vectorial force */
2305             tx               = _mm256_mul_pd(fscal,dx01);
2306             ty               = _mm256_mul_pd(fscal,dy01);
2307             tz               = _mm256_mul_pd(fscal,dz01);
2308
2309             /* Update vectorial force */
2310             fix0             = _mm256_add_pd(fix0,tx);
2311             fiy0             = _mm256_add_pd(fiy0,ty);
2312             fiz0             = _mm256_add_pd(fiz0,tz);
2313
2314             fjx1             = _mm256_add_pd(fjx1,tx);
2315             fjy1             = _mm256_add_pd(fjy1,ty);
2316             fjz1             = _mm256_add_pd(fjz1,tz);
2317
2318             }
2319
2320             /**************************
2321              * CALCULATE INTERACTIONS *
2322              **************************/
2323
2324             if (gmx_mm256_any_lt(rsq02,rcutoff2))
2325             {
2326
2327             r02              = _mm256_mul_pd(rsq02,rinv02);
2328             r02              = _mm256_andnot_pd(dummy_mask,r02);
2329
2330             /* EWALD ELECTROSTATICS */
2331
2332             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2333             ewrt             = _mm256_mul_pd(r02,ewtabscale);
2334             ewitab           = _mm256_cvttpd_epi32(ewrt);
2335             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2336             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2337                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2338                                             &ewtabF,&ewtabFn);
2339             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2340             felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2341
2342             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
2343
2344             fscal            = felec;
2345
2346             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2347
2348             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2349
2350             /* Calculate temporary vectorial force */
2351             tx               = _mm256_mul_pd(fscal,dx02);
2352             ty               = _mm256_mul_pd(fscal,dy02);
2353             tz               = _mm256_mul_pd(fscal,dz02);
2354
2355             /* Update vectorial force */
2356             fix0             = _mm256_add_pd(fix0,tx);
2357             fiy0             = _mm256_add_pd(fiy0,ty);
2358             fiz0             = _mm256_add_pd(fiz0,tz);
2359
2360             fjx2             = _mm256_add_pd(fjx2,tx);
2361             fjy2             = _mm256_add_pd(fjy2,ty);
2362             fjz2             = _mm256_add_pd(fjz2,tz);
2363
2364             }
2365
2366             /**************************
2367              * CALCULATE INTERACTIONS *
2368              **************************/
2369
2370             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2371             {
2372
2373             r10              = _mm256_mul_pd(rsq10,rinv10);
2374             r10              = _mm256_andnot_pd(dummy_mask,r10);
2375
2376             /* EWALD ELECTROSTATICS */
2377
2378             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2379             ewrt             = _mm256_mul_pd(r10,ewtabscale);
2380             ewitab           = _mm256_cvttpd_epi32(ewrt);
2381             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2382             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2383                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2384                                             &ewtabF,&ewtabFn);
2385             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2386             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2387
2388             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2389
2390             fscal            = felec;
2391
2392             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2393
2394             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2395
2396             /* Calculate temporary vectorial force */
2397             tx               = _mm256_mul_pd(fscal,dx10);
2398             ty               = _mm256_mul_pd(fscal,dy10);
2399             tz               = _mm256_mul_pd(fscal,dz10);
2400
2401             /* Update vectorial force */
2402             fix1             = _mm256_add_pd(fix1,tx);
2403             fiy1             = _mm256_add_pd(fiy1,ty);
2404             fiz1             = _mm256_add_pd(fiz1,tz);
2405
2406             fjx0             = _mm256_add_pd(fjx0,tx);
2407             fjy0             = _mm256_add_pd(fjy0,ty);
2408             fjz0             = _mm256_add_pd(fjz0,tz);
2409
2410             }
2411
2412             /**************************
2413              * CALCULATE INTERACTIONS *
2414              **************************/
2415
2416             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2417             {
2418
2419             r11              = _mm256_mul_pd(rsq11,rinv11);
2420             r11              = _mm256_andnot_pd(dummy_mask,r11);
2421
2422             /* EWALD ELECTROSTATICS */
2423
2424             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2425             ewrt             = _mm256_mul_pd(r11,ewtabscale);
2426             ewitab           = _mm256_cvttpd_epi32(ewrt);
2427             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2428             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2429                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2430                                             &ewtabF,&ewtabFn);
2431             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2432             felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2433
2434             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2435
2436             fscal            = felec;
2437
2438             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2439
2440             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2441
2442             /* Calculate temporary vectorial force */
2443             tx               = _mm256_mul_pd(fscal,dx11);
2444             ty               = _mm256_mul_pd(fscal,dy11);
2445             tz               = _mm256_mul_pd(fscal,dz11);
2446
2447             /* Update vectorial force */
2448             fix1             = _mm256_add_pd(fix1,tx);
2449             fiy1             = _mm256_add_pd(fiy1,ty);
2450             fiz1             = _mm256_add_pd(fiz1,tz);
2451
2452             fjx1             = _mm256_add_pd(fjx1,tx);
2453             fjy1             = _mm256_add_pd(fjy1,ty);
2454             fjz1             = _mm256_add_pd(fjz1,tz);
2455
2456             }
2457
2458             /**************************
2459              * CALCULATE INTERACTIONS *
2460              **************************/
2461
2462             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2463             {
2464
2465             r12              = _mm256_mul_pd(rsq12,rinv12);
2466             r12              = _mm256_andnot_pd(dummy_mask,r12);
2467
2468             /* EWALD ELECTROSTATICS */
2469
2470             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2471             ewrt             = _mm256_mul_pd(r12,ewtabscale);
2472             ewitab           = _mm256_cvttpd_epi32(ewrt);
2473             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2474             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2475                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2476                                             &ewtabF,&ewtabFn);
2477             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2478             felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2479
2480             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2481
2482             fscal            = felec;
2483
2484             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2485
2486             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2487
2488             /* Calculate temporary vectorial force */
2489             tx               = _mm256_mul_pd(fscal,dx12);
2490             ty               = _mm256_mul_pd(fscal,dy12);
2491             tz               = _mm256_mul_pd(fscal,dz12);
2492
2493             /* Update vectorial force */
2494             fix1             = _mm256_add_pd(fix1,tx);
2495             fiy1             = _mm256_add_pd(fiy1,ty);
2496             fiz1             = _mm256_add_pd(fiz1,tz);
2497
2498             fjx2             = _mm256_add_pd(fjx2,tx);
2499             fjy2             = _mm256_add_pd(fjy2,ty);
2500             fjz2             = _mm256_add_pd(fjz2,tz);
2501
2502             }
2503
2504             /**************************
2505              * CALCULATE INTERACTIONS *
2506              **************************/
2507
2508             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2509             {
2510
2511             r20              = _mm256_mul_pd(rsq20,rinv20);
2512             r20              = _mm256_andnot_pd(dummy_mask,r20);
2513
2514             /* EWALD ELECTROSTATICS */
2515
2516             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2517             ewrt             = _mm256_mul_pd(r20,ewtabscale);
2518             ewitab           = _mm256_cvttpd_epi32(ewrt);
2519             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2520             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2521                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2522                                             &ewtabF,&ewtabFn);
2523             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2524             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2525
2526             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2527
2528             fscal            = felec;
2529
2530             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2531
2532             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2533
2534             /* Calculate temporary vectorial force */
2535             tx               = _mm256_mul_pd(fscal,dx20);
2536             ty               = _mm256_mul_pd(fscal,dy20);
2537             tz               = _mm256_mul_pd(fscal,dz20);
2538
2539             /* Update vectorial force */
2540             fix2             = _mm256_add_pd(fix2,tx);
2541             fiy2             = _mm256_add_pd(fiy2,ty);
2542             fiz2             = _mm256_add_pd(fiz2,tz);
2543
2544             fjx0             = _mm256_add_pd(fjx0,tx);
2545             fjy0             = _mm256_add_pd(fjy0,ty);
2546             fjz0             = _mm256_add_pd(fjz0,tz);
2547
2548             }
2549
2550             /**************************
2551              * CALCULATE INTERACTIONS *
2552              **************************/
2553
2554             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2555             {
2556
2557             r21              = _mm256_mul_pd(rsq21,rinv21);
2558             r21              = _mm256_andnot_pd(dummy_mask,r21);
2559
2560             /* EWALD ELECTROSTATICS */
2561
2562             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2563             ewrt             = _mm256_mul_pd(r21,ewtabscale);
2564             ewitab           = _mm256_cvttpd_epi32(ewrt);
2565             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2566             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2567                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2568                                             &ewtabF,&ewtabFn);
2569             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2570             felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2571
2572             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2573
2574             fscal            = felec;
2575
2576             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2577
2578             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2579
2580             /* Calculate temporary vectorial force */
2581             tx               = _mm256_mul_pd(fscal,dx21);
2582             ty               = _mm256_mul_pd(fscal,dy21);
2583             tz               = _mm256_mul_pd(fscal,dz21);
2584
2585             /* Update vectorial force */
2586             fix2             = _mm256_add_pd(fix2,tx);
2587             fiy2             = _mm256_add_pd(fiy2,ty);
2588             fiz2             = _mm256_add_pd(fiz2,tz);
2589
2590             fjx1             = _mm256_add_pd(fjx1,tx);
2591             fjy1             = _mm256_add_pd(fjy1,ty);
2592             fjz1             = _mm256_add_pd(fjz1,tz);
2593
2594             }
2595
2596             /**************************
2597              * CALCULATE INTERACTIONS *
2598              **************************/
2599
2600             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2601             {
2602
2603             r22              = _mm256_mul_pd(rsq22,rinv22);
2604             r22              = _mm256_andnot_pd(dummy_mask,r22);
2605
2606             /* EWALD ELECTROSTATICS */
2607
2608             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2609             ewrt             = _mm256_mul_pd(r22,ewtabscale);
2610             ewitab           = _mm256_cvttpd_epi32(ewrt);
2611             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2612             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2613                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2614                                             &ewtabF,&ewtabFn);
2615             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2616             felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2617
2618             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2619
2620             fscal            = felec;
2621
2622             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2623
2624             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2625
2626             /* Calculate temporary vectorial force */
2627             tx               = _mm256_mul_pd(fscal,dx22);
2628             ty               = _mm256_mul_pd(fscal,dy22);
2629             tz               = _mm256_mul_pd(fscal,dz22);
2630
2631             /* Update vectorial force */
2632             fix2             = _mm256_add_pd(fix2,tx);
2633             fiy2             = _mm256_add_pd(fiy2,ty);
2634             fiz2             = _mm256_add_pd(fiz2,tz);
2635
2636             fjx2             = _mm256_add_pd(fjx2,tx);
2637             fjy2             = _mm256_add_pd(fjy2,ty);
2638             fjz2             = _mm256_add_pd(fjz2,tz);
2639
2640             }
2641
2642             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2643             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2644             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2645             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2646
2647             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2648                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2649
2650             /* Inner loop uses 367 flops */
2651         }
2652
2653         /* End of innermost loop */
2654
2655         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2656                                                  f+i_coord_offset,fshift+i_shift_offset);
2657
2658         /* Increment number of inner iterations */
2659         inneriter                  += j_index_end - j_index_start;
2660
2661         /* Outer loop uses 18 flops */
2662     }
2663
2664     /* Increment number of outer iterations */
2665     outeriter        += nri;
2666
2667     /* Update outer/inner flops */
2668
2669     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*367);
2670 }