Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_avx_256_double.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_256_double
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water4-Particle
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_256_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     real *           vdwioffsetptr0;
84     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     real *           vdwioffsetptr1;
86     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     real *           vdwioffsetptr2;
88     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     real *           vdwioffsetptr3;
90     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
96     __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
97     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
98     real             *charge;
99     int              nvdwtype;
100     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
101     int              *vdwtype;
102     real             *vdwparam;
103     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
104     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
105     __m128i          ewitab;
106     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
107     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
108     real             *ewtab;
109     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
110     real             rswitch_scalar,d_scalar;
111     __m256d          dummy_mask,cutoff_mask;
112     __m128           tmpmask0,tmpmask1;
113     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
114     __m256d          one     = _mm256_set1_pd(1.0);
115     __m256d          two     = _mm256_set1_pd(2.0);
116     x                = xx[0];
117     f                = ff[0];
118
119     nri              = nlist->nri;
120     iinr             = nlist->iinr;
121     jindex           = nlist->jindex;
122     jjnr             = nlist->jjnr;
123     shiftidx         = nlist->shift;
124     gid              = nlist->gid;
125     shiftvec         = fr->shift_vec[0];
126     fshift           = fr->fshift[0];
127     facel            = _mm256_set1_pd(fr->ic->epsfac);
128     charge           = mdatoms->chargeA;
129     nvdwtype         = fr->ntype;
130     vdwparam         = fr->nbfp;
131     vdwtype          = mdatoms->typeA;
132
133     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
134     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
135     beta2            = _mm256_mul_pd(beta,beta);
136     beta3            = _mm256_mul_pd(beta,beta2);
137
138     ewtab            = fr->ic->tabq_coul_FDV0;
139     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
140     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
141
142     /* Setup water-specific parameters */
143     inr              = nlist->iinr[0];
144     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
145     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
146     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
147     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
148
149     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
150     rcutoff_scalar   = fr->ic->rcoulomb;
151     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
152     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
153
154     rswitch_scalar   = fr->ic->rcoulomb_switch;
155     rswitch          = _mm256_set1_pd(rswitch_scalar);
156     /* Setup switch parameters */
157     d_scalar         = rcutoff_scalar-rswitch_scalar;
158     d                = _mm256_set1_pd(d_scalar);
159     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
160     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
161     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
162     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
163     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
164     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
165
166     /* Avoid stupid compiler warnings */
167     jnrA = jnrB = jnrC = jnrD = 0;
168     j_coord_offsetA = 0;
169     j_coord_offsetB = 0;
170     j_coord_offsetC = 0;
171     j_coord_offsetD = 0;
172
173     outeriter        = 0;
174     inneriter        = 0;
175
176     for(iidx=0;iidx<4*DIM;iidx++)
177     {
178         scratch[iidx] = 0.0;
179     }
180
181     /* Start outer loop over neighborlists */
182     for(iidx=0; iidx<nri; iidx++)
183     {
184         /* Load shift vector for this list */
185         i_shift_offset   = DIM*shiftidx[iidx];
186
187         /* Load limits for loop over neighbors */
188         j_index_start    = jindex[iidx];
189         j_index_end      = jindex[iidx+1];
190
191         /* Get outer coordinate index */
192         inr              = iinr[iidx];
193         i_coord_offset   = DIM*inr;
194
195         /* Load i particle coords and add shift vector */
196         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
197                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
198
199         fix0             = _mm256_setzero_pd();
200         fiy0             = _mm256_setzero_pd();
201         fiz0             = _mm256_setzero_pd();
202         fix1             = _mm256_setzero_pd();
203         fiy1             = _mm256_setzero_pd();
204         fiz1             = _mm256_setzero_pd();
205         fix2             = _mm256_setzero_pd();
206         fiy2             = _mm256_setzero_pd();
207         fiz2             = _mm256_setzero_pd();
208         fix3             = _mm256_setzero_pd();
209         fiy3             = _mm256_setzero_pd();
210         fiz3             = _mm256_setzero_pd();
211
212         /* Reset potential sums */
213         velecsum         = _mm256_setzero_pd();
214         vvdwsum          = _mm256_setzero_pd();
215
216         /* Start inner kernel loop */
217         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
218         {
219
220             /* Get j neighbor index, and coordinate index */
221             jnrA             = jjnr[jidx];
222             jnrB             = jjnr[jidx+1];
223             jnrC             = jjnr[jidx+2];
224             jnrD             = jjnr[jidx+3];
225             j_coord_offsetA  = DIM*jnrA;
226             j_coord_offsetB  = DIM*jnrB;
227             j_coord_offsetC  = DIM*jnrC;
228             j_coord_offsetD  = DIM*jnrD;
229
230             /* load j atom coordinates */
231             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
232                                                  x+j_coord_offsetC,x+j_coord_offsetD,
233                                                  &jx0,&jy0,&jz0);
234
235             /* Calculate displacement vector */
236             dx00             = _mm256_sub_pd(ix0,jx0);
237             dy00             = _mm256_sub_pd(iy0,jy0);
238             dz00             = _mm256_sub_pd(iz0,jz0);
239             dx10             = _mm256_sub_pd(ix1,jx0);
240             dy10             = _mm256_sub_pd(iy1,jy0);
241             dz10             = _mm256_sub_pd(iz1,jz0);
242             dx20             = _mm256_sub_pd(ix2,jx0);
243             dy20             = _mm256_sub_pd(iy2,jy0);
244             dz20             = _mm256_sub_pd(iz2,jz0);
245             dx30             = _mm256_sub_pd(ix3,jx0);
246             dy30             = _mm256_sub_pd(iy3,jy0);
247             dz30             = _mm256_sub_pd(iz3,jz0);
248
249             /* Calculate squared distance and things based on it */
250             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
251             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
252             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
253             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
254
255             rinv00           = avx256_invsqrt_d(rsq00);
256             rinv10           = avx256_invsqrt_d(rsq10);
257             rinv20           = avx256_invsqrt_d(rsq20);
258             rinv30           = avx256_invsqrt_d(rsq30);
259
260             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
261             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
262             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
263             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
264
265             /* Load parameters for j particles */
266             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
267                                                                  charge+jnrC+0,charge+jnrD+0);
268             vdwjidx0A        = 2*vdwtype[jnrA+0];
269             vdwjidx0B        = 2*vdwtype[jnrB+0];
270             vdwjidx0C        = 2*vdwtype[jnrC+0];
271             vdwjidx0D        = 2*vdwtype[jnrD+0];
272
273             fjx0             = _mm256_setzero_pd();
274             fjy0             = _mm256_setzero_pd();
275             fjz0             = _mm256_setzero_pd();
276
277             /**************************
278              * CALCULATE INTERACTIONS *
279              **************************/
280
281             if (gmx_mm256_any_lt(rsq00,rcutoff2))
282             {
283
284             r00              = _mm256_mul_pd(rsq00,rinv00);
285
286             /* Compute parameters for interactions between i and j atoms */
287             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
288                                             vdwioffsetptr0+vdwjidx0B,
289                                             vdwioffsetptr0+vdwjidx0C,
290                                             vdwioffsetptr0+vdwjidx0D,
291                                             &c6_00,&c12_00);
292
293             /* LENNARD-JONES DISPERSION/REPULSION */
294
295             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
296             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
297             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
298             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
299             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
300
301             d                = _mm256_sub_pd(r00,rswitch);
302             d                = _mm256_max_pd(d,_mm256_setzero_pd());
303             d2               = _mm256_mul_pd(d,d);
304             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
305
306             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
307
308             /* Evaluate switch function */
309             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
310             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
311             vvdw             = _mm256_mul_pd(vvdw,sw);
312             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
313
314             /* Update potential sum for this i atom from the interaction with this j atom. */
315             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
316             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
317
318             fscal            = fvdw;
319
320             fscal            = _mm256_and_pd(fscal,cutoff_mask);
321
322             /* Calculate temporary vectorial force */
323             tx               = _mm256_mul_pd(fscal,dx00);
324             ty               = _mm256_mul_pd(fscal,dy00);
325             tz               = _mm256_mul_pd(fscal,dz00);
326
327             /* Update vectorial force */
328             fix0             = _mm256_add_pd(fix0,tx);
329             fiy0             = _mm256_add_pd(fiy0,ty);
330             fiz0             = _mm256_add_pd(fiz0,tz);
331
332             fjx0             = _mm256_add_pd(fjx0,tx);
333             fjy0             = _mm256_add_pd(fjy0,ty);
334             fjz0             = _mm256_add_pd(fjz0,tz);
335
336             }
337
338             /**************************
339              * CALCULATE INTERACTIONS *
340              **************************/
341
342             if (gmx_mm256_any_lt(rsq10,rcutoff2))
343             {
344
345             r10              = _mm256_mul_pd(rsq10,rinv10);
346
347             /* Compute parameters for interactions between i and j atoms */
348             qq10             = _mm256_mul_pd(iq1,jq0);
349
350             /* EWALD ELECTROSTATICS */
351
352             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
353             ewrt             = _mm256_mul_pd(r10,ewtabscale);
354             ewitab           = _mm256_cvttpd_epi32(ewrt);
355             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
356             ewitab           = _mm_slli_epi32(ewitab,2);
357             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
358             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
359             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
360             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
361             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
362             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
363             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
364             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
365             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
366
367             d                = _mm256_sub_pd(r10,rswitch);
368             d                = _mm256_max_pd(d,_mm256_setzero_pd());
369             d2               = _mm256_mul_pd(d,d);
370             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
371
372             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
373
374             /* Evaluate switch function */
375             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
376             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
377             velec            = _mm256_mul_pd(velec,sw);
378             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
379
380             /* Update potential sum for this i atom from the interaction with this j atom. */
381             velec            = _mm256_and_pd(velec,cutoff_mask);
382             velecsum         = _mm256_add_pd(velecsum,velec);
383
384             fscal            = felec;
385
386             fscal            = _mm256_and_pd(fscal,cutoff_mask);
387
388             /* Calculate temporary vectorial force */
389             tx               = _mm256_mul_pd(fscal,dx10);
390             ty               = _mm256_mul_pd(fscal,dy10);
391             tz               = _mm256_mul_pd(fscal,dz10);
392
393             /* Update vectorial force */
394             fix1             = _mm256_add_pd(fix1,tx);
395             fiy1             = _mm256_add_pd(fiy1,ty);
396             fiz1             = _mm256_add_pd(fiz1,tz);
397
398             fjx0             = _mm256_add_pd(fjx0,tx);
399             fjy0             = _mm256_add_pd(fjy0,ty);
400             fjz0             = _mm256_add_pd(fjz0,tz);
401
402             }
403
404             /**************************
405              * CALCULATE INTERACTIONS *
406              **************************/
407
408             if (gmx_mm256_any_lt(rsq20,rcutoff2))
409             {
410
411             r20              = _mm256_mul_pd(rsq20,rinv20);
412
413             /* Compute parameters for interactions between i and j atoms */
414             qq20             = _mm256_mul_pd(iq2,jq0);
415
416             /* EWALD ELECTROSTATICS */
417
418             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
419             ewrt             = _mm256_mul_pd(r20,ewtabscale);
420             ewitab           = _mm256_cvttpd_epi32(ewrt);
421             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
422             ewitab           = _mm_slli_epi32(ewitab,2);
423             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
424             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
425             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
426             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
427             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
428             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
429             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
430             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
431             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
432
433             d                = _mm256_sub_pd(r20,rswitch);
434             d                = _mm256_max_pd(d,_mm256_setzero_pd());
435             d2               = _mm256_mul_pd(d,d);
436             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
437
438             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
439
440             /* Evaluate switch function */
441             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
442             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
443             velec            = _mm256_mul_pd(velec,sw);
444             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
445
446             /* Update potential sum for this i atom from the interaction with this j atom. */
447             velec            = _mm256_and_pd(velec,cutoff_mask);
448             velecsum         = _mm256_add_pd(velecsum,velec);
449
450             fscal            = felec;
451
452             fscal            = _mm256_and_pd(fscal,cutoff_mask);
453
454             /* Calculate temporary vectorial force */
455             tx               = _mm256_mul_pd(fscal,dx20);
456             ty               = _mm256_mul_pd(fscal,dy20);
457             tz               = _mm256_mul_pd(fscal,dz20);
458
459             /* Update vectorial force */
460             fix2             = _mm256_add_pd(fix2,tx);
461             fiy2             = _mm256_add_pd(fiy2,ty);
462             fiz2             = _mm256_add_pd(fiz2,tz);
463
464             fjx0             = _mm256_add_pd(fjx0,tx);
465             fjy0             = _mm256_add_pd(fjy0,ty);
466             fjz0             = _mm256_add_pd(fjz0,tz);
467
468             }
469
470             /**************************
471              * CALCULATE INTERACTIONS *
472              **************************/
473
474             if (gmx_mm256_any_lt(rsq30,rcutoff2))
475             {
476
477             r30              = _mm256_mul_pd(rsq30,rinv30);
478
479             /* Compute parameters for interactions between i and j atoms */
480             qq30             = _mm256_mul_pd(iq3,jq0);
481
482             /* EWALD ELECTROSTATICS */
483
484             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
485             ewrt             = _mm256_mul_pd(r30,ewtabscale);
486             ewitab           = _mm256_cvttpd_epi32(ewrt);
487             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
488             ewitab           = _mm_slli_epi32(ewitab,2);
489             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
490             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
491             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
492             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
493             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
494             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
495             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
496             velec            = _mm256_mul_pd(qq30,_mm256_sub_pd(rinv30,velec));
497             felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
498
499             d                = _mm256_sub_pd(r30,rswitch);
500             d                = _mm256_max_pd(d,_mm256_setzero_pd());
501             d2               = _mm256_mul_pd(d,d);
502             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
503
504             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
505
506             /* Evaluate switch function */
507             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
508             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv30,_mm256_mul_pd(velec,dsw)) );
509             velec            = _mm256_mul_pd(velec,sw);
510             cutoff_mask      = _mm256_cmp_pd(rsq30,rcutoff2,_CMP_LT_OQ);
511
512             /* Update potential sum for this i atom from the interaction with this j atom. */
513             velec            = _mm256_and_pd(velec,cutoff_mask);
514             velecsum         = _mm256_add_pd(velecsum,velec);
515
516             fscal            = felec;
517
518             fscal            = _mm256_and_pd(fscal,cutoff_mask);
519
520             /* Calculate temporary vectorial force */
521             tx               = _mm256_mul_pd(fscal,dx30);
522             ty               = _mm256_mul_pd(fscal,dy30);
523             tz               = _mm256_mul_pd(fscal,dz30);
524
525             /* Update vectorial force */
526             fix3             = _mm256_add_pd(fix3,tx);
527             fiy3             = _mm256_add_pd(fiy3,ty);
528             fiz3             = _mm256_add_pd(fiz3,tz);
529
530             fjx0             = _mm256_add_pd(fjx0,tx);
531             fjy0             = _mm256_add_pd(fjy0,ty);
532             fjz0             = _mm256_add_pd(fjz0,tz);
533
534             }
535
536             fjptrA             = f+j_coord_offsetA;
537             fjptrB             = f+j_coord_offsetB;
538             fjptrC             = f+j_coord_offsetC;
539             fjptrD             = f+j_coord_offsetD;
540
541             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
542
543             /* Inner loop uses 257 flops */
544         }
545
546         if(jidx<j_index_end)
547         {
548
549             /* Get j neighbor index, and coordinate index */
550             jnrlistA         = jjnr[jidx];
551             jnrlistB         = jjnr[jidx+1];
552             jnrlistC         = jjnr[jidx+2];
553             jnrlistD         = jjnr[jidx+3];
554             /* Sign of each element will be negative for non-real atoms.
555              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
556              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
557              */
558             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
559
560             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
561             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
562             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
563
564             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
565             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
566             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
567             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
568             j_coord_offsetA  = DIM*jnrA;
569             j_coord_offsetB  = DIM*jnrB;
570             j_coord_offsetC  = DIM*jnrC;
571             j_coord_offsetD  = DIM*jnrD;
572
573             /* load j atom coordinates */
574             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
575                                                  x+j_coord_offsetC,x+j_coord_offsetD,
576                                                  &jx0,&jy0,&jz0);
577
578             /* Calculate displacement vector */
579             dx00             = _mm256_sub_pd(ix0,jx0);
580             dy00             = _mm256_sub_pd(iy0,jy0);
581             dz00             = _mm256_sub_pd(iz0,jz0);
582             dx10             = _mm256_sub_pd(ix1,jx0);
583             dy10             = _mm256_sub_pd(iy1,jy0);
584             dz10             = _mm256_sub_pd(iz1,jz0);
585             dx20             = _mm256_sub_pd(ix2,jx0);
586             dy20             = _mm256_sub_pd(iy2,jy0);
587             dz20             = _mm256_sub_pd(iz2,jz0);
588             dx30             = _mm256_sub_pd(ix3,jx0);
589             dy30             = _mm256_sub_pd(iy3,jy0);
590             dz30             = _mm256_sub_pd(iz3,jz0);
591
592             /* Calculate squared distance and things based on it */
593             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
594             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
595             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
596             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
597
598             rinv00           = avx256_invsqrt_d(rsq00);
599             rinv10           = avx256_invsqrt_d(rsq10);
600             rinv20           = avx256_invsqrt_d(rsq20);
601             rinv30           = avx256_invsqrt_d(rsq30);
602
603             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
604             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
605             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
606             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
607
608             /* Load parameters for j particles */
609             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
610                                                                  charge+jnrC+0,charge+jnrD+0);
611             vdwjidx0A        = 2*vdwtype[jnrA+0];
612             vdwjidx0B        = 2*vdwtype[jnrB+0];
613             vdwjidx0C        = 2*vdwtype[jnrC+0];
614             vdwjidx0D        = 2*vdwtype[jnrD+0];
615
616             fjx0             = _mm256_setzero_pd();
617             fjy0             = _mm256_setzero_pd();
618             fjz0             = _mm256_setzero_pd();
619
620             /**************************
621              * CALCULATE INTERACTIONS *
622              **************************/
623
624             if (gmx_mm256_any_lt(rsq00,rcutoff2))
625             {
626
627             r00              = _mm256_mul_pd(rsq00,rinv00);
628             r00              = _mm256_andnot_pd(dummy_mask,r00);
629
630             /* Compute parameters for interactions between i and j atoms */
631             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
632                                             vdwioffsetptr0+vdwjidx0B,
633                                             vdwioffsetptr0+vdwjidx0C,
634                                             vdwioffsetptr0+vdwjidx0D,
635                                             &c6_00,&c12_00);
636
637             /* LENNARD-JONES DISPERSION/REPULSION */
638
639             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
640             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
641             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
642             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
643             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
644
645             d                = _mm256_sub_pd(r00,rswitch);
646             d                = _mm256_max_pd(d,_mm256_setzero_pd());
647             d2               = _mm256_mul_pd(d,d);
648             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
649
650             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
651
652             /* Evaluate switch function */
653             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
654             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
655             vvdw             = _mm256_mul_pd(vvdw,sw);
656             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
657
658             /* Update potential sum for this i atom from the interaction with this j atom. */
659             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
660             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
661             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
662
663             fscal            = fvdw;
664
665             fscal            = _mm256_and_pd(fscal,cutoff_mask);
666
667             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
668
669             /* Calculate temporary vectorial force */
670             tx               = _mm256_mul_pd(fscal,dx00);
671             ty               = _mm256_mul_pd(fscal,dy00);
672             tz               = _mm256_mul_pd(fscal,dz00);
673
674             /* Update vectorial force */
675             fix0             = _mm256_add_pd(fix0,tx);
676             fiy0             = _mm256_add_pd(fiy0,ty);
677             fiz0             = _mm256_add_pd(fiz0,tz);
678
679             fjx0             = _mm256_add_pd(fjx0,tx);
680             fjy0             = _mm256_add_pd(fjy0,ty);
681             fjz0             = _mm256_add_pd(fjz0,tz);
682
683             }
684
685             /**************************
686              * CALCULATE INTERACTIONS *
687              **************************/
688
689             if (gmx_mm256_any_lt(rsq10,rcutoff2))
690             {
691
692             r10              = _mm256_mul_pd(rsq10,rinv10);
693             r10              = _mm256_andnot_pd(dummy_mask,r10);
694
695             /* Compute parameters for interactions between i and j atoms */
696             qq10             = _mm256_mul_pd(iq1,jq0);
697
698             /* EWALD ELECTROSTATICS */
699
700             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
701             ewrt             = _mm256_mul_pd(r10,ewtabscale);
702             ewitab           = _mm256_cvttpd_epi32(ewrt);
703             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
704             ewitab           = _mm_slli_epi32(ewitab,2);
705             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
706             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
707             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
708             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
709             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
710             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
711             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
712             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
713             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
714
715             d                = _mm256_sub_pd(r10,rswitch);
716             d                = _mm256_max_pd(d,_mm256_setzero_pd());
717             d2               = _mm256_mul_pd(d,d);
718             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
719
720             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
721
722             /* Evaluate switch function */
723             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
724             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
725             velec            = _mm256_mul_pd(velec,sw);
726             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
727
728             /* Update potential sum for this i atom from the interaction with this j atom. */
729             velec            = _mm256_and_pd(velec,cutoff_mask);
730             velec            = _mm256_andnot_pd(dummy_mask,velec);
731             velecsum         = _mm256_add_pd(velecsum,velec);
732
733             fscal            = felec;
734
735             fscal            = _mm256_and_pd(fscal,cutoff_mask);
736
737             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
738
739             /* Calculate temporary vectorial force */
740             tx               = _mm256_mul_pd(fscal,dx10);
741             ty               = _mm256_mul_pd(fscal,dy10);
742             tz               = _mm256_mul_pd(fscal,dz10);
743
744             /* Update vectorial force */
745             fix1             = _mm256_add_pd(fix1,tx);
746             fiy1             = _mm256_add_pd(fiy1,ty);
747             fiz1             = _mm256_add_pd(fiz1,tz);
748
749             fjx0             = _mm256_add_pd(fjx0,tx);
750             fjy0             = _mm256_add_pd(fjy0,ty);
751             fjz0             = _mm256_add_pd(fjz0,tz);
752
753             }
754
755             /**************************
756              * CALCULATE INTERACTIONS *
757              **************************/
758
759             if (gmx_mm256_any_lt(rsq20,rcutoff2))
760             {
761
762             r20              = _mm256_mul_pd(rsq20,rinv20);
763             r20              = _mm256_andnot_pd(dummy_mask,r20);
764
765             /* Compute parameters for interactions between i and j atoms */
766             qq20             = _mm256_mul_pd(iq2,jq0);
767
768             /* EWALD ELECTROSTATICS */
769
770             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
771             ewrt             = _mm256_mul_pd(r20,ewtabscale);
772             ewitab           = _mm256_cvttpd_epi32(ewrt);
773             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
774             ewitab           = _mm_slli_epi32(ewitab,2);
775             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
776             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
777             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
778             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
779             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
780             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
781             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
782             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
783             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
784
785             d                = _mm256_sub_pd(r20,rswitch);
786             d                = _mm256_max_pd(d,_mm256_setzero_pd());
787             d2               = _mm256_mul_pd(d,d);
788             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
789
790             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
791
792             /* Evaluate switch function */
793             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
794             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
795             velec            = _mm256_mul_pd(velec,sw);
796             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
797
798             /* Update potential sum for this i atom from the interaction with this j atom. */
799             velec            = _mm256_and_pd(velec,cutoff_mask);
800             velec            = _mm256_andnot_pd(dummy_mask,velec);
801             velecsum         = _mm256_add_pd(velecsum,velec);
802
803             fscal            = felec;
804
805             fscal            = _mm256_and_pd(fscal,cutoff_mask);
806
807             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
808
809             /* Calculate temporary vectorial force */
810             tx               = _mm256_mul_pd(fscal,dx20);
811             ty               = _mm256_mul_pd(fscal,dy20);
812             tz               = _mm256_mul_pd(fscal,dz20);
813
814             /* Update vectorial force */
815             fix2             = _mm256_add_pd(fix2,tx);
816             fiy2             = _mm256_add_pd(fiy2,ty);
817             fiz2             = _mm256_add_pd(fiz2,tz);
818
819             fjx0             = _mm256_add_pd(fjx0,tx);
820             fjy0             = _mm256_add_pd(fjy0,ty);
821             fjz0             = _mm256_add_pd(fjz0,tz);
822
823             }
824
825             /**************************
826              * CALCULATE INTERACTIONS *
827              **************************/
828
829             if (gmx_mm256_any_lt(rsq30,rcutoff2))
830             {
831
832             r30              = _mm256_mul_pd(rsq30,rinv30);
833             r30              = _mm256_andnot_pd(dummy_mask,r30);
834
835             /* Compute parameters for interactions between i and j atoms */
836             qq30             = _mm256_mul_pd(iq3,jq0);
837
838             /* EWALD ELECTROSTATICS */
839
840             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
841             ewrt             = _mm256_mul_pd(r30,ewtabscale);
842             ewitab           = _mm256_cvttpd_epi32(ewrt);
843             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
844             ewitab           = _mm_slli_epi32(ewitab,2);
845             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
846             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
847             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
848             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
849             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
850             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
851             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
852             velec            = _mm256_mul_pd(qq30,_mm256_sub_pd(rinv30,velec));
853             felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
854
855             d                = _mm256_sub_pd(r30,rswitch);
856             d                = _mm256_max_pd(d,_mm256_setzero_pd());
857             d2               = _mm256_mul_pd(d,d);
858             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
859
860             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
861
862             /* Evaluate switch function */
863             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
864             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv30,_mm256_mul_pd(velec,dsw)) );
865             velec            = _mm256_mul_pd(velec,sw);
866             cutoff_mask      = _mm256_cmp_pd(rsq30,rcutoff2,_CMP_LT_OQ);
867
868             /* Update potential sum for this i atom from the interaction with this j atom. */
869             velec            = _mm256_and_pd(velec,cutoff_mask);
870             velec            = _mm256_andnot_pd(dummy_mask,velec);
871             velecsum         = _mm256_add_pd(velecsum,velec);
872
873             fscal            = felec;
874
875             fscal            = _mm256_and_pd(fscal,cutoff_mask);
876
877             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
878
879             /* Calculate temporary vectorial force */
880             tx               = _mm256_mul_pd(fscal,dx30);
881             ty               = _mm256_mul_pd(fscal,dy30);
882             tz               = _mm256_mul_pd(fscal,dz30);
883
884             /* Update vectorial force */
885             fix3             = _mm256_add_pd(fix3,tx);
886             fiy3             = _mm256_add_pd(fiy3,ty);
887             fiz3             = _mm256_add_pd(fiz3,tz);
888
889             fjx0             = _mm256_add_pd(fjx0,tx);
890             fjy0             = _mm256_add_pd(fjy0,ty);
891             fjz0             = _mm256_add_pd(fjz0,tz);
892
893             }
894
895             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
896             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
897             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
898             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
899
900             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
901
902             /* Inner loop uses 261 flops */
903         }
904
905         /* End of innermost loop */
906
907         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
908                                                  f+i_coord_offset,fshift+i_shift_offset);
909
910         ggid                        = gid[iidx];
911         /* Update potential energies */
912         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
913         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
914
915         /* Increment number of inner iterations */
916         inneriter                  += j_index_end - j_index_start;
917
918         /* Outer loop uses 26 flops */
919     }
920
921     /* Increment number of outer iterations */
922     outeriter        += nri;
923
924     /* Update outer/inner flops */
925
926     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*261);
927 }
928 /*
929  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_256_double
930  * Electrostatics interaction: Ewald
931  * VdW interaction:            LennardJones
932  * Geometry:                   Water4-Particle
933  * Calculate force/pot:        Force
934  */
935 void
936 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_256_double
937                     (t_nblist                    * gmx_restrict       nlist,
938                      rvec                        * gmx_restrict          xx,
939                      rvec                        * gmx_restrict          ff,
940                      struct t_forcerec           * gmx_restrict          fr,
941                      t_mdatoms                   * gmx_restrict     mdatoms,
942                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
943                      t_nrnb                      * gmx_restrict        nrnb)
944 {
945     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
946      * just 0 for non-waters.
947      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
948      * jnr indices corresponding to data put in the four positions in the SIMD register.
949      */
950     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
951     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
952     int              jnrA,jnrB,jnrC,jnrD;
953     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
954     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
955     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
956     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
957     real             rcutoff_scalar;
958     real             *shiftvec,*fshift,*x,*f;
959     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
960     real             scratch[4*DIM];
961     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
962     real *           vdwioffsetptr0;
963     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
964     real *           vdwioffsetptr1;
965     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
966     real *           vdwioffsetptr2;
967     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
968     real *           vdwioffsetptr3;
969     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
970     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
971     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
972     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
973     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
974     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
975     __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
976     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
977     real             *charge;
978     int              nvdwtype;
979     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
980     int              *vdwtype;
981     real             *vdwparam;
982     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
983     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
984     __m128i          ewitab;
985     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
986     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
987     real             *ewtab;
988     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
989     real             rswitch_scalar,d_scalar;
990     __m256d          dummy_mask,cutoff_mask;
991     __m128           tmpmask0,tmpmask1;
992     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
993     __m256d          one     = _mm256_set1_pd(1.0);
994     __m256d          two     = _mm256_set1_pd(2.0);
995     x                = xx[0];
996     f                = ff[0];
997
998     nri              = nlist->nri;
999     iinr             = nlist->iinr;
1000     jindex           = nlist->jindex;
1001     jjnr             = nlist->jjnr;
1002     shiftidx         = nlist->shift;
1003     gid              = nlist->gid;
1004     shiftvec         = fr->shift_vec[0];
1005     fshift           = fr->fshift[0];
1006     facel            = _mm256_set1_pd(fr->ic->epsfac);
1007     charge           = mdatoms->chargeA;
1008     nvdwtype         = fr->ntype;
1009     vdwparam         = fr->nbfp;
1010     vdwtype          = mdatoms->typeA;
1011
1012     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
1013     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1014     beta2            = _mm256_mul_pd(beta,beta);
1015     beta3            = _mm256_mul_pd(beta,beta2);
1016
1017     ewtab            = fr->ic->tabq_coul_FDV0;
1018     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
1019     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1020
1021     /* Setup water-specific parameters */
1022     inr              = nlist->iinr[0];
1023     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1024     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1025     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1026     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1027
1028     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1029     rcutoff_scalar   = fr->ic->rcoulomb;
1030     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1031     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1032
1033     rswitch_scalar   = fr->ic->rcoulomb_switch;
1034     rswitch          = _mm256_set1_pd(rswitch_scalar);
1035     /* Setup switch parameters */
1036     d_scalar         = rcutoff_scalar-rswitch_scalar;
1037     d                = _mm256_set1_pd(d_scalar);
1038     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1039     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1040     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1041     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1042     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1043     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1044
1045     /* Avoid stupid compiler warnings */
1046     jnrA = jnrB = jnrC = jnrD = 0;
1047     j_coord_offsetA = 0;
1048     j_coord_offsetB = 0;
1049     j_coord_offsetC = 0;
1050     j_coord_offsetD = 0;
1051
1052     outeriter        = 0;
1053     inneriter        = 0;
1054
1055     for(iidx=0;iidx<4*DIM;iidx++)
1056     {
1057         scratch[iidx] = 0.0;
1058     }
1059
1060     /* Start outer loop over neighborlists */
1061     for(iidx=0; iidx<nri; iidx++)
1062     {
1063         /* Load shift vector for this list */
1064         i_shift_offset   = DIM*shiftidx[iidx];
1065
1066         /* Load limits for loop over neighbors */
1067         j_index_start    = jindex[iidx];
1068         j_index_end      = jindex[iidx+1];
1069
1070         /* Get outer coordinate index */
1071         inr              = iinr[iidx];
1072         i_coord_offset   = DIM*inr;
1073
1074         /* Load i particle coords and add shift vector */
1075         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1076                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1077
1078         fix0             = _mm256_setzero_pd();
1079         fiy0             = _mm256_setzero_pd();
1080         fiz0             = _mm256_setzero_pd();
1081         fix1             = _mm256_setzero_pd();
1082         fiy1             = _mm256_setzero_pd();
1083         fiz1             = _mm256_setzero_pd();
1084         fix2             = _mm256_setzero_pd();
1085         fiy2             = _mm256_setzero_pd();
1086         fiz2             = _mm256_setzero_pd();
1087         fix3             = _mm256_setzero_pd();
1088         fiy3             = _mm256_setzero_pd();
1089         fiz3             = _mm256_setzero_pd();
1090
1091         /* Start inner kernel loop */
1092         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1093         {
1094
1095             /* Get j neighbor index, and coordinate index */
1096             jnrA             = jjnr[jidx];
1097             jnrB             = jjnr[jidx+1];
1098             jnrC             = jjnr[jidx+2];
1099             jnrD             = jjnr[jidx+3];
1100             j_coord_offsetA  = DIM*jnrA;
1101             j_coord_offsetB  = DIM*jnrB;
1102             j_coord_offsetC  = DIM*jnrC;
1103             j_coord_offsetD  = DIM*jnrD;
1104
1105             /* load j atom coordinates */
1106             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1107                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1108                                                  &jx0,&jy0,&jz0);
1109
1110             /* Calculate displacement vector */
1111             dx00             = _mm256_sub_pd(ix0,jx0);
1112             dy00             = _mm256_sub_pd(iy0,jy0);
1113             dz00             = _mm256_sub_pd(iz0,jz0);
1114             dx10             = _mm256_sub_pd(ix1,jx0);
1115             dy10             = _mm256_sub_pd(iy1,jy0);
1116             dz10             = _mm256_sub_pd(iz1,jz0);
1117             dx20             = _mm256_sub_pd(ix2,jx0);
1118             dy20             = _mm256_sub_pd(iy2,jy0);
1119             dz20             = _mm256_sub_pd(iz2,jz0);
1120             dx30             = _mm256_sub_pd(ix3,jx0);
1121             dy30             = _mm256_sub_pd(iy3,jy0);
1122             dz30             = _mm256_sub_pd(iz3,jz0);
1123
1124             /* Calculate squared distance and things based on it */
1125             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1126             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1127             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1128             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
1129
1130             rinv00           = avx256_invsqrt_d(rsq00);
1131             rinv10           = avx256_invsqrt_d(rsq10);
1132             rinv20           = avx256_invsqrt_d(rsq20);
1133             rinv30           = avx256_invsqrt_d(rsq30);
1134
1135             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1136             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1137             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1138             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
1139
1140             /* Load parameters for j particles */
1141             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1142                                                                  charge+jnrC+0,charge+jnrD+0);
1143             vdwjidx0A        = 2*vdwtype[jnrA+0];
1144             vdwjidx0B        = 2*vdwtype[jnrB+0];
1145             vdwjidx0C        = 2*vdwtype[jnrC+0];
1146             vdwjidx0D        = 2*vdwtype[jnrD+0];
1147
1148             fjx0             = _mm256_setzero_pd();
1149             fjy0             = _mm256_setzero_pd();
1150             fjz0             = _mm256_setzero_pd();
1151
1152             /**************************
1153              * CALCULATE INTERACTIONS *
1154              **************************/
1155
1156             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1157             {
1158
1159             r00              = _mm256_mul_pd(rsq00,rinv00);
1160
1161             /* Compute parameters for interactions between i and j atoms */
1162             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1163                                             vdwioffsetptr0+vdwjidx0B,
1164                                             vdwioffsetptr0+vdwjidx0C,
1165                                             vdwioffsetptr0+vdwjidx0D,
1166                                             &c6_00,&c12_00);
1167
1168             /* LENNARD-JONES DISPERSION/REPULSION */
1169
1170             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1171             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
1172             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1173             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1174             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1175
1176             d                = _mm256_sub_pd(r00,rswitch);
1177             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1178             d2               = _mm256_mul_pd(d,d);
1179             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1180
1181             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1182
1183             /* Evaluate switch function */
1184             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1185             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1186             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1187
1188             fscal            = fvdw;
1189
1190             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1191
1192             /* Calculate temporary vectorial force */
1193             tx               = _mm256_mul_pd(fscal,dx00);
1194             ty               = _mm256_mul_pd(fscal,dy00);
1195             tz               = _mm256_mul_pd(fscal,dz00);
1196
1197             /* Update vectorial force */
1198             fix0             = _mm256_add_pd(fix0,tx);
1199             fiy0             = _mm256_add_pd(fiy0,ty);
1200             fiz0             = _mm256_add_pd(fiz0,tz);
1201
1202             fjx0             = _mm256_add_pd(fjx0,tx);
1203             fjy0             = _mm256_add_pd(fjy0,ty);
1204             fjz0             = _mm256_add_pd(fjz0,tz);
1205
1206             }
1207
1208             /**************************
1209              * CALCULATE INTERACTIONS *
1210              **************************/
1211
1212             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1213             {
1214
1215             r10              = _mm256_mul_pd(rsq10,rinv10);
1216
1217             /* Compute parameters for interactions between i and j atoms */
1218             qq10             = _mm256_mul_pd(iq1,jq0);
1219
1220             /* EWALD ELECTROSTATICS */
1221
1222             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1223             ewrt             = _mm256_mul_pd(r10,ewtabscale);
1224             ewitab           = _mm256_cvttpd_epi32(ewrt);
1225             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1226             ewitab           = _mm_slli_epi32(ewitab,2);
1227             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1228             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1229             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1230             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1231             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1232             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1233             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1234             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
1235             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1236
1237             d                = _mm256_sub_pd(r10,rswitch);
1238             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1239             d2               = _mm256_mul_pd(d,d);
1240             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1241
1242             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1243
1244             /* Evaluate switch function */
1245             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1246             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
1247             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1248
1249             fscal            = felec;
1250
1251             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1252
1253             /* Calculate temporary vectorial force */
1254             tx               = _mm256_mul_pd(fscal,dx10);
1255             ty               = _mm256_mul_pd(fscal,dy10);
1256             tz               = _mm256_mul_pd(fscal,dz10);
1257
1258             /* Update vectorial force */
1259             fix1             = _mm256_add_pd(fix1,tx);
1260             fiy1             = _mm256_add_pd(fiy1,ty);
1261             fiz1             = _mm256_add_pd(fiz1,tz);
1262
1263             fjx0             = _mm256_add_pd(fjx0,tx);
1264             fjy0             = _mm256_add_pd(fjy0,ty);
1265             fjz0             = _mm256_add_pd(fjz0,tz);
1266
1267             }
1268
1269             /**************************
1270              * CALCULATE INTERACTIONS *
1271              **************************/
1272
1273             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1274             {
1275
1276             r20              = _mm256_mul_pd(rsq20,rinv20);
1277
1278             /* Compute parameters for interactions between i and j atoms */
1279             qq20             = _mm256_mul_pd(iq2,jq0);
1280
1281             /* EWALD ELECTROSTATICS */
1282
1283             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1284             ewrt             = _mm256_mul_pd(r20,ewtabscale);
1285             ewitab           = _mm256_cvttpd_epi32(ewrt);
1286             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1287             ewitab           = _mm_slli_epi32(ewitab,2);
1288             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1289             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1290             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1291             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1292             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1293             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1294             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1295             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1296             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1297
1298             d                = _mm256_sub_pd(r20,rswitch);
1299             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1300             d2               = _mm256_mul_pd(d,d);
1301             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1302
1303             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1304
1305             /* Evaluate switch function */
1306             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1307             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
1308             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1309
1310             fscal            = felec;
1311
1312             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1313
1314             /* Calculate temporary vectorial force */
1315             tx               = _mm256_mul_pd(fscal,dx20);
1316             ty               = _mm256_mul_pd(fscal,dy20);
1317             tz               = _mm256_mul_pd(fscal,dz20);
1318
1319             /* Update vectorial force */
1320             fix2             = _mm256_add_pd(fix2,tx);
1321             fiy2             = _mm256_add_pd(fiy2,ty);
1322             fiz2             = _mm256_add_pd(fiz2,tz);
1323
1324             fjx0             = _mm256_add_pd(fjx0,tx);
1325             fjy0             = _mm256_add_pd(fjy0,ty);
1326             fjz0             = _mm256_add_pd(fjz0,tz);
1327
1328             }
1329
1330             /**************************
1331              * CALCULATE INTERACTIONS *
1332              **************************/
1333
1334             if (gmx_mm256_any_lt(rsq30,rcutoff2))
1335             {
1336
1337             r30              = _mm256_mul_pd(rsq30,rinv30);
1338
1339             /* Compute parameters for interactions between i and j atoms */
1340             qq30             = _mm256_mul_pd(iq3,jq0);
1341
1342             /* EWALD ELECTROSTATICS */
1343
1344             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1345             ewrt             = _mm256_mul_pd(r30,ewtabscale);
1346             ewitab           = _mm256_cvttpd_epi32(ewrt);
1347             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1348             ewitab           = _mm_slli_epi32(ewitab,2);
1349             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1350             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1351             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1352             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1353             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1354             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1355             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1356             velec            = _mm256_mul_pd(qq30,_mm256_sub_pd(rinv30,velec));
1357             felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
1358
1359             d                = _mm256_sub_pd(r30,rswitch);
1360             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1361             d2               = _mm256_mul_pd(d,d);
1362             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1363
1364             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1365
1366             /* Evaluate switch function */
1367             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1368             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv30,_mm256_mul_pd(velec,dsw)) );
1369             cutoff_mask      = _mm256_cmp_pd(rsq30,rcutoff2,_CMP_LT_OQ);
1370
1371             fscal            = felec;
1372
1373             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1374
1375             /* Calculate temporary vectorial force */
1376             tx               = _mm256_mul_pd(fscal,dx30);
1377             ty               = _mm256_mul_pd(fscal,dy30);
1378             tz               = _mm256_mul_pd(fscal,dz30);
1379
1380             /* Update vectorial force */
1381             fix3             = _mm256_add_pd(fix3,tx);
1382             fiy3             = _mm256_add_pd(fiy3,ty);
1383             fiz3             = _mm256_add_pd(fiz3,tz);
1384
1385             fjx0             = _mm256_add_pd(fjx0,tx);
1386             fjy0             = _mm256_add_pd(fjy0,ty);
1387             fjz0             = _mm256_add_pd(fjz0,tz);
1388
1389             }
1390
1391             fjptrA             = f+j_coord_offsetA;
1392             fjptrB             = f+j_coord_offsetB;
1393             fjptrC             = f+j_coord_offsetC;
1394             fjptrD             = f+j_coord_offsetD;
1395
1396             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1397
1398             /* Inner loop uses 245 flops */
1399         }
1400
1401         if(jidx<j_index_end)
1402         {
1403
1404             /* Get j neighbor index, and coordinate index */
1405             jnrlistA         = jjnr[jidx];
1406             jnrlistB         = jjnr[jidx+1];
1407             jnrlistC         = jjnr[jidx+2];
1408             jnrlistD         = jjnr[jidx+3];
1409             /* Sign of each element will be negative for non-real atoms.
1410              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1411              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1412              */
1413             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1414
1415             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1416             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1417             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1418
1419             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1420             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1421             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1422             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1423             j_coord_offsetA  = DIM*jnrA;
1424             j_coord_offsetB  = DIM*jnrB;
1425             j_coord_offsetC  = DIM*jnrC;
1426             j_coord_offsetD  = DIM*jnrD;
1427
1428             /* load j atom coordinates */
1429             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1430                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1431                                                  &jx0,&jy0,&jz0);
1432
1433             /* Calculate displacement vector */
1434             dx00             = _mm256_sub_pd(ix0,jx0);
1435             dy00             = _mm256_sub_pd(iy0,jy0);
1436             dz00             = _mm256_sub_pd(iz0,jz0);
1437             dx10             = _mm256_sub_pd(ix1,jx0);
1438             dy10             = _mm256_sub_pd(iy1,jy0);
1439             dz10             = _mm256_sub_pd(iz1,jz0);
1440             dx20             = _mm256_sub_pd(ix2,jx0);
1441             dy20             = _mm256_sub_pd(iy2,jy0);
1442             dz20             = _mm256_sub_pd(iz2,jz0);
1443             dx30             = _mm256_sub_pd(ix3,jx0);
1444             dy30             = _mm256_sub_pd(iy3,jy0);
1445             dz30             = _mm256_sub_pd(iz3,jz0);
1446
1447             /* Calculate squared distance and things based on it */
1448             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1449             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1450             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1451             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
1452
1453             rinv00           = avx256_invsqrt_d(rsq00);
1454             rinv10           = avx256_invsqrt_d(rsq10);
1455             rinv20           = avx256_invsqrt_d(rsq20);
1456             rinv30           = avx256_invsqrt_d(rsq30);
1457
1458             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1459             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1460             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1461             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
1462
1463             /* Load parameters for j particles */
1464             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1465                                                                  charge+jnrC+0,charge+jnrD+0);
1466             vdwjidx0A        = 2*vdwtype[jnrA+0];
1467             vdwjidx0B        = 2*vdwtype[jnrB+0];
1468             vdwjidx0C        = 2*vdwtype[jnrC+0];
1469             vdwjidx0D        = 2*vdwtype[jnrD+0];
1470
1471             fjx0             = _mm256_setzero_pd();
1472             fjy0             = _mm256_setzero_pd();
1473             fjz0             = _mm256_setzero_pd();
1474
1475             /**************************
1476              * CALCULATE INTERACTIONS *
1477              **************************/
1478
1479             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1480             {
1481
1482             r00              = _mm256_mul_pd(rsq00,rinv00);
1483             r00              = _mm256_andnot_pd(dummy_mask,r00);
1484
1485             /* Compute parameters for interactions between i and j atoms */
1486             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1487                                             vdwioffsetptr0+vdwjidx0B,
1488                                             vdwioffsetptr0+vdwjidx0C,
1489                                             vdwioffsetptr0+vdwjidx0D,
1490                                             &c6_00,&c12_00);
1491
1492             /* LENNARD-JONES DISPERSION/REPULSION */
1493
1494             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1495             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
1496             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1497             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1498             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1499
1500             d                = _mm256_sub_pd(r00,rswitch);
1501             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1502             d2               = _mm256_mul_pd(d,d);
1503             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1504
1505             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1506
1507             /* Evaluate switch function */
1508             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1509             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1510             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1511
1512             fscal            = fvdw;
1513
1514             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1515
1516             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1517
1518             /* Calculate temporary vectorial force */
1519             tx               = _mm256_mul_pd(fscal,dx00);
1520             ty               = _mm256_mul_pd(fscal,dy00);
1521             tz               = _mm256_mul_pd(fscal,dz00);
1522
1523             /* Update vectorial force */
1524             fix0             = _mm256_add_pd(fix0,tx);
1525             fiy0             = _mm256_add_pd(fiy0,ty);
1526             fiz0             = _mm256_add_pd(fiz0,tz);
1527
1528             fjx0             = _mm256_add_pd(fjx0,tx);
1529             fjy0             = _mm256_add_pd(fjy0,ty);
1530             fjz0             = _mm256_add_pd(fjz0,tz);
1531
1532             }
1533
1534             /**************************
1535              * CALCULATE INTERACTIONS *
1536              **************************/
1537
1538             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1539             {
1540
1541             r10              = _mm256_mul_pd(rsq10,rinv10);
1542             r10              = _mm256_andnot_pd(dummy_mask,r10);
1543
1544             /* Compute parameters for interactions between i and j atoms */
1545             qq10             = _mm256_mul_pd(iq1,jq0);
1546
1547             /* EWALD ELECTROSTATICS */
1548
1549             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1550             ewrt             = _mm256_mul_pd(r10,ewtabscale);
1551             ewitab           = _mm256_cvttpd_epi32(ewrt);
1552             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1553             ewitab           = _mm_slli_epi32(ewitab,2);
1554             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1555             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1556             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1557             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1558             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1559             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1560             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1561             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
1562             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1563
1564             d                = _mm256_sub_pd(r10,rswitch);
1565             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1566             d2               = _mm256_mul_pd(d,d);
1567             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1568
1569             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1570
1571             /* Evaluate switch function */
1572             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1573             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
1574             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1575
1576             fscal            = felec;
1577
1578             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1579
1580             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1581
1582             /* Calculate temporary vectorial force */
1583             tx               = _mm256_mul_pd(fscal,dx10);
1584             ty               = _mm256_mul_pd(fscal,dy10);
1585             tz               = _mm256_mul_pd(fscal,dz10);
1586
1587             /* Update vectorial force */
1588             fix1             = _mm256_add_pd(fix1,tx);
1589             fiy1             = _mm256_add_pd(fiy1,ty);
1590             fiz1             = _mm256_add_pd(fiz1,tz);
1591
1592             fjx0             = _mm256_add_pd(fjx0,tx);
1593             fjy0             = _mm256_add_pd(fjy0,ty);
1594             fjz0             = _mm256_add_pd(fjz0,tz);
1595
1596             }
1597
1598             /**************************
1599              * CALCULATE INTERACTIONS *
1600              **************************/
1601
1602             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1603             {
1604
1605             r20              = _mm256_mul_pd(rsq20,rinv20);
1606             r20              = _mm256_andnot_pd(dummy_mask,r20);
1607
1608             /* Compute parameters for interactions between i and j atoms */
1609             qq20             = _mm256_mul_pd(iq2,jq0);
1610
1611             /* EWALD ELECTROSTATICS */
1612
1613             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1614             ewrt             = _mm256_mul_pd(r20,ewtabscale);
1615             ewitab           = _mm256_cvttpd_epi32(ewrt);
1616             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1617             ewitab           = _mm_slli_epi32(ewitab,2);
1618             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1619             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1620             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1621             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1622             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1623             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1624             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1625             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1626             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1627
1628             d                = _mm256_sub_pd(r20,rswitch);
1629             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1630             d2               = _mm256_mul_pd(d,d);
1631             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1632
1633             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1634
1635             /* Evaluate switch function */
1636             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1637             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
1638             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1639
1640             fscal            = felec;
1641
1642             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1643
1644             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1645
1646             /* Calculate temporary vectorial force */
1647             tx               = _mm256_mul_pd(fscal,dx20);
1648             ty               = _mm256_mul_pd(fscal,dy20);
1649             tz               = _mm256_mul_pd(fscal,dz20);
1650
1651             /* Update vectorial force */
1652             fix2             = _mm256_add_pd(fix2,tx);
1653             fiy2             = _mm256_add_pd(fiy2,ty);
1654             fiz2             = _mm256_add_pd(fiz2,tz);
1655
1656             fjx0             = _mm256_add_pd(fjx0,tx);
1657             fjy0             = _mm256_add_pd(fjy0,ty);
1658             fjz0             = _mm256_add_pd(fjz0,tz);
1659
1660             }
1661
1662             /**************************
1663              * CALCULATE INTERACTIONS *
1664              **************************/
1665
1666             if (gmx_mm256_any_lt(rsq30,rcutoff2))
1667             {
1668
1669             r30              = _mm256_mul_pd(rsq30,rinv30);
1670             r30              = _mm256_andnot_pd(dummy_mask,r30);
1671
1672             /* Compute parameters for interactions between i and j atoms */
1673             qq30             = _mm256_mul_pd(iq3,jq0);
1674
1675             /* EWALD ELECTROSTATICS */
1676
1677             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1678             ewrt             = _mm256_mul_pd(r30,ewtabscale);
1679             ewitab           = _mm256_cvttpd_epi32(ewrt);
1680             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1681             ewitab           = _mm_slli_epi32(ewitab,2);
1682             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1683             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1684             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1685             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1686             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1687             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1688             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1689             velec            = _mm256_mul_pd(qq30,_mm256_sub_pd(rinv30,velec));
1690             felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
1691
1692             d                = _mm256_sub_pd(r30,rswitch);
1693             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1694             d2               = _mm256_mul_pd(d,d);
1695             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1696
1697             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1698
1699             /* Evaluate switch function */
1700             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1701             felec            = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv30,_mm256_mul_pd(velec,dsw)) );
1702             cutoff_mask      = _mm256_cmp_pd(rsq30,rcutoff2,_CMP_LT_OQ);
1703
1704             fscal            = felec;
1705
1706             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1707
1708             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1709
1710             /* Calculate temporary vectorial force */
1711             tx               = _mm256_mul_pd(fscal,dx30);
1712             ty               = _mm256_mul_pd(fscal,dy30);
1713             tz               = _mm256_mul_pd(fscal,dz30);
1714
1715             /* Update vectorial force */
1716             fix3             = _mm256_add_pd(fix3,tx);
1717             fiy3             = _mm256_add_pd(fiy3,ty);
1718             fiz3             = _mm256_add_pd(fiz3,tz);
1719
1720             fjx0             = _mm256_add_pd(fjx0,tx);
1721             fjy0             = _mm256_add_pd(fjy0,ty);
1722             fjz0             = _mm256_add_pd(fjz0,tz);
1723
1724             }
1725
1726             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1727             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1728             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1729             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1730
1731             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1732
1733             /* Inner loop uses 249 flops */
1734         }
1735
1736         /* End of innermost loop */
1737
1738         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1739                                                  f+i_coord_offset,fshift+i_shift_offset);
1740
1741         /* Increment number of inner iterations */
1742         inneriter                  += j_index_end - j_index_start;
1743
1744         /* Outer loop uses 24 flops */
1745     }
1746
1747     /* Increment number of outer iterations */
1748     outeriter        += nri;
1749
1750     /* Update outer/inner flops */
1751
1752     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*249);
1753 }