110634a894fd67429bc11f324a3e516fab2e4349
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEw_VdwCSTab_GeomW3P1_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_256_double
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            CubicSplineTable
54  * Geometry:                   Water3-Particle
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_256_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82     real             scratch[4*DIM];
83     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84     real *           vdwioffsetptr0;
85     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86     real *           vdwioffsetptr1;
87     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88     real *           vdwioffsetptr2;
89     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
94     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
95     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
96     real             *charge;
97     int              nvdwtype;
98     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99     int              *vdwtype;
100     real             *vdwparam;
101     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
102     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
103     __m128i          vfitab;
104     __m128i          ifour       = _mm_set1_epi32(4);
105     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
106     real             *vftab;
107     __m128i          ewitab;
108     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
109     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
110     real             *ewtab;
111     __m256d          dummy_mask,cutoff_mask;
112     __m128           tmpmask0,tmpmask1;
113     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
114     __m256d          one     = _mm256_set1_pd(1.0);
115     __m256d          two     = _mm256_set1_pd(2.0);
116     x                = xx[0];
117     f                = ff[0];
118
119     nri              = nlist->nri;
120     iinr             = nlist->iinr;
121     jindex           = nlist->jindex;
122     jjnr             = nlist->jjnr;
123     shiftidx         = nlist->shift;
124     gid              = nlist->gid;
125     shiftvec         = fr->shift_vec[0];
126     fshift           = fr->fshift[0];
127     facel            = _mm256_set1_pd(fr->epsfac);
128     charge           = mdatoms->chargeA;
129     nvdwtype         = fr->ntype;
130     vdwparam         = fr->nbfp;
131     vdwtype          = mdatoms->typeA;
132
133     vftab            = kernel_data->table_vdw->data;
134     vftabscale       = _mm256_set1_pd(kernel_data->table_vdw->scale);
135
136     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
137     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
138     beta2            = _mm256_mul_pd(beta,beta);
139     beta3            = _mm256_mul_pd(beta,beta2);
140
141     ewtab            = fr->ic->tabq_coul_FDV0;
142     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
143     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
144
145     /* Setup water-specific parameters */
146     inr              = nlist->iinr[0];
147     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
148     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
149     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
150     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
151
152     /* Avoid stupid compiler warnings */
153     jnrA = jnrB = jnrC = jnrD = 0;
154     j_coord_offsetA = 0;
155     j_coord_offsetB = 0;
156     j_coord_offsetC = 0;
157     j_coord_offsetD = 0;
158
159     outeriter        = 0;
160     inneriter        = 0;
161
162     for(iidx=0;iidx<4*DIM;iidx++)
163     {
164         scratch[iidx] = 0.0;
165     }
166
167     /* Start outer loop over neighborlists */
168     for(iidx=0; iidx<nri; iidx++)
169     {
170         /* Load shift vector for this list */
171         i_shift_offset   = DIM*shiftidx[iidx];
172
173         /* Load limits for loop over neighbors */
174         j_index_start    = jindex[iidx];
175         j_index_end      = jindex[iidx+1];
176
177         /* Get outer coordinate index */
178         inr              = iinr[iidx];
179         i_coord_offset   = DIM*inr;
180
181         /* Load i particle coords and add shift vector */
182         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
183                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
184
185         fix0             = _mm256_setzero_pd();
186         fiy0             = _mm256_setzero_pd();
187         fiz0             = _mm256_setzero_pd();
188         fix1             = _mm256_setzero_pd();
189         fiy1             = _mm256_setzero_pd();
190         fiz1             = _mm256_setzero_pd();
191         fix2             = _mm256_setzero_pd();
192         fiy2             = _mm256_setzero_pd();
193         fiz2             = _mm256_setzero_pd();
194
195         /* Reset potential sums */
196         velecsum         = _mm256_setzero_pd();
197         vvdwsum          = _mm256_setzero_pd();
198
199         /* Start inner kernel loop */
200         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
201         {
202
203             /* Get j neighbor index, and coordinate index */
204             jnrA             = jjnr[jidx];
205             jnrB             = jjnr[jidx+1];
206             jnrC             = jjnr[jidx+2];
207             jnrD             = jjnr[jidx+3];
208             j_coord_offsetA  = DIM*jnrA;
209             j_coord_offsetB  = DIM*jnrB;
210             j_coord_offsetC  = DIM*jnrC;
211             j_coord_offsetD  = DIM*jnrD;
212
213             /* load j atom coordinates */
214             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
215                                                  x+j_coord_offsetC,x+j_coord_offsetD,
216                                                  &jx0,&jy0,&jz0);
217
218             /* Calculate displacement vector */
219             dx00             = _mm256_sub_pd(ix0,jx0);
220             dy00             = _mm256_sub_pd(iy0,jy0);
221             dz00             = _mm256_sub_pd(iz0,jz0);
222             dx10             = _mm256_sub_pd(ix1,jx0);
223             dy10             = _mm256_sub_pd(iy1,jy0);
224             dz10             = _mm256_sub_pd(iz1,jz0);
225             dx20             = _mm256_sub_pd(ix2,jx0);
226             dy20             = _mm256_sub_pd(iy2,jy0);
227             dz20             = _mm256_sub_pd(iz2,jz0);
228
229             /* Calculate squared distance and things based on it */
230             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
231             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
232             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
233
234             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
235             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
236             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
237
238             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
239             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
240             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
241
242             /* Load parameters for j particles */
243             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
244                                                                  charge+jnrC+0,charge+jnrD+0);
245             vdwjidx0A        = 2*vdwtype[jnrA+0];
246             vdwjidx0B        = 2*vdwtype[jnrB+0];
247             vdwjidx0C        = 2*vdwtype[jnrC+0];
248             vdwjidx0D        = 2*vdwtype[jnrD+0];
249
250             fjx0             = _mm256_setzero_pd();
251             fjy0             = _mm256_setzero_pd();
252             fjz0             = _mm256_setzero_pd();
253
254             /**************************
255              * CALCULATE INTERACTIONS *
256              **************************/
257
258             r00              = _mm256_mul_pd(rsq00,rinv00);
259
260             /* Compute parameters for interactions between i and j atoms */
261             qq00             = _mm256_mul_pd(iq0,jq0);
262             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
263                                             vdwioffsetptr0+vdwjidx0B,
264                                             vdwioffsetptr0+vdwjidx0C,
265                                             vdwioffsetptr0+vdwjidx0D,
266                                             &c6_00,&c12_00);
267
268             /* Calculate table index by multiplying r with table scale and truncate to integer */
269             rt               = _mm256_mul_pd(r00,vftabscale);
270             vfitab           = _mm256_cvttpd_epi32(rt);
271             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
272             vfitab           = _mm_slli_epi32(vfitab,3);
273
274             /* EWALD ELECTROSTATICS */
275
276             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
277             ewrt             = _mm256_mul_pd(r00,ewtabscale);
278             ewitab           = _mm256_cvttpd_epi32(ewrt);
279             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
280             ewitab           = _mm_slli_epi32(ewitab,2);
281             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
282             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
283             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
284             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
285             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
286             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
287             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
288             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
289             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
290
291             /* CUBIC SPLINE TABLE DISPERSION */
292             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
293             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
294             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
295             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
296             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
297             Heps             = _mm256_mul_pd(vfeps,H);
298             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
299             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
300             vvdw6            = _mm256_mul_pd(c6_00,VV);
301             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
302             fvdw6            = _mm256_mul_pd(c6_00,FF);
303
304             /* CUBIC SPLINE TABLE REPULSION */
305             vfitab           = _mm_add_epi32(vfitab,ifour);
306             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
307             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
308             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
309             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
310             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
311             Heps             = _mm256_mul_pd(vfeps,H);
312             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
313             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
314             vvdw12           = _mm256_mul_pd(c12_00,VV);
315             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
316             fvdw12           = _mm256_mul_pd(c12_00,FF);
317             vvdw             = _mm256_add_pd(vvdw12,vvdw6);
318             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
319
320             /* Update potential sum for this i atom from the interaction with this j atom. */
321             velecsum         = _mm256_add_pd(velecsum,velec);
322             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
323
324             fscal            = _mm256_add_pd(felec,fvdw);
325
326             /* Calculate temporary vectorial force */
327             tx               = _mm256_mul_pd(fscal,dx00);
328             ty               = _mm256_mul_pd(fscal,dy00);
329             tz               = _mm256_mul_pd(fscal,dz00);
330
331             /* Update vectorial force */
332             fix0             = _mm256_add_pd(fix0,tx);
333             fiy0             = _mm256_add_pd(fiy0,ty);
334             fiz0             = _mm256_add_pd(fiz0,tz);
335
336             fjx0             = _mm256_add_pd(fjx0,tx);
337             fjy0             = _mm256_add_pd(fjy0,ty);
338             fjz0             = _mm256_add_pd(fjz0,tz);
339
340             /**************************
341              * CALCULATE INTERACTIONS *
342              **************************/
343
344             r10              = _mm256_mul_pd(rsq10,rinv10);
345
346             /* Compute parameters for interactions between i and j atoms */
347             qq10             = _mm256_mul_pd(iq1,jq0);
348
349             /* EWALD ELECTROSTATICS */
350
351             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
352             ewrt             = _mm256_mul_pd(r10,ewtabscale);
353             ewitab           = _mm256_cvttpd_epi32(ewrt);
354             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
355             ewitab           = _mm_slli_epi32(ewitab,2);
356             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
357             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
358             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
359             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
360             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
361             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
362             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
363             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
364             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
365
366             /* Update potential sum for this i atom from the interaction with this j atom. */
367             velecsum         = _mm256_add_pd(velecsum,velec);
368
369             fscal            = felec;
370
371             /* Calculate temporary vectorial force */
372             tx               = _mm256_mul_pd(fscal,dx10);
373             ty               = _mm256_mul_pd(fscal,dy10);
374             tz               = _mm256_mul_pd(fscal,dz10);
375
376             /* Update vectorial force */
377             fix1             = _mm256_add_pd(fix1,tx);
378             fiy1             = _mm256_add_pd(fiy1,ty);
379             fiz1             = _mm256_add_pd(fiz1,tz);
380
381             fjx0             = _mm256_add_pd(fjx0,tx);
382             fjy0             = _mm256_add_pd(fjy0,ty);
383             fjz0             = _mm256_add_pd(fjz0,tz);
384
385             /**************************
386              * CALCULATE INTERACTIONS *
387              **************************/
388
389             r20              = _mm256_mul_pd(rsq20,rinv20);
390
391             /* Compute parameters for interactions between i and j atoms */
392             qq20             = _mm256_mul_pd(iq2,jq0);
393
394             /* EWALD ELECTROSTATICS */
395
396             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
397             ewrt             = _mm256_mul_pd(r20,ewtabscale);
398             ewitab           = _mm256_cvttpd_epi32(ewrt);
399             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
400             ewitab           = _mm_slli_epi32(ewitab,2);
401             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
402             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
403             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
404             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
405             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
406             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
407             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
408             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
409             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
410
411             /* Update potential sum for this i atom from the interaction with this j atom. */
412             velecsum         = _mm256_add_pd(velecsum,velec);
413
414             fscal            = felec;
415
416             /* Calculate temporary vectorial force */
417             tx               = _mm256_mul_pd(fscal,dx20);
418             ty               = _mm256_mul_pd(fscal,dy20);
419             tz               = _mm256_mul_pd(fscal,dz20);
420
421             /* Update vectorial force */
422             fix2             = _mm256_add_pd(fix2,tx);
423             fiy2             = _mm256_add_pd(fiy2,ty);
424             fiz2             = _mm256_add_pd(fiz2,tz);
425
426             fjx0             = _mm256_add_pd(fjx0,tx);
427             fjy0             = _mm256_add_pd(fjy0,ty);
428             fjz0             = _mm256_add_pd(fjz0,tz);
429
430             fjptrA             = f+j_coord_offsetA;
431             fjptrB             = f+j_coord_offsetB;
432             fjptrC             = f+j_coord_offsetC;
433             fjptrD             = f+j_coord_offsetD;
434
435             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
436
437             /* Inner loop uses 160 flops */
438         }
439
440         if(jidx<j_index_end)
441         {
442
443             /* Get j neighbor index, and coordinate index */
444             jnrlistA         = jjnr[jidx];
445             jnrlistB         = jjnr[jidx+1];
446             jnrlistC         = jjnr[jidx+2];
447             jnrlistD         = jjnr[jidx+3];
448             /* Sign of each element will be negative for non-real atoms.
449              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
450              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
451              */
452             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
453
454             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
455             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
456             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
457
458             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
459             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
460             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
461             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
462             j_coord_offsetA  = DIM*jnrA;
463             j_coord_offsetB  = DIM*jnrB;
464             j_coord_offsetC  = DIM*jnrC;
465             j_coord_offsetD  = DIM*jnrD;
466
467             /* load j atom coordinates */
468             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
469                                                  x+j_coord_offsetC,x+j_coord_offsetD,
470                                                  &jx0,&jy0,&jz0);
471
472             /* Calculate displacement vector */
473             dx00             = _mm256_sub_pd(ix0,jx0);
474             dy00             = _mm256_sub_pd(iy0,jy0);
475             dz00             = _mm256_sub_pd(iz0,jz0);
476             dx10             = _mm256_sub_pd(ix1,jx0);
477             dy10             = _mm256_sub_pd(iy1,jy0);
478             dz10             = _mm256_sub_pd(iz1,jz0);
479             dx20             = _mm256_sub_pd(ix2,jx0);
480             dy20             = _mm256_sub_pd(iy2,jy0);
481             dz20             = _mm256_sub_pd(iz2,jz0);
482
483             /* Calculate squared distance and things based on it */
484             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
485             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
486             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
487
488             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
489             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
490             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
491
492             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
493             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
494             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
495
496             /* Load parameters for j particles */
497             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
498                                                                  charge+jnrC+0,charge+jnrD+0);
499             vdwjidx0A        = 2*vdwtype[jnrA+0];
500             vdwjidx0B        = 2*vdwtype[jnrB+0];
501             vdwjidx0C        = 2*vdwtype[jnrC+0];
502             vdwjidx0D        = 2*vdwtype[jnrD+0];
503
504             fjx0             = _mm256_setzero_pd();
505             fjy0             = _mm256_setzero_pd();
506             fjz0             = _mm256_setzero_pd();
507
508             /**************************
509              * CALCULATE INTERACTIONS *
510              **************************/
511
512             r00              = _mm256_mul_pd(rsq00,rinv00);
513             r00              = _mm256_andnot_pd(dummy_mask,r00);
514
515             /* Compute parameters for interactions between i and j atoms */
516             qq00             = _mm256_mul_pd(iq0,jq0);
517             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
518                                             vdwioffsetptr0+vdwjidx0B,
519                                             vdwioffsetptr0+vdwjidx0C,
520                                             vdwioffsetptr0+vdwjidx0D,
521                                             &c6_00,&c12_00);
522
523             /* Calculate table index by multiplying r with table scale and truncate to integer */
524             rt               = _mm256_mul_pd(r00,vftabscale);
525             vfitab           = _mm256_cvttpd_epi32(rt);
526             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
527             vfitab           = _mm_slli_epi32(vfitab,3);
528
529             /* EWALD ELECTROSTATICS */
530
531             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
532             ewrt             = _mm256_mul_pd(r00,ewtabscale);
533             ewitab           = _mm256_cvttpd_epi32(ewrt);
534             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
535             ewitab           = _mm_slli_epi32(ewitab,2);
536             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
537             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
538             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
539             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
540             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
541             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
542             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
543             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
544             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
545
546             /* CUBIC SPLINE TABLE DISPERSION */
547             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
548             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
549             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
550             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
551             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
552             Heps             = _mm256_mul_pd(vfeps,H);
553             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
554             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
555             vvdw6            = _mm256_mul_pd(c6_00,VV);
556             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
557             fvdw6            = _mm256_mul_pd(c6_00,FF);
558
559             /* CUBIC SPLINE TABLE REPULSION */
560             vfitab           = _mm_add_epi32(vfitab,ifour);
561             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
562             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
563             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
564             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
565             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
566             Heps             = _mm256_mul_pd(vfeps,H);
567             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
568             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
569             vvdw12           = _mm256_mul_pd(c12_00,VV);
570             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
571             fvdw12           = _mm256_mul_pd(c12_00,FF);
572             vvdw             = _mm256_add_pd(vvdw12,vvdw6);
573             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
574
575             /* Update potential sum for this i atom from the interaction with this j atom. */
576             velec            = _mm256_andnot_pd(dummy_mask,velec);
577             velecsum         = _mm256_add_pd(velecsum,velec);
578             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
579             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
580
581             fscal            = _mm256_add_pd(felec,fvdw);
582
583             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
584
585             /* Calculate temporary vectorial force */
586             tx               = _mm256_mul_pd(fscal,dx00);
587             ty               = _mm256_mul_pd(fscal,dy00);
588             tz               = _mm256_mul_pd(fscal,dz00);
589
590             /* Update vectorial force */
591             fix0             = _mm256_add_pd(fix0,tx);
592             fiy0             = _mm256_add_pd(fiy0,ty);
593             fiz0             = _mm256_add_pd(fiz0,tz);
594
595             fjx0             = _mm256_add_pd(fjx0,tx);
596             fjy0             = _mm256_add_pd(fjy0,ty);
597             fjz0             = _mm256_add_pd(fjz0,tz);
598
599             /**************************
600              * CALCULATE INTERACTIONS *
601              **************************/
602
603             r10              = _mm256_mul_pd(rsq10,rinv10);
604             r10              = _mm256_andnot_pd(dummy_mask,r10);
605
606             /* Compute parameters for interactions between i and j atoms */
607             qq10             = _mm256_mul_pd(iq1,jq0);
608
609             /* EWALD ELECTROSTATICS */
610
611             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
612             ewrt             = _mm256_mul_pd(r10,ewtabscale);
613             ewitab           = _mm256_cvttpd_epi32(ewrt);
614             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
615             ewitab           = _mm_slli_epi32(ewitab,2);
616             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
617             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
618             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
619             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
620             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
621             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
622             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
623             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
624             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
625
626             /* Update potential sum for this i atom from the interaction with this j atom. */
627             velec            = _mm256_andnot_pd(dummy_mask,velec);
628             velecsum         = _mm256_add_pd(velecsum,velec);
629
630             fscal            = felec;
631
632             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
633
634             /* Calculate temporary vectorial force */
635             tx               = _mm256_mul_pd(fscal,dx10);
636             ty               = _mm256_mul_pd(fscal,dy10);
637             tz               = _mm256_mul_pd(fscal,dz10);
638
639             /* Update vectorial force */
640             fix1             = _mm256_add_pd(fix1,tx);
641             fiy1             = _mm256_add_pd(fiy1,ty);
642             fiz1             = _mm256_add_pd(fiz1,tz);
643
644             fjx0             = _mm256_add_pd(fjx0,tx);
645             fjy0             = _mm256_add_pd(fjy0,ty);
646             fjz0             = _mm256_add_pd(fjz0,tz);
647
648             /**************************
649              * CALCULATE INTERACTIONS *
650              **************************/
651
652             r20              = _mm256_mul_pd(rsq20,rinv20);
653             r20              = _mm256_andnot_pd(dummy_mask,r20);
654
655             /* Compute parameters for interactions between i and j atoms */
656             qq20             = _mm256_mul_pd(iq2,jq0);
657
658             /* EWALD ELECTROSTATICS */
659
660             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
661             ewrt             = _mm256_mul_pd(r20,ewtabscale);
662             ewitab           = _mm256_cvttpd_epi32(ewrt);
663             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
664             ewitab           = _mm_slli_epi32(ewitab,2);
665             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
666             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
667             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
668             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
669             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
670             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
671             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
672             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
673             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
674
675             /* Update potential sum for this i atom from the interaction with this j atom. */
676             velec            = _mm256_andnot_pd(dummy_mask,velec);
677             velecsum         = _mm256_add_pd(velecsum,velec);
678
679             fscal            = felec;
680
681             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
682
683             /* Calculate temporary vectorial force */
684             tx               = _mm256_mul_pd(fscal,dx20);
685             ty               = _mm256_mul_pd(fscal,dy20);
686             tz               = _mm256_mul_pd(fscal,dz20);
687
688             /* Update vectorial force */
689             fix2             = _mm256_add_pd(fix2,tx);
690             fiy2             = _mm256_add_pd(fiy2,ty);
691             fiz2             = _mm256_add_pd(fiz2,tz);
692
693             fjx0             = _mm256_add_pd(fjx0,tx);
694             fjy0             = _mm256_add_pd(fjy0,ty);
695             fjz0             = _mm256_add_pd(fjz0,tz);
696
697             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
698             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
699             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
700             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
701
702             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
703
704             /* Inner loop uses 163 flops */
705         }
706
707         /* End of innermost loop */
708
709         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
710                                                  f+i_coord_offset,fshift+i_shift_offset);
711
712         ggid                        = gid[iidx];
713         /* Update potential energies */
714         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
715         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
716
717         /* Increment number of inner iterations */
718         inneriter                  += j_index_end - j_index_start;
719
720         /* Outer loop uses 20 flops */
721     }
722
723     /* Increment number of outer iterations */
724     outeriter        += nri;
725
726     /* Update outer/inner flops */
727
728     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*163);
729 }
730 /*
731  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_256_double
732  * Electrostatics interaction: Ewald
733  * VdW interaction:            CubicSplineTable
734  * Geometry:                   Water3-Particle
735  * Calculate force/pot:        Force
736  */
737 void
738 nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_256_double
739                     (t_nblist                    * gmx_restrict       nlist,
740                      rvec                        * gmx_restrict          xx,
741                      rvec                        * gmx_restrict          ff,
742                      t_forcerec                  * gmx_restrict          fr,
743                      t_mdatoms                   * gmx_restrict     mdatoms,
744                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
745                      t_nrnb                      * gmx_restrict        nrnb)
746 {
747     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
748      * just 0 for non-waters.
749      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
750      * jnr indices corresponding to data put in the four positions in the SIMD register.
751      */
752     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
753     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
754     int              jnrA,jnrB,jnrC,jnrD;
755     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
756     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
757     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
758     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
759     real             rcutoff_scalar;
760     real             *shiftvec,*fshift,*x,*f;
761     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
762     real             scratch[4*DIM];
763     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
764     real *           vdwioffsetptr0;
765     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
766     real *           vdwioffsetptr1;
767     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
768     real *           vdwioffsetptr2;
769     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
770     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
771     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
772     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
773     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
774     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
775     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
776     real             *charge;
777     int              nvdwtype;
778     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
779     int              *vdwtype;
780     real             *vdwparam;
781     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
782     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
783     __m128i          vfitab;
784     __m128i          ifour       = _mm_set1_epi32(4);
785     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
786     real             *vftab;
787     __m128i          ewitab;
788     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
789     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
790     real             *ewtab;
791     __m256d          dummy_mask,cutoff_mask;
792     __m128           tmpmask0,tmpmask1;
793     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
794     __m256d          one     = _mm256_set1_pd(1.0);
795     __m256d          two     = _mm256_set1_pd(2.0);
796     x                = xx[0];
797     f                = ff[0];
798
799     nri              = nlist->nri;
800     iinr             = nlist->iinr;
801     jindex           = nlist->jindex;
802     jjnr             = nlist->jjnr;
803     shiftidx         = nlist->shift;
804     gid              = nlist->gid;
805     shiftvec         = fr->shift_vec[0];
806     fshift           = fr->fshift[0];
807     facel            = _mm256_set1_pd(fr->epsfac);
808     charge           = mdatoms->chargeA;
809     nvdwtype         = fr->ntype;
810     vdwparam         = fr->nbfp;
811     vdwtype          = mdatoms->typeA;
812
813     vftab            = kernel_data->table_vdw->data;
814     vftabscale       = _mm256_set1_pd(kernel_data->table_vdw->scale);
815
816     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
817     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
818     beta2            = _mm256_mul_pd(beta,beta);
819     beta3            = _mm256_mul_pd(beta,beta2);
820
821     ewtab            = fr->ic->tabq_coul_F;
822     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
823     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
824
825     /* Setup water-specific parameters */
826     inr              = nlist->iinr[0];
827     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
828     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
829     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
830     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
831
832     /* Avoid stupid compiler warnings */
833     jnrA = jnrB = jnrC = jnrD = 0;
834     j_coord_offsetA = 0;
835     j_coord_offsetB = 0;
836     j_coord_offsetC = 0;
837     j_coord_offsetD = 0;
838
839     outeriter        = 0;
840     inneriter        = 0;
841
842     for(iidx=0;iidx<4*DIM;iidx++)
843     {
844         scratch[iidx] = 0.0;
845     }
846
847     /* Start outer loop over neighborlists */
848     for(iidx=0; iidx<nri; iidx++)
849     {
850         /* Load shift vector for this list */
851         i_shift_offset   = DIM*shiftidx[iidx];
852
853         /* Load limits for loop over neighbors */
854         j_index_start    = jindex[iidx];
855         j_index_end      = jindex[iidx+1];
856
857         /* Get outer coordinate index */
858         inr              = iinr[iidx];
859         i_coord_offset   = DIM*inr;
860
861         /* Load i particle coords and add shift vector */
862         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
863                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
864
865         fix0             = _mm256_setzero_pd();
866         fiy0             = _mm256_setzero_pd();
867         fiz0             = _mm256_setzero_pd();
868         fix1             = _mm256_setzero_pd();
869         fiy1             = _mm256_setzero_pd();
870         fiz1             = _mm256_setzero_pd();
871         fix2             = _mm256_setzero_pd();
872         fiy2             = _mm256_setzero_pd();
873         fiz2             = _mm256_setzero_pd();
874
875         /* Start inner kernel loop */
876         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
877         {
878
879             /* Get j neighbor index, and coordinate index */
880             jnrA             = jjnr[jidx];
881             jnrB             = jjnr[jidx+1];
882             jnrC             = jjnr[jidx+2];
883             jnrD             = jjnr[jidx+3];
884             j_coord_offsetA  = DIM*jnrA;
885             j_coord_offsetB  = DIM*jnrB;
886             j_coord_offsetC  = DIM*jnrC;
887             j_coord_offsetD  = DIM*jnrD;
888
889             /* load j atom coordinates */
890             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
891                                                  x+j_coord_offsetC,x+j_coord_offsetD,
892                                                  &jx0,&jy0,&jz0);
893
894             /* Calculate displacement vector */
895             dx00             = _mm256_sub_pd(ix0,jx0);
896             dy00             = _mm256_sub_pd(iy0,jy0);
897             dz00             = _mm256_sub_pd(iz0,jz0);
898             dx10             = _mm256_sub_pd(ix1,jx0);
899             dy10             = _mm256_sub_pd(iy1,jy0);
900             dz10             = _mm256_sub_pd(iz1,jz0);
901             dx20             = _mm256_sub_pd(ix2,jx0);
902             dy20             = _mm256_sub_pd(iy2,jy0);
903             dz20             = _mm256_sub_pd(iz2,jz0);
904
905             /* Calculate squared distance and things based on it */
906             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
907             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
908             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
909
910             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
911             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
912             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
913
914             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
915             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
916             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
917
918             /* Load parameters for j particles */
919             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
920                                                                  charge+jnrC+0,charge+jnrD+0);
921             vdwjidx0A        = 2*vdwtype[jnrA+0];
922             vdwjidx0B        = 2*vdwtype[jnrB+0];
923             vdwjidx0C        = 2*vdwtype[jnrC+0];
924             vdwjidx0D        = 2*vdwtype[jnrD+0];
925
926             fjx0             = _mm256_setzero_pd();
927             fjy0             = _mm256_setzero_pd();
928             fjz0             = _mm256_setzero_pd();
929
930             /**************************
931              * CALCULATE INTERACTIONS *
932              **************************/
933
934             r00              = _mm256_mul_pd(rsq00,rinv00);
935
936             /* Compute parameters for interactions between i and j atoms */
937             qq00             = _mm256_mul_pd(iq0,jq0);
938             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
939                                             vdwioffsetptr0+vdwjidx0B,
940                                             vdwioffsetptr0+vdwjidx0C,
941                                             vdwioffsetptr0+vdwjidx0D,
942                                             &c6_00,&c12_00);
943
944             /* Calculate table index by multiplying r with table scale and truncate to integer */
945             rt               = _mm256_mul_pd(r00,vftabscale);
946             vfitab           = _mm256_cvttpd_epi32(rt);
947             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
948             vfitab           = _mm_slli_epi32(vfitab,3);
949
950             /* EWALD ELECTROSTATICS */
951
952             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
953             ewrt             = _mm256_mul_pd(r00,ewtabscale);
954             ewitab           = _mm256_cvttpd_epi32(ewrt);
955             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
956             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
957                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
958                                             &ewtabF,&ewtabFn);
959             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
960             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
961
962             /* CUBIC SPLINE TABLE DISPERSION */
963             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
964             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
965             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
966             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
967             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
968             Heps             = _mm256_mul_pd(vfeps,H);
969             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
970             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
971             fvdw6            = _mm256_mul_pd(c6_00,FF);
972
973             /* CUBIC SPLINE TABLE REPULSION */
974             vfitab           = _mm_add_epi32(vfitab,ifour);
975             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
976             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
977             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
978             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
979             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
980             Heps             = _mm256_mul_pd(vfeps,H);
981             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
982             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
983             fvdw12           = _mm256_mul_pd(c12_00,FF);
984             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
985
986             fscal            = _mm256_add_pd(felec,fvdw);
987
988             /* Calculate temporary vectorial force */
989             tx               = _mm256_mul_pd(fscal,dx00);
990             ty               = _mm256_mul_pd(fscal,dy00);
991             tz               = _mm256_mul_pd(fscal,dz00);
992
993             /* Update vectorial force */
994             fix0             = _mm256_add_pd(fix0,tx);
995             fiy0             = _mm256_add_pd(fiy0,ty);
996             fiz0             = _mm256_add_pd(fiz0,tz);
997
998             fjx0             = _mm256_add_pd(fjx0,tx);
999             fjy0             = _mm256_add_pd(fjy0,ty);
1000             fjz0             = _mm256_add_pd(fjz0,tz);
1001
1002             /**************************
1003              * CALCULATE INTERACTIONS *
1004              **************************/
1005
1006             r10              = _mm256_mul_pd(rsq10,rinv10);
1007
1008             /* Compute parameters for interactions between i and j atoms */
1009             qq10             = _mm256_mul_pd(iq1,jq0);
1010
1011             /* EWALD ELECTROSTATICS */
1012
1013             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1014             ewrt             = _mm256_mul_pd(r10,ewtabscale);
1015             ewitab           = _mm256_cvttpd_epi32(ewrt);
1016             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1017             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1018                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1019                                             &ewtabF,&ewtabFn);
1020             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1021             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1022
1023             fscal            = felec;
1024
1025             /* Calculate temporary vectorial force */
1026             tx               = _mm256_mul_pd(fscal,dx10);
1027             ty               = _mm256_mul_pd(fscal,dy10);
1028             tz               = _mm256_mul_pd(fscal,dz10);
1029
1030             /* Update vectorial force */
1031             fix1             = _mm256_add_pd(fix1,tx);
1032             fiy1             = _mm256_add_pd(fiy1,ty);
1033             fiz1             = _mm256_add_pd(fiz1,tz);
1034
1035             fjx0             = _mm256_add_pd(fjx0,tx);
1036             fjy0             = _mm256_add_pd(fjy0,ty);
1037             fjz0             = _mm256_add_pd(fjz0,tz);
1038
1039             /**************************
1040              * CALCULATE INTERACTIONS *
1041              **************************/
1042
1043             r20              = _mm256_mul_pd(rsq20,rinv20);
1044
1045             /* Compute parameters for interactions between i and j atoms */
1046             qq20             = _mm256_mul_pd(iq2,jq0);
1047
1048             /* EWALD ELECTROSTATICS */
1049
1050             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1051             ewrt             = _mm256_mul_pd(r20,ewtabscale);
1052             ewitab           = _mm256_cvttpd_epi32(ewrt);
1053             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1054             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1055                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1056                                             &ewtabF,&ewtabFn);
1057             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1058             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1059
1060             fscal            = felec;
1061
1062             /* Calculate temporary vectorial force */
1063             tx               = _mm256_mul_pd(fscal,dx20);
1064             ty               = _mm256_mul_pd(fscal,dy20);
1065             tz               = _mm256_mul_pd(fscal,dz20);
1066
1067             /* Update vectorial force */
1068             fix2             = _mm256_add_pd(fix2,tx);
1069             fiy2             = _mm256_add_pd(fiy2,ty);
1070             fiz2             = _mm256_add_pd(fiz2,tz);
1071
1072             fjx0             = _mm256_add_pd(fjx0,tx);
1073             fjy0             = _mm256_add_pd(fjy0,ty);
1074             fjz0             = _mm256_add_pd(fjz0,tz);
1075
1076             fjptrA             = f+j_coord_offsetA;
1077             fjptrB             = f+j_coord_offsetB;
1078             fjptrC             = f+j_coord_offsetC;
1079             fjptrD             = f+j_coord_offsetD;
1080
1081             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1082
1083             /* Inner loop uses 137 flops */
1084         }
1085
1086         if(jidx<j_index_end)
1087         {
1088
1089             /* Get j neighbor index, and coordinate index */
1090             jnrlistA         = jjnr[jidx];
1091             jnrlistB         = jjnr[jidx+1];
1092             jnrlistC         = jjnr[jidx+2];
1093             jnrlistD         = jjnr[jidx+3];
1094             /* Sign of each element will be negative for non-real atoms.
1095              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1096              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1097              */
1098             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1099
1100             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1101             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1102             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1103
1104             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1105             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1106             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1107             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1108             j_coord_offsetA  = DIM*jnrA;
1109             j_coord_offsetB  = DIM*jnrB;
1110             j_coord_offsetC  = DIM*jnrC;
1111             j_coord_offsetD  = DIM*jnrD;
1112
1113             /* load j atom coordinates */
1114             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1115                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1116                                                  &jx0,&jy0,&jz0);
1117
1118             /* Calculate displacement vector */
1119             dx00             = _mm256_sub_pd(ix0,jx0);
1120             dy00             = _mm256_sub_pd(iy0,jy0);
1121             dz00             = _mm256_sub_pd(iz0,jz0);
1122             dx10             = _mm256_sub_pd(ix1,jx0);
1123             dy10             = _mm256_sub_pd(iy1,jy0);
1124             dz10             = _mm256_sub_pd(iz1,jz0);
1125             dx20             = _mm256_sub_pd(ix2,jx0);
1126             dy20             = _mm256_sub_pd(iy2,jy0);
1127             dz20             = _mm256_sub_pd(iz2,jz0);
1128
1129             /* Calculate squared distance and things based on it */
1130             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1131             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1132             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1133
1134             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1135             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
1136             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
1137
1138             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1139             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1140             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1141
1142             /* Load parameters for j particles */
1143             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1144                                                                  charge+jnrC+0,charge+jnrD+0);
1145             vdwjidx0A        = 2*vdwtype[jnrA+0];
1146             vdwjidx0B        = 2*vdwtype[jnrB+0];
1147             vdwjidx0C        = 2*vdwtype[jnrC+0];
1148             vdwjidx0D        = 2*vdwtype[jnrD+0];
1149
1150             fjx0             = _mm256_setzero_pd();
1151             fjy0             = _mm256_setzero_pd();
1152             fjz0             = _mm256_setzero_pd();
1153
1154             /**************************
1155              * CALCULATE INTERACTIONS *
1156              **************************/
1157
1158             r00              = _mm256_mul_pd(rsq00,rinv00);
1159             r00              = _mm256_andnot_pd(dummy_mask,r00);
1160
1161             /* Compute parameters for interactions between i and j atoms */
1162             qq00             = _mm256_mul_pd(iq0,jq0);
1163             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1164                                             vdwioffsetptr0+vdwjidx0B,
1165                                             vdwioffsetptr0+vdwjidx0C,
1166                                             vdwioffsetptr0+vdwjidx0D,
1167                                             &c6_00,&c12_00);
1168
1169             /* Calculate table index by multiplying r with table scale and truncate to integer */
1170             rt               = _mm256_mul_pd(r00,vftabscale);
1171             vfitab           = _mm256_cvttpd_epi32(rt);
1172             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1173             vfitab           = _mm_slli_epi32(vfitab,3);
1174
1175             /* EWALD ELECTROSTATICS */
1176
1177             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1178             ewrt             = _mm256_mul_pd(r00,ewtabscale);
1179             ewitab           = _mm256_cvttpd_epi32(ewrt);
1180             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1181             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1182                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1183                                             &ewtabF,&ewtabFn);
1184             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1185             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1186
1187             /* CUBIC SPLINE TABLE DISPERSION */
1188             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1189             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1190             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1191             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1192             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1193             Heps             = _mm256_mul_pd(vfeps,H);
1194             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1195             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1196             fvdw6            = _mm256_mul_pd(c6_00,FF);
1197
1198             /* CUBIC SPLINE TABLE REPULSION */
1199             vfitab           = _mm_add_epi32(vfitab,ifour);
1200             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1201             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1202             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1203             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1204             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1205             Heps             = _mm256_mul_pd(vfeps,H);
1206             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1207             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1208             fvdw12           = _mm256_mul_pd(c12_00,FF);
1209             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1210
1211             fscal            = _mm256_add_pd(felec,fvdw);
1212
1213             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1214
1215             /* Calculate temporary vectorial force */
1216             tx               = _mm256_mul_pd(fscal,dx00);
1217             ty               = _mm256_mul_pd(fscal,dy00);
1218             tz               = _mm256_mul_pd(fscal,dz00);
1219
1220             /* Update vectorial force */
1221             fix0             = _mm256_add_pd(fix0,tx);
1222             fiy0             = _mm256_add_pd(fiy0,ty);
1223             fiz0             = _mm256_add_pd(fiz0,tz);
1224
1225             fjx0             = _mm256_add_pd(fjx0,tx);
1226             fjy0             = _mm256_add_pd(fjy0,ty);
1227             fjz0             = _mm256_add_pd(fjz0,tz);
1228
1229             /**************************
1230              * CALCULATE INTERACTIONS *
1231              **************************/
1232
1233             r10              = _mm256_mul_pd(rsq10,rinv10);
1234             r10              = _mm256_andnot_pd(dummy_mask,r10);
1235
1236             /* Compute parameters for interactions between i and j atoms */
1237             qq10             = _mm256_mul_pd(iq1,jq0);
1238
1239             /* EWALD ELECTROSTATICS */
1240
1241             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1242             ewrt             = _mm256_mul_pd(r10,ewtabscale);
1243             ewitab           = _mm256_cvttpd_epi32(ewrt);
1244             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1245             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1246                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1247                                             &ewtabF,&ewtabFn);
1248             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1249             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1250
1251             fscal            = felec;
1252
1253             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1254
1255             /* Calculate temporary vectorial force */
1256             tx               = _mm256_mul_pd(fscal,dx10);
1257             ty               = _mm256_mul_pd(fscal,dy10);
1258             tz               = _mm256_mul_pd(fscal,dz10);
1259
1260             /* Update vectorial force */
1261             fix1             = _mm256_add_pd(fix1,tx);
1262             fiy1             = _mm256_add_pd(fiy1,ty);
1263             fiz1             = _mm256_add_pd(fiz1,tz);
1264
1265             fjx0             = _mm256_add_pd(fjx0,tx);
1266             fjy0             = _mm256_add_pd(fjy0,ty);
1267             fjz0             = _mm256_add_pd(fjz0,tz);
1268
1269             /**************************
1270              * CALCULATE INTERACTIONS *
1271              **************************/
1272
1273             r20              = _mm256_mul_pd(rsq20,rinv20);
1274             r20              = _mm256_andnot_pd(dummy_mask,r20);
1275
1276             /* Compute parameters for interactions between i and j atoms */
1277             qq20             = _mm256_mul_pd(iq2,jq0);
1278
1279             /* EWALD ELECTROSTATICS */
1280
1281             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1282             ewrt             = _mm256_mul_pd(r20,ewtabscale);
1283             ewitab           = _mm256_cvttpd_epi32(ewrt);
1284             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1285             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1286                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1287                                             &ewtabF,&ewtabFn);
1288             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1289             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1290
1291             fscal            = felec;
1292
1293             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1294
1295             /* Calculate temporary vectorial force */
1296             tx               = _mm256_mul_pd(fscal,dx20);
1297             ty               = _mm256_mul_pd(fscal,dy20);
1298             tz               = _mm256_mul_pd(fscal,dz20);
1299
1300             /* Update vectorial force */
1301             fix2             = _mm256_add_pd(fix2,tx);
1302             fiy2             = _mm256_add_pd(fiy2,ty);
1303             fiz2             = _mm256_add_pd(fiz2,tz);
1304
1305             fjx0             = _mm256_add_pd(fjx0,tx);
1306             fjy0             = _mm256_add_pd(fjy0,ty);
1307             fjz0             = _mm256_add_pd(fjz0,tz);
1308
1309             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1310             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1311             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1312             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1313
1314             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1315
1316             /* Inner loop uses 140 flops */
1317         }
1318
1319         /* End of innermost loop */
1320
1321         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1322                                                  f+i_coord_offset,fshift+i_shift_offset);
1323
1324         /* Increment number of inner iterations */
1325         inneriter                  += j_index_end - j_index_start;
1326
1327         /* Outer loop uses 18 flops */
1328     }
1329
1330     /* Increment number of outer iterations */
1331     outeriter        += nri;
1332
1333     /* Update outer/inner flops */
1334
1335     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*140);
1336 }