Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_double
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water3-Particle
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     real *           vdwioffsetptr0;
84     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     real *           vdwioffsetptr1;
86     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     real *           vdwioffsetptr2;
88     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
92     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
93     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
94     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
95     real             *charge;
96     int              nvdwtype;
97     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98     int              *vdwtype;
99     real             *vdwparam;
100     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
101     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
102     __m128i          ewitab;
103     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
104     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
105     real             *ewtab;
106     __m256d          dummy_mask,cutoff_mask;
107     __m128           tmpmask0,tmpmask1;
108     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
109     __m256d          one     = _mm256_set1_pd(1.0);
110     __m256d          two     = _mm256_set1_pd(2.0);
111     x                = xx[0];
112     f                = ff[0];
113
114     nri              = nlist->nri;
115     iinr             = nlist->iinr;
116     jindex           = nlist->jindex;
117     jjnr             = nlist->jjnr;
118     shiftidx         = nlist->shift;
119     gid              = nlist->gid;
120     shiftvec         = fr->shift_vec[0];
121     fshift           = fr->fshift[0];
122     facel            = _mm256_set1_pd(fr->ic->epsfac);
123     charge           = mdatoms->chargeA;
124     nvdwtype         = fr->ntype;
125     vdwparam         = fr->nbfp;
126     vdwtype          = mdatoms->typeA;
127
128     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
129     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
130     beta2            = _mm256_mul_pd(beta,beta);
131     beta3            = _mm256_mul_pd(beta,beta2);
132
133     ewtab            = fr->ic->tabq_coul_FDV0;
134     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
135     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
136
137     /* Setup water-specific parameters */
138     inr              = nlist->iinr[0];
139     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
140     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
141     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
142     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
143
144     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
145     rcutoff_scalar   = fr->ic->rcoulomb;
146     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
147     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
148
149     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
150     rvdw             = _mm256_set1_pd(fr->ic->rvdw);
151
152     /* Avoid stupid compiler warnings */
153     jnrA = jnrB = jnrC = jnrD = 0;
154     j_coord_offsetA = 0;
155     j_coord_offsetB = 0;
156     j_coord_offsetC = 0;
157     j_coord_offsetD = 0;
158
159     outeriter        = 0;
160     inneriter        = 0;
161
162     for(iidx=0;iidx<4*DIM;iidx++)
163     {
164         scratch[iidx] = 0.0;
165     }
166
167     /* Start outer loop over neighborlists */
168     for(iidx=0; iidx<nri; iidx++)
169     {
170         /* Load shift vector for this list */
171         i_shift_offset   = DIM*shiftidx[iidx];
172
173         /* Load limits for loop over neighbors */
174         j_index_start    = jindex[iidx];
175         j_index_end      = jindex[iidx+1];
176
177         /* Get outer coordinate index */
178         inr              = iinr[iidx];
179         i_coord_offset   = DIM*inr;
180
181         /* Load i particle coords and add shift vector */
182         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
183                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
184
185         fix0             = _mm256_setzero_pd();
186         fiy0             = _mm256_setzero_pd();
187         fiz0             = _mm256_setzero_pd();
188         fix1             = _mm256_setzero_pd();
189         fiy1             = _mm256_setzero_pd();
190         fiz1             = _mm256_setzero_pd();
191         fix2             = _mm256_setzero_pd();
192         fiy2             = _mm256_setzero_pd();
193         fiz2             = _mm256_setzero_pd();
194
195         /* Reset potential sums */
196         velecsum         = _mm256_setzero_pd();
197         vvdwsum          = _mm256_setzero_pd();
198
199         /* Start inner kernel loop */
200         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
201         {
202
203             /* Get j neighbor index, and coordinate index */
204             jnrA             = jjnr[jidx];
205             jnrB             = jjnr[jidx+1];
206             jnrC             = jjnr[jidx+2];
207             jnrD             = jjnr[jidx+3];
208             j_coord_offsetA  = DIM*jnrA;
209             j_coord_offsetB  = DIM*jnrB;
210             j_coord_offsetC  = DIM*jnrC;
211             j_coord_offsetD  = DIM*jnrD;
212
213             /* load j atom coordinates */
214             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
215                                                  x+j_coord_offsetC,x+j_coord_offsetD,
216                                                  &jx0,&jy0,&jz0);
217
218             /* Calculate displacement vector */
219             dx00             = _mm256_sub_pd(ix0,jx0);
220             dy00             = _mm256_sub_pd(iy0,jy0);
221             dz00             = _mm256_sub_pd(iz0,jz0);
222             dx10             = _mm256_sub_pd(ix1,jx0);
223             dy10             = _mm256_sub_pd(iy1,jy0);
224             dz10             = _mm256_sub_pd(iz1,jz0);
225             dx20             = _mm256_sub_pd(ix2,jx0);
226             dy20             = _mm256_sub_pd(iy2,jy0);
227             dz20             = _mm256_sub_pd(iz2,jz0);
228
229             /* Calculate squared distance and things based on it */
230             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
231             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
232             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
233
234             rinv00           = avx256_invsqrt_d(rsq00);
235             rinv10           = avx256_invsqrt_d(rsq10);
236             rinv20           = avx256_invsqrt_d(rsq20);
237
238             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
239             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
240             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
241
242             /* Load parameters for j particles */
243             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
244                                                                  charge+jnrC+0,charge+jnrD+0);
245             vdwjidx0A        = 2*vdwtype[jnrA+0];
246             vdwjidx0B        = 2*vdwtype[jnrB+0];
247             vdwjidx0C        = 2*vdwtype[jnrC+0];
248             vdwjidx0D        = 2*vdwtype[jnrD+0];
249
250             fjx0             = _mm256_setzero_pd();
251             fjy0             = _mm256_setzero_pd();
252             fjz0             = _mm256_setzero_pd();
253
254             /**************************
255              * CALCULATE INTERACTIONS *
256              **************************/
257
258             if (gmx_mm256_any_lt(rsq00,rcutoff2))
259             {
260
261             r00              = _mm256_mul_pd(rsq00,rinv00);
262
263             /* Compute parameters for interactions between i and j atoms */
264             qq00             = _mm256_mul_pd(iq0,jq0);
265             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
266                                             vdwioffsetptr0+vdwjidx0B,
267                                             vdwioffsetptr0+vdwjidx0C,
268                                             vdwioffsetptr0+vdwjidx0D,
269                                             &c6_00,&c12_00);
270
271             /* EWALD ELECTROSTATICS */
272
273             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
274             ewrt             = _mm256_mul_pd(r00,ewtabscale);
275             ewitab           = _mm256_cvttpd_epi32(ewrt);
276             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
277             ewitab           = _mm_slli_epi32(ewitab,2);
278             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
279             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
280             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
281             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
282             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
283             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
284             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
285             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
286             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
287
288             /* LENNARD-JONES DISPERSION/REPULSION */
289
290             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
291             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
292             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
293             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
294                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
295             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
296
297             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
298
299             /* Update potential sum for this i atom from the interaction with this j atom. */
300             velec            = _mm256_and_pd(velec,cutoff_mask);
301             velecsum         = _mm256_add_pd(velecsum,velec);
302             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
303             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
304
305             fscal            = _mm256_add_pd(felec,fvdw);
306
307             fscal            = _mm256_and_pd(fscal,cutoff_mask);
308
309             /* Calculate temporary vectorial force */
310             tx               = _mm256_mul_pd(fscal,dx00);
311             ty               = _mm256_mul_pd(fscal,dy00);
312             tz               = _mm256_mul_pd(fscal,dz00);
313
314             /* Update vectorial force */
315             fix0             = _mm256_add_pd(fix0,tx);
316             fiy0             = _mm256_add_pd(fiy0,ty);
317             fiz0             = _mm256_add_pd(fiz0,tz);
318
319             fjx0             = _mm256_add_pd(fjx0,tx);
320             fjy0             = _mm256_add_pd(fjy0,ty);
321             fjz0             = _mm256_add_pd(fjz0,tz);
322
323             }
324
325             /**************************
326              * CALCULATE INTERACTIONS *
327              **************************/
328
329             if (gmx_mm256_any_lt(rsq10,rcutoff2))
330             {
331
332             r10              = _mm256_mul_pd(rsq10,rinv10);
333
334             /* Compute parameters for interactions between i and j atoms */
335             qq10             = _mm256_mul_pd(iq1,jq0);
336
337             /* EWALD ELECTROSTATICS */
338
339             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
340             ewrt             = _mm256_mul_pd(r10,ewtabscale);
341             ewitab           = _mm256_cvttpd_epi32(ewrt);
342             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
343             ewitab           = _mm_slli_epi32(ewitab,2);
344             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
345             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
346             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
347             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
348             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
349             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
350             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
351             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
352             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
353
354             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
355
356             /* Update potential sum for this i atom from the interaction with this j atom. */
357             velec            = _mm256_and_pd(velec,cutoff_mask);
358             velecsum         = _mm256_add_pd(velecsum,velec);
359
360             fscal            = felec;
361
362             fscal            = _mm256_and_pd(fscal,cutoff_mask);
363
364             /* Calculate temporary vectorial force */
365             tx               = _mm256_mul_pd(fscal,dx10);
366             ty               = _mm256_mul_pd(fscal,dy10);
367             tz               = _mm256_mul_pd(fscal,dz10);
368
369             /* Update vectorial force */
370             fix1             = _mm256_add_pd(fix1,tx);
371             fiy1             = _mm256_add_pd(fiy1,ty);
372             fiz1             = _mm256_add_pd(fiz1,tz);
373
374             fjx0             = _mm256_add_pd(fjx0,tx);
375             fjy0             = _mm256_add_pd(fjy0,ty);
376             fjz0             = _mm256_add_pd(fjz0,tz);
377
378             }
379
380             /**************************
381              * CALCULATE INTERACTIONS *
382              **************************/
383
384             if (gmx_mm256_any_lt(rsq20,rcutoff2))
385             {
386
387             r20              = _mm256_mul_pd(rsq20,rinv20);
388
389             /* Compute parameters for interactions between i and j atoms */
390             qq20             = _mm256_mul_pd(iq2,jq0);
391
392             /* EWALD ELECTROSTATICS */
393
394             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
395             ewrt             = _mm256_mul_pd(r20,ewtabscale);
396             ewitab           = _mm256_cvttpd_epi32(ewrt);
397             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
398             ewitab           = _mm_slli_epi32(ewitab,2);
399             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
400             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
401             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
402             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
403             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
404             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
405             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
406             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
407             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
408
409             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
410
411             /* Update potential sum for this i atom from the interaction with this j atom. */
412             velec            = _mm256_and_pd(velec,cutoff_mask);
413             velecsum         = _mm256_add_pd(velecsum,velec);
414
415             fscal            = felec;
416
417             fscal            = _mm256_and_pd(fscal,cutoff_mask);
418
419             /* Calculate temporary vectorial force */
420             tx               = _mm256_mul_pd(fscal,dx20);
421             ty               = _mm256_mul_pd(fscal,dy20);
422             tz               = _mm256_mul_pd(fscal,dz20);
423
424             /* Update vectorial force */
425             fix2             = _mm256_add_pd(fix2,tx);
426             fiy2             = _mm256_add_pd(fiy2,ty);
427             fiz2             = _mm256_add_pd(fiz2,tz);
428
429             fjx0             = _mm256_add_pd(fjx0,tx);
430             fjy0             = _mm256_add_pd(fjy0,ty);
431             fjz0             = _mm256_add_pd(fjz0,tz);
432
433             }
434
435             fjptrA             = f+j_coord_offsetA;
436             fjptrB             = f+j_coord_offsetB;
437             fjptrC             = f+j_coord_offsetC;
438             fjptrD             = f+j_coord_offsetD;
439
440             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
441
442             /* Inner loop uses 159 flops */
443         }
444
445         if(jidx<j_index_end)
446         {
447
448             /* Get j neighbor index, and coordinate index */
449             jnrlistA         = jjnr[jidx];
450             jnrlistB         = jjnr[jidx+1];
451             jnrlistC         = jjnr[jidx+2];
452             jnrlistD         = jjnr[jidx+3];
453             /* Sign of each element will be negative for non-real atoms.
454              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
455              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
456              */
457             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
458
459             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
460             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
461             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
462
463             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
464             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
465             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
466             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
467             j_coord_offsetA  = DIM*jnrA;
468             j_coord_offsetB  = DIM*jnrB;
469             j_coord_offsetC  = DIM*jnrC;
470             j_coord_offsetD  = DIM*jnrD;
471
472             /* load j atom coordinates */
473             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
474                                                  x+j_coord_offsetC,x+j_coord_offsetD,
475                                                  &jx0,&jy0,&jz0);
476
477             /* Calculate displacement vector */
478             dx00             = _mm256_sub_pd(ix0,jx0);
479             dy00             = _mm256_sub_pd(iy0,jy0);
480             dz00             = _mm256_sub_pd(iz0,jz0);
481             dx10             = _mm256_sub_pd(ix1,jx0);
482             dy10             = _mm256_sub_pd(iy1,jy0);
483             dz10             = _mm256_sub_pd(iz1,jz0);
484             dx20             = _mm256_sub_pd(ix2,jx0);
485             dy20             = _mm256_sub_pd(iy2,jy0);
486             dz20             = _mm256_sub_pd(iz2,jz0);
487
488             /* Calculate squared distance and things based on it */
489             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
490             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
491             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
492
493             rinv00           = avx256_invsqrt_d(rsq00);
494             rinv10           = avx256_invsqrt_d(rsq10);
495             rinv20           = avx256_invsqrt_d(rsq20);
496
497             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
498             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
499             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
500
501             /* Load parameters for j particles */
502             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
503                                                                  charge+jnrC+0,charge+jnrD+0);
504             vdwjidx0A        = 2*vdwtype[jnrA+0];
505             vdwjidx0B        = 2*vdwtype[jnrB+0];
506             vdwjidx0C        = 2*vdwtype[jnrC+0];
507             vdwjidx0D        = 2*vdwtype[jnrD+0];
508
509             fjx0             = _mm256_setzero_pd();
510             fjy0             = _mm256_setzero_pd();
511             fjz0             = _mm256_setzero_pd();
512
513             /**************************
514              * CALCULATE INTERACTIONS *
515              **************************/
516
517             if (gmx_mm256_any_lt(rsq00,rcutoff2))
518             {
519
520             r00              = _mm256_mul_pd(rsq00,rinv00);
521             r00              = _mm256_andnot_pd(dummy_mask,r00);
522
523             /* Compute parameters for interactions between i and j atoms */
524             qq00             = _mm256_mul_pd(iq0,jq0);
525             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
526                                             vdwioffsetptr0+vdwjidx0B,
527                                             vdwioffsetptr0+vdwjidx0C,
528                                             vdwioffsetptr0+vdwjidx0D,
529                                             &c6_00,&c12_00);
530
531             /* EWALD ELECTROSTATICS */
532
533             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
534             ewrt             = _mm256_mul_pd(r00,ewtabscale);
535             ewitab           = _mm256_cvttpd_epi32(ewrt);
536             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
537             ewitab           = _mm_slli_epi32(ewitab,2);
538             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
539             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
540             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
541             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
542             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
543             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
544             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
545             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
546             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
547
548             /* LENNARD-JONES DISPERSION/REPULSION */
549
550             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
551             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
552             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
553             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
554                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
555             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
556
557             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
558
559             /* Update potential sum for this i atom from the interaction with this j atom. */
560             velec            = _mm256_and_pd(velec,cutoff_mask);
561             velec            = _mm256_andnot_pd(dummy_mask,velec);
562             velecsum         = _mm256_add_pd(velecsum,velec);
563             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
564             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
565             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
566
567             fscal            = _mm256_add_pd(felec,fvdw);
568
569             fscal            = _mm256_and_pd(fscal,cutoff_mask);
570
571             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
572
573             /* Calculate temporary vectorial force */
574             tx               = _mm256_mul_pd(fscal,dx00);
575             ty               = _mm256_mul_pd(fscal,dy00);
576             tz               = _mm256_mul_pd(fscal,dz00);
577
578             /* Update vectorial force */
579             fix0             = _mm256_add_pd(fix0,tx);
580             fiy0             = _mm256_add_pd(fiy0,ty);
581             fiz0             = _mm256_add_pd(fiz0,tz);
582
583             fjx0             = _mm256_add_pd(fjx0,tx);
584             fjy0             = _mm256_add_pd(fjy0,ty);
585             fjz0             = _mm256_add_pd(fjz0,tz);
586
587             }
588
589             /**************************
590              * CALCULATE INTERACTIONS *
591              **************************/
592
593             if (gmx_mm256_any_lt(rsq10,rcutoff2))
594             {
595
596             r10              = _mm256_mul_pd(rsq10,rinv10);
597             r10              = _mm256_andnot_pd(dummy_mask,r10);
598
599             /* Compute parameters for interactions between i and j atoms */
600             qq10             = _mm256_mul_pd(iq1,jq0);
601
602             /* EWALD ELECTROSTATICS */
603
604             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
605             ewrt             = _mm256_mul_pd(r10,ewtabscale);
606             ewitab           = _mm256_cvttpd_epi32(ewrt);
607             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
608             ewitab           = _mm_slli_epi32(ewitab,2);
609             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
610             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
611             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
612             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
613             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
614             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
615             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
616             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
617             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
618
619             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
620
621             /* Update potential sum for this i atom from the interaction with this j atom. */
622             velec            = _mm256_and_pd(velec,cutoff_mask);
623             velec            = _mm256_andnot_pd(dummy_mask,velec);
624             velecsum         = _mm256_add_pd(velecsum,velec);
625
626             fscal            = felec;
627
628             fscal            = _mm256_and_pd(fscal,cutoff_mask);
629
630             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
631
632             /* Calculate temporary vectorial force */
633             tx               = _mm256_mul_pd(fscal,dx10);
634             ty               = _mm256_mul_pd(fscal,dy10);
635             tz               = _mm256_mul_pd(fscal,dz10);
636
637             /* Update vectorial force */
638             fix1             = _mm256_add_pd(fix1,tx);
639             fiy1             = _mm256_add_pd(fiy1,ty);
640             fiz1             = _mm256_add_pd(fiz1,tz);
641
642             fjx0             = _mm256_add_pd(fjx0,tx);
643             fjy0             = _mm256_add_pd(fjy0,ty);
644             fjz0             = _mm256_add_pd(fjz0,tz);
645
646             }
647
648             /**************************
649              * CALCULATE INTERACTIONS *
650              **************************/
651
652             if (gmx_mm256_any_lt(rsq20,rcutoff2))
653             {
654
655             r20              = _mm256_mul_pd(rsq20,rinv20);
656             r20              = _mm256_andnot_pd(dummy_mask,r20);
657
658             /* Compute parameters for interactions between i and j atoms */
659             qq20             = _mm256_mul_pd(iq2,jq0);
660
661             /* EWALD ELECTROSTATICS */
662
663             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
664             ewrt             = _mm256_mul_pd(r20,ewtabscale);
665             ewitab           = _mm256_cvttpd_epi32(ewrt);
666             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
667             ewitab           = _mm_slli_epi32(ewitab,2);
668             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
669             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
670             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
671             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
672             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
673             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
674             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
675             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
676             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
677
678             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
679
680             /* Update potential sum for this i atom from the interaction with this j atom. */
681             velec            = _mm256_and_pd(velec,cutoff_mask);
682             velec            = _mm256_andnot_pd(dummy_mask,velec);
683             velecsum         = _mm256_add_pd(velecsum,velec);
684
685             fscal            = felec;
686
687             fscal            = _mm256_and_pd(fscal,cutoff_mask);
688
689             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
690
691             /* Calculate temporary vectorial force */
692             tx               = _mm256_mul_pd(fscal,dx20);
693             ty               = _mm256_mul_pd(fscal,dy20);
694             tz               = _mm256_mul_pd(fscal,dz20);
695
696             /* Update vectorial force */
697             fix2             = _mm256_add_pd(fix2,tx);
698             fiy2             = _mm256_add_pd(fiy2,ty);
699             fiz2             = _mm256_add_pd(fiz2,tz);
700
701             fjx0             = _mm256_add_pd(fjx0,tx);
702             fjy0             = _mm256_add_pd(fjy0,ty);
703             fjz0             = _mm256_add_pd(fjz0,tz);
704
705             }
706
707             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
708             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
709             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
710             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
711
712             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
713
714             /* Inner loop uses 162 flops */
715         }
716
717         /* End of innermost loop */
718
719         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
720                                                  f+i_coord_offset,fshift+i_shift_offset);
721
722         ggid                        = gid[iidx];
723         /* Update potential energies */
724         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
725         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
726
727         /* Increment number of inner iterations */
728         inneriter                  += j_index_end - j_index_start;
729
730         /* Outer loop uses 20 flops */
731     }
732
733     /* Increment number of outer iterations */
734     outeriter        += nri;
735
736     /* Update outer/inner flops */
737
738     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*162);
739 }
740 /*
741  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_256_double
742  * Electrostatics interaction: Ewald
743  * VdW interaction:            LennardJones
744  * Geometry:                   Water3-Particle
745  * Calculate force/pot:        Force
746  */
747 void
748 nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_256_double
749                     (t_nblist                    * gmx_restrict       nlist,
750                      rvec                        * gmx_restrict          xx,
751                      rvec                        * gmx_restrict          ff,
752                      struct t_forcerec           * gmx_restrict          fr,
753                      t_mdatoms                   * gmx_restrict     mdatoms,
754                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
755                      t_nrnb                      * gmx_restrict        nrnb)
756 {
757     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
758      * just 0 for non-waters.
759      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
760      * jnr indices corresponding to data put in the four positions in the SIMD register.
761      */
762     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
763     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
764     int              jnrA,jnrB,jnrC,jnrD;
765     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
766     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
767     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
768     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
769     real             rcutoff_scalar;
770     real             *shiftvec,*fshift,*x,*f;
771     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
772     real             scratch[4*DIM];
773     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
774     real *           vdwioffsetptr0;
775     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
776     real *           vdwioffsetptr1;
777     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
778     real *           vdwioffsetptr2;
779     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
780     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
781     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
782     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
783     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
784     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
785     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
786     real             *charge;
787     int              nvdwtype;
788     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
789     int              *vdwtype;
790     real             *vdwparam;
791     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
792     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
793     __m128i          ewitab;
794     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
795     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
796     real             *ewtab;
797     __m256d          dummy_mask,cutoff_mask;
798     __m128           tmpmask0,tmpmask1;
799     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
800     __m256d          one     = _mm256_set1_pd(1.0);
801     __m256d          two     = _mm256_set1_pd(2.0);
802     x                = xx[0];
803     f                = ff[0];
804
805     nri              = nlist->nri;
806     iinr             = nlist->iinr;
807     jindex           = nlist->jindex;
808     jjnr             = nlist->jjnr;
809     shiftidx         = nlist->shift;
810     gid              = nlist->gid;
811     shiftvec         = fr->shift_vec[0];
812     fshift           = fr->fshift[0];
813     facel            = _mm256_set1_pd(fr->ic->epsfac);
814     charge           = mdatoms->chargeA;
815     nvdwtype         = fr->ntype;
816     vdwparam         = fr->nbfp;
817     vdwtype          = mdatoms->typeA;
818
819     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
820     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
821     beta2            = _mm256_mul_pd(beta,beta);
822     beta3            = _mm256_mul_pd(beta,beta2);
823
824     ewtab            = fr->ic->tabq_coul_F;
825     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
826     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
827
828     /* Setup water-specific parameters */
829     inr              = nlist->iinr[0];
830     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
831     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
832     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
833     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
834
835     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
836     rcutoff_scalar   = fr->ic->rcoulomb;
837     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
838     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
839
840     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
841     rvdw             = _mm256_set1_pd(fr->ic->rvdw);
842
843     /* Avoid stupid compiler warnings */
844     jnrA = jnrB = jnrC = jnrD = 0;
845     j_coord_offsetA = 0;
846     j_coord_offsetB = 0;
847     j_coord_offsetC = 0;
848     j_coord_offsetD = 0;
849
850     outeriter        = 0;
851     inneriter        = 0;
852
853     for(iidx=0;iidx<4*DIM;iidx++)
854     {
855         scratch[iidx] = 0.0;
856     }
857
858     /* Start outer loop over neighborlists */
859     for(iidx=0; iidx<nri; iidx++)
860     {
861         /* Load shift vector for this list */
862         i_shift_offset   = DIM*shiftidx[iidx];
863
864         /* Load limits for loop over neighbors */
865         j_index_start    = jindex[iidx];
866         j_index_end      = jindex[iidx+1];
867
868         /* Get outer coordinate index */
869         inr              = iinr[iidx];
870         i_coord_offset   = DIM*inr;
871
872         /* Load i particle coords and add shift vector */
873         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
874                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
875
876         fix0             = _mm256_setzero_pd();
877         fiy0             = _mm256_setzero_pd();
878         fiz0             = _mm256_setzero_pd();
879         fix1             = _mm256_setzero_pd();
880         fiy1             = _mm256_setzero_pd();
881         fiz1             = _mm256_setzero_pd();
882         fix2             = _mm256_setzero_pd();
883         fiy2             = _mm256_setzero_pd();
884         fiz2             = _mm256_setzero_pd();
885
886         /* Start inner kernel loop */
887         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
888         {
889
890             /* Get j neighbor index, and coordinate index */
891             jnrA             = jjnr[jidx];
892             jnrB             = jjnr[jidx+1];
893             jnrC             = jjnr[jidx+2];
894             jnrD             = jjnr[jidx+3];
895             j_coord_offsetA  = DIM*jnrA;
896             j_coord_offsetB  = DIM*jnrB;
897             j_coord_offsetC  = DIM*jnrC;
898             j_coord_offsetD  = DIM*jnrD;
899
900             /* load j atom coordinates */
901             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
902                                                  x+j_coord_offsetC,x+j_coord_offsetD,
903                                                  &jx0,&jy0,&jz0);
904
905             /* Calculate displacement vector */
906             dx00             = _mm256_sub_pd(ix0,jx0);
907             dy00             = _mm256_sub_pd(iy0,jy0);
908             dz00             = _mm256_sub_pd(iz0,jz0);
909             dx10             = _mm256_sub_pd(ix1,jx0);
910             dy10             = _mm256_sub_pd(iy1,jy0);
911             dz10             = _mm256_sub_pd(iz1,jz0);
912             dx20             = _mm256_sub_pd(ix2,jx0);
913             dy20             = _mm256_sub_pd(iy2,jy0);
914             dz20             = _mm256_sub_pd(iz2,jz0);
915
916             /* Calculate squared distance and things based on it */
917             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
918             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
919             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
920
921             rinv00           = avx256_invsqrt_d(rsq00);
922             rinv10           = avx256_invsqrt_d(rsq10);
923             rinv20           = avx256_invsqrt_d(rsq20);
924
925             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
926             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
927             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
928
929             /* Load parameters for j particles */
930             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
931                                                                  charge+jnrC+0,charge+jnrD+0);
932             vdwjidx0A        = 2*vdwtype[jnrA+0];
933             vdwjidx0B        = 2*vdwtype[jnrB+0];
934             vdwjidx0C        = 2*vdwtype[jnrC+0];
935             vdwjidx0D        = 2*vdwtype[jnrD+0];
936
937             fjx0             = _mm256_setzero_pd();
938             fjy0             = _mm256_setzero_pd();
939             fjz0             = _mm256_setzero_pd();
940
941             /**************************
942              * CALCULATE INTERACTIONS *
943              **************************/
944
945             if (gmx_mm256_any_lt(rsq00,rcutoff2))
946             {
947
948             r00              = _mm256_mul_pd(rsq00,rinv00);
949
950             /* Compute parameters for interactions between i and j atoms */
951             qq00             = _mm256_mul_pd(iq0,jq0);
952             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
953                                             vdwioffsetptr0+vdwjidx0B,
954                                             vdwioffsetptr0+vdwjidx0C,
955                                             vdwioffsetptr0+vdwjidx0D,
956                                             &c6_00,&c12_00);
957
958             /* EWALD ELECTROSTATICS */
959
960             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
961             ewrt             = _mm256_mul_pd(r00,ewtabscale);
962             ewitab           = _mm256_cvttpd_epi32(ewrt);
963             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
964             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
965                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
966                                             &ewtabF,&ewtabFn);
967             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
968             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
969
970             /* LENNARD-JONES DISPERSION/REPULSION */
971
972             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
973             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
974
975             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
976
977             fscal            = _mm256_add_pd(felec,fvdw);
978
979             fscal            = _mm256_and_pd(fscal,cutoff_mask);
980
981             /* Calculate temporary vectorial force */
982             tx               = _mm256_mul_pd(fscal,dx00);
983             ty               = _mm256_mul_pd(fscal,dy00);
984             tz               = _mm256_mul_pd(fscal,dz00);
985
986             /* Update vectorial force */
987             fix0             = _mm256_add_pd(fix0,tx);
988             fiy0             = _mm256_add_pd(fiy0,ty);
989             fiz0             = _mm256_add_pd(fiz0,tz);
990
991             fjx0             = _mm256_add_pd(fjx0,tx);
992             fjy0             = _mm256_add_pd(fjy0,ty);
993             fjz0             = _mm256_add_pd(fjz0,tz);
994
995             }
996
997             /**************************
998              * CALCULATE INTERACTIONS *
999              **************************/
1000
1001             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1002             {
1003
1004             r10              = _mm256_mul_pd(rsq10,rinv10);
1005
1006             /* Compute parameters for interactions between i and j atoms */
1007             qq10             = _mm256_mul_pd(iq1,jq0);
1008
1009             /* EWALD ELECTROSTATICS */
1010
1011             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1012             ewrt             = _mm256_mul_pd(r10,ewtabscale);
1013             ewitab           = _mm256_cvttpd_epi32(ewrt);
1014             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1015             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1016                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1017                                             &ewtabF,&ewtabFn);
1018             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1019             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1020
1021             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1022
1023             fscal            = felec;
1024
1025             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1026
1027             /* Calculate temporary vectorial force */
1028             tx               = _mm256_mul_pd(fscal,dx10);
1029             ty               = _mm256_mul_pd(fscal,dy10);
1030             tz               = _mm256_mul_pd(fscal,dz10);
1031
1032             /* Update vectorial force */
1033             fix1             = _mm256_add_pd(fix1,tx);
1034             fiy1             = _mm256_add_pd(fiy1,ty);
1035             fiz1             = _mm256_add_pd(fiz1,tz);
1036
1037             fjx0             = _mm256_add_pd(fjx0,tx);
1038             fjy0             = _mm256_add_pd(fjy0,ty);
1039             fjz0             = _mm256_add_pd(fjz0,tz);
1040
1041             }
1042
1043             /**************************
1044              * CALCULATE INTERACTIONS *
1045              **************************/
1046
1047             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1048             {
1049
1050             r20              = _mm256_mul_pd(rsq20,rinv20);
1051
1052             /* Compute parameters for interactions between i and j atoms */
1053             qq20             = _mm256_mul_pd(iq2,jq0);
1054
1055             /* EWALD ELECTROSTATICS */
1056
1057             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1058             ewrt             = _mm256_mul_pd(r20,ewtabscale);
1059             ewitab           = _mm256_cvttpd_epi32(ewrt);
1060             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1061             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1062                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1063                                             &ewtabF,&ewtabFn);
1064             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1065             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1066
1067             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1068
1069             fscal            = felec;
1070
1071             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1072
1073             /* Calculate temporary vectorial force */
1074             tx               = _mm256_mul_pd(fscal,dx20);
1075             ty               = _mm256_mul_pd(fscal,dy20);
1076             tz               = _mm256_mul_pd(fscal,dz20);
1077
1078             /* Update vectorial force */
1079             fix2             = _mm256_add_pd(fix2,tx);
1080             fiy2             = _mm256_add_pd(fiy2,ty);
1081             fiz2             = _mm256_add_pd(fiz2,tz);
1082
1083             fjx0             = _mm256_add_pd(fjx0,tx);
1084             fjy0             = _mm256_add_pd(fjy0,ty);
1085             fjz0             = _mm256_add_pd(fjz0,tz);
1086
1087             }
1088
1089             fjptrA             = f+j_coord_offsetA;
1090             fjptrB             = f+j_coord_offsetB;
1091             fjptrC             = f+j_coord_offsetC;
1092             fjptrD             = f+j_coord_offsetD;
1093
1094             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1095
1096             /* Inner loop uses 127 flops */
1097         }
1098
1099         if(jidx<j_index_end)
1100         {
1101
1102             /* Get j neighbor index, and coordinate index */
1103             jnrlistA         = jjnr[jidx];
1104             jnrlistB         = jjnr[jidx+1];
1105             jnrlistC         = jjnr[jidx+2];
1106             jnrlistD         = jjnr[jidx+3];
1107             /* Sign of each element will be negative for non-real atoms.
1108              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1109              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1110              */
1111             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1112
1113             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1114             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1115             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1116
1117             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1118             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1119             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1120             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1121             j_coord_offsetA  = DIM*jnrA;
1122             j_coord_offsetB  = DIM*jnrB;
1123             j_coord_offsetC  = DIM*jnrC;
1124             j_coord_offsetD  = DIM*jnrD;
1125
1126             /* load j atom coordinates */
1127             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1128                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1129                                                  &jx0,&jy0,&jz0);
1130
1131             /* Calculate displacement vector */
1132             dx00             = _mm256_sub_pd(ix0,jx0);
1133             dy00             = _mm256_sub_pd(iy0,jy0);
1134             dz00             = _mm256_sub_pd(iz0,jz0);
1135             dx10             = _mm256_sub_pd(ix1,jx0);
1136             dy10             = _mm256_sub_pd(iy1,jy0);
1137             dz10             = _mm256_sub_pd(iz1,jz0);
1138             dx20             = _mm256_sub_pd(ix2,jx0);
1139             dy20             = _mm256_sub_pd(iy2,jy0);
1140             dz20             = _mm256_sub_pd(iz2,jz0);
1141
1142             /* Calculate squared distance and things based on it */
1143             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1144             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1145             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1146
1147             rinv00           = avx256_invsqrt_d(rsq00);
1148             rinv10           = avx256_invsqrt_d(rsq10);
1149             rinv20           = avx256_invsqrt_d(rsq20);
1150
1151             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1152             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1153             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1154
1155             /* Load parameters for j particles */
1156             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1157                                                                  charge+jnrC+0,charge+jnrD+0);
1158             vdwjidx0A        = 2*vdwtype[jnrA+0];
1159             vdwjidx0B        = 2*vdwtype[jnrB+0];
1160             vdwjidx0C        = 2*vdwtype[jnrC+0];
1161             vdwjidx0D        = 2*vdwtype[jnrD+0];
1162
1163             fjx0             = _mm256_setzero_pd();
1164             fjy0             = _mm256_setzero_pd();
1165             fjz0             = _mm256_setzero_pd();
1166
1167             /**************************
1168              * CALCULATE INTERACTIONS *
1169              **************************/
1170
1171             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1172             {
1173
1174             r00              = _mm256_mul_pd(rsq00,rinv00);
1175             r00              = _mm256_andnot_pd(dummy_mask,r00);
1176
1177             /* Compute parameters for interactions between i and j atoms */
1178             qq00             = _mm256_mul_pd(iq0,jq0);
1179             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1180                                             vdwioffsetptr0+vdwjidx0B,
1181                                             vdwioffsetptr0+vdwjidx0C,
1182                                             vdwioffsetptr0+vdwjidx0D,
1183                                             &c6_00,&c12_00);
1184
1185             /* EWALD ELECTROSTATICS */
1186
1187             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1188             ewrt             = _mm256_mul_pd(r00,ewtabscale);
1189             ewitab           = _mm256_cvttpd_epi32(ewrt);
1190             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1191             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1192                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1193                                             &ewtabF,&ewtabFn);
1194             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1195             felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1196
1197             /* LENNARD-JONES DISPERSION/REPULSION */
1198
1199             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1200             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1201
1202             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1203
1204             fscal            = _mm256_add_pd(felec,fvdw);
1205
1206             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1207
1208             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1209
1210             /* Calculate temporary vectorial force */
1211             tx               = _mm256_mul_pd(fscal,dx00);
1212             ty               = _mm256_mul_pd(fscal,dy00);
1213             tz               = _mm256_mul_pd(fscal,dz00);
1214
1215             /* Update vectorial force */
1216             fix0             = _mm256_add_pd(fix0,tx);
1217             fiy0             = _mm256_add_pd(fiy0,ty);
1218             fiz0             = _mm256_add_pd(fiz0,tz);
1219
1220             fjx0             = _mm256_add_pd(fjx0,tx);
1221             fjy0             = _mm256_add_pd(fjy0,ty);
1222             fjz0             = _mm256_add_pd(fjz0,tz);
1223
1224             }
1225
1226             /**************************
1227              * CALCULATE INTERACTIONS *
1228              **************************/
1229
1230             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1231             {
1232
1233             r10              = _mm256_mul_pd(rsq10,rinv10);
1234             r10              = _mm256_andnot_pd(dummy_mask,r10);
1235
1236             /* Compute parameters for interactions between i and j atoms */
1237             qq10             = _mm256_mul_pd(iq1,jq0);
1238
1239             /* EWALD ELECTROSTATICS */
1240
1241             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1242             ewrt             = _mm256_mul_pd(r10,ewtabscale);
1243             ewitab           = _mm256_cvttpd_epi32(ewrt);
1244             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1245             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1246                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1247                                             &ewtabF,&ewtabFn);
1248             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1249             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1250
1251             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1252
1253             fscal            = felec;
1254
1255             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1256
1257             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1258
1259             /* Calculate temporary vectorial force */
1260             tx               = _mm256_mul_pd(fscal,dx10);
1261             ty               = _mm256_mul_pd(fscal,dy10);
1262             tz               = _mm256_mul_pd(fscal,dz10);
1263
1264             /* Update vectorial force */
1265             fix1             = _mm256_add_pd(fix1,tx);
1266             fiy1             = _mm256_add_pd(fiy1,ty);
1267             fiz1             = _mm256_add_pd(fiz1,tz);
1268
1269             fjx0             = _mm256_add_pd(fjx0,tx);
1270             fjy0             = _mm256_add_pd(fjy0,ty);
1271             fjz0             = _mm256_add_pd(fjz0,tz);
1272
1273             }
1274
1275             /**************************
1276              * CALCULATE INTERACTIONS *
1277              **************************/
1278
1279             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1280             {
1281
1282             r20              = _mm256_mul_pd(rsq20,rinv20);
1283             r20              = _mm256_andnot_pd(dummy_mask,r20);
1284
1285             /* Compute parameters for interactions between i and j atoms */
1286             qq20             = _mm256_mul_pd(iq2,jq0);
1287
1288             /* EWALD ELECTROSTATICS */
1289
1290             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1291             ewrt             = _mm256_mul_pd(r20,ewtabscale);
1292             ewitab           = _mm256_cvttpd_epi32(ewrt);
1293             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1294             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1295                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1296                                             &ewtabF,&ewtabFn);
1297             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1298             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1299
1300             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1301
1302             fscal            = felec;
1303
1304             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1305
1306             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1307
1308             /* Calculate temporary vectorial force */
1309             tx               = _mm256_mul_pd(fscal,dx20);
1310             ty               = _mm256_mul_pd(fscal,dy20);
1311             tz               = _mm256_mul_pd(fscal,dz20);
1312
1313             /* Update vectorial force */
1314             fix2             = _mm256_add_pd(fix2,tx);
1315             fiy2             = _mm256_add_pd(fiy2,ty);
1316             fiz2             = _mm256_add_pd(fiz2,tz);
1317
1318             fjx0             = _mm256_add_pd(fjx0,tx);
1319             fjy0             = _mm256_add_pd(fjy0,ty);
1320             fjz0             = _mm256_add_pd(fjz0,tz);
1321
1322             }
1323
1324             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1325             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1326             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1327             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1328
1329             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1330
1331             /* Inner loop uses 130 flops */
1332         }
1333
1334         /* End of innermost loop */
1335
1336         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1337                                                  f+i_coord_offset,fshift+i_shift_offset);
1338
1339         /* Increment number of inner iterations */
1340         inneriter                  += j_index_end - j_index_start;
1341
1342         /* Outer loop uses 18 flops */
1343     }
1344
1345     /* Increment number of outer iterations */
1346     outeriter        += nri;
1347
1348     /* Update outer/inner flops */
1349
1350     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*130);
1351 }