Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEw_VdwLJ_GeomW4P1_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_256_double
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water4-Particle
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_256_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     real *           vdwioffsetptr0;
84     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     real *           vdwioffsetptr1;
86     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     real *           vdwioffsetptr2;
88     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     real *           vdwioffsetptr3;
90     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
96     __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
97     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
98     real             *charge;
99     int              nvdwtype;
100     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
101     int              *vdwtype;
102     real             *vdwparam;
103     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
104     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
105     __m128i          ewitab;
106     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
107     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
108     real             *ewtab;
109     __m256d          dummy_mask,cutoff_mask;
110     __m128           tmpmask0,tmpmask1;
111     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
112     __m256d          one     = _mm256_set1_pd(1.0);
113     __m256d          two     = _mm256_set1_pd(2.0);
114     x                = xx[0];
115     f                = ff[0];
116
117     nri              = nlist->nri;
118     iinr             = nlist->iinr;
119     jindex           = nlist->jindex;
120     jjnr             = nlist->jjnr;
121     shiftidx         = nlist->shift;
122     gid              = nlist->gid;
123     shiftvec         = fr->shift_vec[0];
124     fshift           = fr->fshift[0];
125     facel            = _mm256_set1_pd(fr->ic->epsfac);
126     charge           = mdatoms->chargeA;
127     nvdwtype         = fr->ntype;
128     vdwparam         = fr->nbfp;
129     vdwtype          = mdatoms->typeA;
130
131     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
132     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
133     beta2            = _mm256_mul_pd(beta,beta);
134     beta3            = _mm256_mul_pd(beta,beta2);
135
136     ewtab            = fr->ic->tabq_coul_FDV0;
137     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
138     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
139
140     /* Setup water-specific parameters */
141     inr              = nlist->iinr[0];
142     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
143     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
144     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
145     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
146
147     /* Avoid stupid compiler warnings */
148     jnrA = jnrB = jnrC = jnrD = 0;
149     j_coord_offsetA = 0;
150     j_coord_offsetB = 0;
151     j_coord_offsetC = 0;
152     j_coord_offsetD = 0;
153
154     outeriter        = 0;
155     inneriter        = 0;
156
157     for(iidx=0;iidx<4*DIM;iidx++)
158     {
159         scratch[iidx] = 0.0;
160     }
161
162     /* Start outer loop over neighborlists */
163     for(iidx=0; iidx<nri; iidx++)
164     {
165         /* Load shift vector for this list */
166         i_shift_offset   = DIM*shiftidx[iidx];
167
168         /* Load limits for loop over neighbors */
169         j_index_start    = jindex[iidx];
170         j_index_end      = jindex[iidx+1];
171
172         /* Get outer coordinate index */
173         inr              = iinr[iidx];
174         i_coord_offset   = DIM*inr;
175
176         /* Load i particle coords and add shift vector */
177         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
178                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
179
180         fix0             = _mm256_setzero_pd();
181         fiy0             = _mm256_setzero_pd();
182         fiz0             = _mm256_setzero_pd();
183         fix1             = _mm256_setzero_pd();
184         fiy1             = _mm256_setzero_pd();
185         fiz1             = _mm256_setzero_pd();
186         fix2             = _mm256_setzero_pd();
187         fiy2             = _mm256_setzero_pd();
188         fiz2             = _mm256_setzero_pd();
189         fix3             = _mm256_setzero_pd();
190         fiy3             = _mm256_setzero_pd();
191         fiz3             = _mm256_setzero_pd();
192
193         /* Reset potential sums */
194         velecsum         = _mm256_setzero_pd();
195         vvdwsum          = _mm256_setzero_pd();
196
197         /* Start inner kernel loop */
198         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
199         {
200
201             /* Get j neighbor index, and coordinate index */
202             jnrA             = jjnr[jidx];
203             jnrB             = jjnr[jidx+1];
204             jnrC             = jjnr[jidx+2];
205             jnrD             = jjnr[jidx+3];
206             j_coord_offsetA  = DIM*jnrA;
207             j_coord_offsetB  = DIM*jnrB;
208             j_coord_offsetC  = DIM*jnrC;
209             j_coord_offsetD  = DIM*jnrD;
210
211             /* load j atom coordinates */
212             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
213                                                  x+j_coord_offsetC,x+j_coord_offsetD,
214                                                  &jx0,&jy0,&jz0);
215
216             /* Calculate displacement vector */
217             dx00             = _mm256_sub_pd(ix0,jx0);
218             dy00             = _mm256_sub_pd(iy0,jy0);
219             dz00             = _mm256_sub_pd(iz0,jz0);
220             dx10             = _mm256_sub_pd(ix1,jx0);
221             dy10             = _mm256_sub_pd(iy1,jy0);
222             dz10             = _mm256_sub_pd(iz1,jz0);
223             dx20             = _mm256_sub_pd(ix2,jx0);
224             dy20             = _mm256_sub_pd(iy2,jy0);
225             dz20             = _mm256_sub_pd(iz2,jz0);
226             dx30             = _mm256_sub_pd(ix3,jx0);
227             dy30             = _mm256_sub_pd(iy3,jy0);
228             dz30             = _mm256_sub_pd(iz3,jz0);
229
230             /* Calculate squared distance and things based on it */
231             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
232             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
233             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
234             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
235
236             rinv10           = avx256_invsqrt_d(rsq10);
237             rinv20           = avx256_invsqrt_d(rsq20);
238             rinv30           = avx256_invsqrt_d(rsq30);
239
240             rinvsq00         = avx256_inv_d(rsq00);
241             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
242             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
243             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
244
245             /* Load parameters for j particles */
246             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
247                                                                  charge+jnrC+0,charge+jnrD+0);
248             vdwjidx0A        = 2*vdwtype[jnrA+0];
249             vdwjidx0B        = 2*vdwtype[jnrB+0];
250             vdwjidx0C        = 2*vdwtype[jnrC+0];
251             vdwjidx0D        = 2*vdwtype[jnrD+0];
252
253             fjx0             = _mm256_setzero_pd();
254             fjy0             = _mm256_setzero_pd();
255             fjz0             = _mm256_setzero_pd();
256
257             /**************************
258              * CALCULATE INTERACTIONS *
259              **************************/
260
261             /* Compute parameters for interactions between i and j atoms */
262             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
263                                             vdwioffsetptr0+vdwjidx0B,
264                                             vdwioffsetptr0+vdwjidx0C,
265                                             vdwioffsetptr0+vdwjidx0D,
266                                             &c6_00,&c12_00);
267
268             /* LENNARD-JONES DISPERSION/REPULSION */
269
270             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
271             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
272             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
273             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
274             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
275
276             /* Update potential sum for this i atom from the interaction with this j atom. */
277             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
278
279             fscal            = fvdw;
280
281             /* Calculate temporary vectorial force */
282             tx               = _mm256_mul_pd(fscal,dx00);
283             ty               = _mm256_mul_pd(fscal,dy00);
284             tz               = _mm256_mul_pd(fscal,dz00);
285
286             /* Update vectorial force */
287             fix0             = _mm256_add_pd(fix0,tx);
288             fiy0             = _mm256_add_pd(fiy0,ty);
289             fiz0             = _mm256_add_pd(fiz0,tz);
290
291             fjx0             = _mm256_add_pd(fjx0,tx);
292             fjy0             = _mm256_add_pd(fjy0,ty);
293             fjz0             = _mm256_add_pd(fjz0,tz);
294
295             /**************************
296              * CALCULATE INTERACTIONS *
297              **************************/
298
299             r10              = _mm256_mul_pd(rsq10,rinv10);
300
301             /* Compute parameters for interactions between i and j atoms */
302             qq10             = _mm256_mul_pd(iq1,jq0);
303
304             /* EWALD ELECTROSTATICS */
305
306             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
307             ewrt             = _mm256_mul_pd(r10,ewtabscale);
308             ewitab           = _mm256_cvttpd_epi32(ewrt);
309             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
310             ewitab           = _mm_slli_epi32(ewitab,2);
311             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
312             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
313             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
314             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
315             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
316             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
317             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
318             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
319             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
320
321             /* Update potential sum for this i atom from the interaction with this j atom. */
322             velecsum         = _mm256_add_pd(velecsum,velec);
323
324             fscal            = felec;
325
326             /* Calculate temporary vectorial force */
327             tx               = _mm256_mul_pd(fscal,dx10);
328             ty               = _mm256_mul_pd(fscal,dy10);
329             tz               = _mm256_mul_pd(fscal,dz10);
330
331             /* Update vectorial force */
332             fix1             = _mm256_add_pd(fix1,tx);
333             fiy1             = _mm256_add_pd(fiy1,ty);
334             fiz1             = _mm256_add_pd(fiz1,tz);
335
336             fjx0             = _mm256_add_pd(fjx0,tx);
337             fjy0             = _mm256_add_pd(fjy0,ty);
338             fjz0             = _mm256_add_pd(fjz0,tz);
339
340             /**************************
341              * CALCULATE INTERACTIONS *
342              **************************/
343
344             r20              = _mm256_mul_pd(rsq20,rinv20);
345
346             /* Compute parameters for interactions between i and j atoms */
347             qq20             = _mm256_mul_pd(iq2,jq0);
348
349             /* EWALD ELECTROSTATICS */
350
351             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
352             ewrt             = _mm256_mul_pd(r20,ewtabscale);
353             ewitab           = _mm256_cvttpd_epi32(ewrt);
354             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
355             ewitab           = _mm_slli_epi32(ewitab,2);
356             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
357             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
358             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
359             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
360             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
361             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
362             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
363             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
364             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
365
366             /* Update potential sum for this i atom from the interaction with this j atom. */
367             velecsum         = _mm256_add_pd(velecsum,velec);
368
369             fscal            = felec;
370
371             /* Calculate temporary vectorial force */
372             tx               = _mm256_mul_pd(fscal,dx20);
373             ty               = _mm256_mul_pd(fscal,dy20);
374             tz               = _mm256_mul_pd(fscal,dz20);
375
376             /* Update vectorial force */
377             fix2             = _mm256_add_pd(fix2,tx);
378             fiy2             = _mm256_add_pd(fiy2,ty);
379             fiz2             = _mm256_add_pd(fiz2,tz);
380
381             fjx0             = _mm256_add_pd(fjx0,tx);
382             fjy0             = _mm256_add_pd(fjy0,ty);
383             fjz0             = _mm256_add_pd(fjz0,tz);
384
385             /**************************
386              * CALCULATE INTERACTIONS *
387              **************************/
388
389             r30              = _mm256_mul_pd(rsq30,rinv30);
390
391             /* Compute parameters for interactions between i and j atoms */
392             qq30             = _mm256_mul_pd(iq3,jq0);
393
394             /* EWALD ELECTROSTATICS */
395
396             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
397             ewrt             = _mm256_mul_pd(r30,ewtabscale);
398             ewitab           = _mm256_cvttpd_epi32(ewrt);
399             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
400             ewitab           = _mm_slli_epi32(ewitab,2);
401             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
402             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
403             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
404             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
405             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
406             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
407             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
408             velec            = _mm256_mul_pd(qq30,_mm256_sub_pd(rinv30,velec));
409             felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
410
411             /* Update potential sum for this i atom from the interaction with this j atom. */
412             velecsum         = _mm256_add_pd(velecsum,velec);
413
414             fscal            = felec;
415
416             /* Calculate temporary vectorial force */
417             tx               = _mm256_mul_pd(fscal,dx30);
418             ty               = _mm256_mul_pd(fscal,dy30);
419             tz               = _mm256_mul_pd(fscal,dz30);
420
421             /* Update vectorial force */
422             fix3             = _mm256_add_pd(fix3,tx);
423             fiy3             = _mm256_add_pd(fiy3,ty);
424             fiz3             = _mm256_add_pd(fiz3,tz);
425
426             fjx0             = _mm256_add_pd(fjx0,tx);
427             fjy0             = _mm256_add_pd(fjy0,ty);
428             fjz0             = _mm256_add_pd(fjz0,tz);
429
430             fjptrA             = f+j_coord_offsetA;
431             fjptrB             = f+j_coord_offsetB;
432             fjptrC             = f+j_coord_offsetC;
433             fjptrD             = f+j_coord_offsetD;
434
435             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
436
437             /* Inner loop uses 158 flops */
438         }
439
440         if(jidx<j_index_end)
441         {
442
443             /* Get j neighbor index, and coordinate index */
444             jnrlistA         = jjnr[jidx];
445             jnrlistB         = jjnr[jidx+1];
446             jnrlistC         = jjnr[jidx+2];
447             jnrlistD         = jjnr[jidx+3];
448             /* Sign of each element will be negative for non-real atoms.
449              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
450              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
451              */
452             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
453
454             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
455             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
456             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
457
458             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
459             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
460             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
461             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
462             j_coord_offsetA  = DIM*jnrA;
463             j_coord_offsetB  = DIM*jnrB;
464             j_coord_offsetC  = DIM*jnrC;
465             j_coord_offsetD  = DIM*jnrD;
466
467             /* load j atom coordinates */
468             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
469                                                  x+j_coord_offsetC,x+j_coord_offsetD,
470                                                  &jx0,&jy0,&jz0);
471
472             /* Calculate displacement vector */
473             dx00             = _mm256_sub_pd(ix0,jx0);
474             dy00             = _mm256_sub_pd(iy0,jy0);
475             dz00             = _mm256_sub_pd(iz0,jz0);
476             dx10             = _mm256_sub_pd(ix1,jx0);
477             dy10             = _mm256_sub_pd(iy1,jy0);
478             dz10             = _mm256_sub_pd(iz1,jz0);
479             dx20             = _mm256_sub_pd(ix2,jx0);
480             dy20             = _mm256_sub_pd(iy2,jy0);
481             dz20             = _mm256_sub_pd(iz2,jz0);
482             dx30             = _mm256_sub_pd(ix3,jx0);
483             dy30             = _mm256_sub_pd(iy3,jy0);
484             dz30             = _mm256_sub_pd(iz3,jz0);
485
486             /* Calculate squared distance and things based on it */
487             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
488             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
489             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
490             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
491
492             rinv10           = avx256_invsqrt_d(rsq10);
493             rinv20           = avx256_invsqrt_d(rsq20);
494             rinv30           = avx256_invsqrt_d(rsq30);
495
496             rinvsq00         = avx256_inv_d(rsq00);
497             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
498             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
499             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
500
501             /* Load parameters for j particles */
502             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
503                                                                  charge+jnrC+0,charge+jnrD+0);
504             vdwjidx0A        = 2*vdwtype[jnrA+0];
505             vdwjidx0B        = 2*vdwtype[jnrB+0];
506             vdwjidx0C        = 2*vdwtype[jnrC+0];
507             vdwjidx0D        = 2*vdwtype[jnrD+0];
508
509             fjx0             = _mm256_setzero_pd();
510             fjy0             = _mm256_setzero_pd();
511             fjz0             = _mm256_setzero_pd();
512
513             /**************************
514              * CALCULATE INTERACTIONS *
515              **************************/
516
517             /* Compute parameters for interactions between i and j atoms */
518             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
519                                             vdwioffsetptr0+vdwjidx0B,
520                                             vdwioffsetptr0+vdwjidx0C,
521                                             vdwioffsetptr0+vdwjidx0D,
522                                             &c6_00,&c12_00);
523
524             /* LENNARD-JONES DISPERSION/REPULSION */
525
526             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
527             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
528             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
529             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
530             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
531
532             /* Update potential sum for this i atom from the interaction with this j atom. */
533             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
534             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
535
536             fscal            = fvdw;
537
538             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
539
540             /* Calculate temporary vectorial force */
541             tx               = _mm256_mul_pd(fscal,dx00);
542             ty               = _mm256_mul_pd(fscal,dy00);
543             tz               = _mm256_mul_pd(fscal,dz00);
544
545             /* Update vectorial force */
546             fix0             = _mm256_add_pd(fix0,tx);
547             fiy0             = _mm256_add_pd(fiy0,ty);
548             fiz0             = _mm256_add_pd(fiz0,tz);
549
550             fjx0             = _mm256_add_pd(fjx0,tx);
551             fjy0             = _mm256_add_pd(fjy0,ty);
552             fjz0             = _mm256_add_pd(fjz0,tz);
553
554             /**************************
555              * CALCULATE INTERACTIONS *
556              **************************/
557
558             r10              = _mm256_mul_pd(rsq10,rinv10);
559             r10              = _mm256_andnot_pd(dummy_mask,r10);
560
561             /* Compute parameters for interactions between i and j atoms */
562             qq10             = _mm256_mul_pd(iq1,jq0);
563
564             /* EWALD ELECTROSTATICS */
565
566             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
567             ewrt             = _mm256_mul_pd(r10,ewtabscale);
568             ewitab           = _mm256_cvttpd_epi32(ewrt);
569             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
570             ewitab           = _mm_slli_epi32(ewitab,2);
571             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
572             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
573             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
574             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
575             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
576             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
577             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
578             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
579             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
580
581             /* Update potential sum for this i atom from the interaction with this j atom. */
582             velec            = _mm256_andnot_pd(dummy_mask,velec);
583             velecsum         = _mm256_add_pd(velecsum,velec);
584
585             fscal            = felec;
586
587             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
588
589             /* Calculate temporary vectorial force */
590             tx               = _mm256_mul_pd(fscal,dx10);
591             ty               = _mm256_mul_pd(fscal,dy10);
592             tz               = _mm256_mul_pd(fscal,dz10);
593
594             /* Update vectorial force */
595             fix1             = _mm256_add_pd(fix1,tx);
596             fiy1             = _mm256_add_pd(fiy1,ty);
597             fiz1             = _mm256_add_pd(fiz1,tz);
598
599             fjx0             = _mm256_add_pd(fjx0,tx);
600             fjy0             = _mm256_add_pd(fjy0,ty);
601             fjz0             = _mm256_add_pd(fjz0,tz);
602
603             /**************************
604              * CALCULATE INTERACTIONS *
605              **************************/
606
607             r20              = _mm256_mul_pd(rsq20,rinv20);
608             r20              = _mm256_andnot_pd(dummy_mask,r20);
609
610             /* Compute parameters for interactions between i and j atoms */
611             qq20             = _mm256_mul_pd(iq2,jq0);
612
613             /* EWALD ELECTROSTATICS */
614
615             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
616             ewrt             = _mm256_mul_pd(r20,ewtabscale);
617             ewitab           = _mm256_cvttpd_epi32(ewrt);
618             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
619             ewitab           = _mm_slli_epi32(ewitab,2);
620             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
621             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
622             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
623             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
624             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
625             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
626             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
627             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
628             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
629
630             /* Update potential sum for this i atom from the interaction with this j atom. */
631             velec            = _mm256_andnot_pd(dummy_mask,velec);
632             velecsum         = _mm256_add_pd(velecsum,velec);
633
634             fscal            = felec;
635
636             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
637
638             /* Calculate temporary vectorial force */
639             tx               = _mm256_mul_pd(fscal,dx20);
640             ty               = _mm256_mul_pd(fscal,dy20);
641             tz               = _mm256_mul_pd(fscal,dz20);
642
643             /* Update vectorial force */
644             fix2             = _mm256_add_pd(fix2,tx);
645             fiy2             = _mm256_add_pd(fiy2,ty);
646             fiz2             = _mm256_add_pd(fiz2,tz);
647
648             fjx0             = _mm256_add_pd(fjx0,tx);
649             fjy0             = _mm256_add_pd(fjy0,ty);
650             fjz0             = _mm256_add_pd(fjz0,tz);
651
652             /**************************
653              * CALCULATE INTERACTIONS *
654              **************************/
655
656             r30              = _mm256_mul_pd(rsq30,rinv30);
657             r30              = _mm256_andnot_pd(dummy_mask,r30);
658
659             /* Compute parameters for interactions between i and j atoms */
660             qq30             = _mm256_mul_pd(iq3,jq0);
661
662             /* EWALD ELECTROSTATICS */
663
664             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
665             ewrt             = _mm256_mul_pd(r30,ewtabscale);
666             ewitab           = _mm256_cvttpd_epi32(ewrt);
667             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
668             ewitab           = _mm_slli_epi32(ewitab,2);
669             ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
670             ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
671             ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
672             ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
673             GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
674             felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
675             velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
676             velec            = _mm256_mul_pd(qq30,_mm256_sub_pd(rinv30,velec));
677             felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
678
679             /* Update potential sum for this i atom from the interaction with this j atom. */
680             velec            = _mm256_andnot_pd(dummy_mask,velec);
681             velecsum         = _mm256_add_pd(velecsum,velec);
682
683             fscal            = felec;
684
685             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
686
687             /* Calculate temporary vectorial force */
688             tx               = _mm256_mul_pd(fscal,dx30);
689             ty               = _mm256_mul_pd(fscal,dy30);
690             tz               = _mm256_mul_pd(fscal,dz30);
691
692             /* Update vectorial force */
693             fix3             = _mm256_add_pd(fix3,tx);
694             fiy3             = _mm256_add_pd(fiy3,ty);
695             fiz3             = _mm256_add_pd(fiz3,tz);
696
697             fjx0             = _mm256_add_pd(fjx0,tx);
698             fjy0             = _mm256_add_pd(fjy0,ty);
699             fjz0             = _mm256_add_pd(fjz0,tz);
700
701             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
702             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
703             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
704             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
705
706             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
707
708             /* Inner loop uses 161 flops */
709         }
710
711         /* End of innermost loop */
712
713         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
714                                                  f+i_coord_offset,fshift+i_shift_offset);
715
716         ggid                        = gid[iidx];
717         /* Update potential energies */
718         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
719         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
720
721         /* Increment number of inner iterations */
722         inneriter                  += j_index_end - j_index_start;
723
724         /* Outer loop uses 26 flops */
725     }
726
727     /* Increment number of outer iterations */
728     outeriter        += nri;
729
730     /* Update outer/inner flops */
731
732     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*161);
733 }
734 /*
735  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_256_double
736  * Electrostatics interaction: Ewald
737  * VdW interaction:            LennardJones
738  * Geometry:                   Water4-Particle
739  * Calculate force/pot:        Force
740  */
741 void
742 nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_256_double
743                     (t_nblist                    * gmx_restrict       nlist,
744                      rvec                        * gmx_restrict          xx,
745                      rvec                        * gmx_restrict          ff,
746                      struct t_forcerec           * gmx_restrict          fr,
747                      t_mdatoms                   * gmx_restrict     mdatoms,
748                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
749                      t_nrnb                      * gmx_restrict        nrnb)
750 {
751     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
752      * just 0 for non-waters.
753      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
754      * jnr indices corresponding to data put in the four positions in the SIMD register.
755      */
756     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
757     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
758     int              jnrA,jnrB,jnrC,jnrD;
759     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
760     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
761     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
762     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
763     real             rcutoff_scalar;
764     real             *shiftvec,*fshift,*x,*f;
765     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
766     real             scratch[4*DIM];
767     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
768     real *           vdwioffsetptr0;
769     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
770     real *           vdwioffsetptr1;
771     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
772     real *           vdwioffsetptr2;
773     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
774     real *           vdwioffsetptr3;
775     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
776     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
777     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
778     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
779     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
780     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
781     __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
782     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
783     real             *charge;
784     int              nvdwtype;
785     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
786     int              *vdwtype;
787     real             *vdwparam;
788     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
789     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
790     __m128i          ewitab;
791     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
792     __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
793     real             *ewtab;
794     __m256d          dummy_mask,cutoff_mask;
795     __m128           tmpmask0,tmpmask1;
796     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
797     __m256d          one     = _mm256_set1_pd(1.0);
798     __m256d          two     = _mm256_set1_pd(2.0);
799     x                = xx[0];
800     f                = ff[0];
801
802     nri              = nlist->nri;
803     iinr             = nlist->iinr;
804     jindex           = nlist->jindex;
805     jjnr             = nlist->jjnr;
806     shiftidx         = nlist->shift;
807     gid              = nlist->gid;
808     shiftvec         = fr->shift_vec[0];
809     fshift           = fr->fshift[0];
810     facel            = _mm256_set1_pd(fr->ic->epsfac);
811     charge           = mdatoms->chargeA;
812     nvdwtype         = fr->ntype;
813     vdwparam         = fr->nbfp;
814     vdwtype          = mdatoms->typeA;
815
816     sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
817     beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
818     beta2            = _mm256_mul_pd(beta,beta);
819     beta3            = _mm256_mul_pd(beta,beta2);
820
821     ewtab            = fr->ic->tabq_coul_F;
822     ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
823     ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
824
825     /* Setup water-specific parameters */
826     inr              = nlist->iinr[0];
827     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
828     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
829     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
830     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
831
832     /* Avoid stupid compiler warnings */
833     jnrA = jnrB = jnrC = jnrD = 0;
834     j_coord_offsetA = 0;
835     j_coord_offsetB = 0;
836     j_coord_offsetC = 0;
837     j_coord_offsetD = 0;
838
839     outeriter        = 0;
840     inneriter        = 0;
841
842     for(iidx=0;iidx<4*DIM;iidx++)
843     {
844         scratch[iidx] = 0.0;
845     }
846
847     /* Start outer loop over neighborlists */
848     for(iidx=0; iidx<nri; iidx++)
849     {
850         /* Load shift vector for this list */
851         i_shift_offset   = DIM*shiftidx[iidx];
852
853         /* Load limits for loop over neighbors */
854         j_index_start    = jindex[iidx];
855         j_index_end      = jindex[iidx+1];
856
857         /* Get outer coordinate index */
858         inr              = iinr[iidx];
859         i_coord_offset   = DIM*inr;
860
861         /* Load i particle coords and add shift vector */
862         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
863                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
864
865         fix0             = _mm256_setzero_pd();
866         fiy0             = _mm256_setzero_pd();
867         fiz0             = _mm256_setzero_pd();
868         fix1             = _mm256_setzero_pd();
869         fiy1             = _mm256_setzero_pd();
870         fiz1             = _mm256_setzero_pd();
871         fix2             = _mm256_setzero_pd();
872         fiy2             = _mm256_setzero_pd();
873         fiz2             = _mm256_setzero_pd();
874         fix3             = _mm256_setzero_pd();
875         fiy3             = _mm256_setzero_pd();
876         fiz3             = _mm256_setzero_pd();
877
878         /* Start inner kernel loop */
879         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
880         {
881
882             /* Get j neighbor index, and coordinate index */
883             jnrA             = jjnr[jidx];
884             jnrB             = jjnr[jidx+1];
885             jnrC             = jjnr[jidx+2];
886             jnrD             = jjnr[jidx+3];
887             j_coord_offsetA  = DIM*jnrA;
888             j_coord_offsetB  = DIM*jnrB;
889             j_coord_offsetC  = DIM*jnrC;
890             j_coord_offsetD  = DIM*jnrD;
891
892             /* load j atom coordinates */
893             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
894                                                  x+j_coord_offsetC,x+j_coord_offsetD,
895                                                  &jx0,&jy0,&jz0);
896
897             /* Calculate displacement vector */
898             dx00             = _mm256_sub_pd(ix0,jx0);
899             dy00             = _mm256_sub_pd(iy0,jy0);
900             dz00             = _mm256_sub_pd(iz0,jz0);
901             dx10             = _mm256_sub_pd(ix1,jx0);
902             dy10             = _mm256_sub_pd(iy1,jy0);
903             dz10             = _mm256_sub_pd(iz1,jz0);
904             dx20             = _mm256_sub_pd(ix2,jx0);
905             dy20             = _mm256_sub_pd(iy2,jy0);
906             dz20             = _mm256_sub_pd(iz2,jz0);
907             dx30             = _mm256_sub_pd(ix3,jx0);
908             dy30             = _mm256_sub_pd(iy3,jy0);
909             dz30             = _mm256_sub_pd(iz3,jz0);
910
911             /* Calculate squared distance and things based on it */
912             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
913             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
914             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
915             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
916
917             rinv10           = avx256_invsqrt_d(rsq10);
918             rinv20           = avx256_invsqrt_d(rsq20);
919             rinv30           = avx256_invsqrt_d(rsq30);
920
921             rinvsq00         = avx256_inv_d(rsq00);
922             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
923             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
924             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
925
926             /* Load parameters for j particles */
927             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
928                                                                  charge+jnrC+0,charge+jnrD+0);
929             vdwjidx0A        = 2*vdwtype[jnrA+0];
930             vdwjidx0B        = 2*vdwtype[jnrB+0];
931             vdwjidx0C        = 2*vdwtype[jnrC+0];
932             vdwjidx0D        = 2*vdwtype[jnrD+0];
933
934             fjx0             = _mm256_setzero_pd();
935             fjy0             = _mm256_setzero_pd();
936             fjz0             = _mm256_setzero_pd();
937
938             /**************************
939              * CALCULATE INTERACTIONS *
940              **************************/
941
942             /* Compute parameters for interactions between i and j atoms */
943             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
944                                             vdwioffsetptr0+vdwjidx0B,
945                                             vdwioffsetptr0+vdwjidx0C,
946                                             vdwioffsetptr0+vdwjidx0D,
947                                             &c6_00,&c12_00);
948
949             /* LENNARD-JONES DISPERSION/REPULSION */
950
951             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
952             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
953
954             fscal            = fvdw;
955
956             /* Calculate temporary vectorial force */
957             tx               = _mm256_mul_pd(fscal,dx00);
958             ty               = _mm256_mul_pd(fscal,dy00);
959             tz               = _mm256_mul_pd(fscal,dz00);
960
961             /* Update vectorial force */
962             fix0             = _mm256_add_pd(fix0,tx);
963             fiy0             = _mm256_add_pd(fiy0,ty);
964             fiz0             = _mm256_add_pd(fiz0,tz);
965
966             fjx0             = _mm256_add_pd(fjx0,tx);
967             fjy0             = _mm256_add_pd(fjy0,ty);
968             fjz0             = _mm256_add_pd(fjz0,tz);
969
970             /**************************
971              * CALCULATE INTERACTIONS *
972              **************************/
973
974             r10              = _mm256_mul_pd(rsq10,rinv10);
975
976             /* Compute parameters for interactions between i and j atoms */
977             qq10             = _mm256_mul_pd(iq1,jq0);
978
979             /* EWALD ELECTROSTATICS */
980
981             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
982             ewrt             = _mm256_mul_pd(r10,ewtabscale);
983             ewitab           = _mm256_cvttpd_epi32(ewrt);
984             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
985             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
986                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
987                                             &ewtabF,&ewtabFn);
988             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
989             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
990
991             fscal            = felec;
992
993             /* Calculate temporary vectorial force */
994             tx               = _mm256_mul_pd(fscal,dx10);
995             ty               = _mm256_mul_pd(fscal,dy10);
996             tz               = _mm256_mul_pd(fscal,dz10);
997
998             /* Update vectorial force */
999             fix1             = _mm256_add_pd(fix1,tx);
1000             fiy1             = _mm256_add_pd(fiy1,ty);
1001             fiz1             = _mm256_add_pd(fiz1,tz);
1002
1003             fjx0             = _mm256_add_pd(fjx0,tx);
1004             fjy0             = _mm256_add_pd(fjy0,ty);
1005             fjz0             = _mm256_add_pd(fjz0,tz);
1006
1007             /**************************
1008              * CALCULATE INTERACTIONS *
1009              **************************/
1010
1011             r20              = _mm256_mul_pd(rsq20,rinv20);
1012
1013             /* Compute parameters for interactions between i and j atoms */
1014             qq20             = _mm256_mul_pd(iq2,jq0);
1015
1016             /* EWALD ELECTROSTATICS */
1017
1018             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1019             ewrt             = _mm256_mul_pd(r20,ewtabscale);
1020             ewitab           = _mm256_cvttpd_epi32(ewrt);
1021             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1022             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1023                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1024                                             &ewtabF,&ewtabFn);
1025             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1026             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1027
1028             fscal            = felec;
1029
1030             /* Calculate temporary vectorial force */
1031             tx               = _mm256_mul_pd(fscal,dx20);
1032             ty               = _mm256_mul_pd(fscal,dy20);
1033             tz               = _mm256_mul_pd(fscal,dz20);
1034
1035             /* Update vectorial force */
1036             fix2             = _mm256_add_pd(fix2,tx);
1037             fiy2             = _mm256_add_pd(fiy2,ty);
1038             fiz2             = _mm256_add_pd(fiz2,tz);
1039
1040             fjx0             = _mm256_add_pd(fjx0,tx);
1041             fjy0             = _mm256_add_pd(fjy0,ty);
1042             fjz0             = _mm256_add_pd(fjz0,tz);
1043
1044             /**************************
1045              * CALCULATE INTERACTIONS *
1046              **************************/
1047
1048             r30              = _mm256_mul_pd(rsq30,rinv30);
1049
1050             /* Compute parameters for interactions between i and j atoms */
1051             qq30             = _mm256_mul_pd(iq3,jq0);
1052
1053             /* EWALD ELECTROSTATICS */
1054
1055             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1056             ewrt             = _mm256_mul_pd(r30,ewtabscale);
1057             ewitab           = _mm256_cvttpd_epi32(ewrt);
1058             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1059             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1060                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1061                                             &ewtabF,&ewtabFn);
1062             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1063             felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
1064
1065             fscal            = felec;
1066
1067             /* Calculate temporary vectorial force */
1068             tx               = _mm256_mul_pd(fscal,dx30);
1069             ty               = _mm256_mul_pd(fscal,dy30);
1070             tz               = _mm256_mul_pd(fscal,dz30);
1071
1072             /* Update vectorial force */
1073             fix3             = _mm256_add_pd(fix3,tx);
1074             fiy3             = _mm256_add_pd(fiy3,ty);
1075             fiz3             = _mm256_add_pd(fiz3,tz);
1076
1077             fjx0             = _mm256_add_pd(fjx0,tx);
1078             fjy0             = _mm256_add_pd(fjy0,ty);
1079             fjz0             = _mm256_add_pd(fjz0,tz);
1080
1081             fjptrA             = f+j_coord_offsetA;
1082             fjptrB             = f+j_coord_offsetB;
1083             fjptrC             = f+j_coord_offsetC;
1084             fjptrD             = f+j_coord_offsetD;
1085
1086             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1087
1088             /* Inner loop uses 138 flops */
1089         }
1090
1091         if(jidx<j_index_end)
1092         {
1093
1094             /* Get j neighbor index, and coordinate index */
1095             jnrlistA         = jjnr[jidx];
1096             jnrlistB         = jjnr[jidx+1];
1097             jnrlistC         = jjnr[jidx+2];
1098             jnrlistD         = jjnr[jidx+3];
1099             /* Sign of each element will be negative for non-real atoms.
1100              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1101              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1102              */
1103             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1104
1105             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1106             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1107             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1108
1109             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1110             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1111             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1112             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1113             j_coord_offsetA  = DIM*jnrA;
1114             j_coord_offsetB  = DIM*jnrB;
1115             j_coord_offsetC  = DIM*jnrC;
1116             j_coord_offsetD  = DIM*jnrD;
1117
1118             /* load j atom coordinates */
1119             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1120                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1121                                                  &jx0,&jy0,&jz0);
1122
1123             /* Calculate displacement vector */
1124             dx00             = _mm256_sub_pd(ix0,jx0);
1125             dy00             = _mm256_sub_pd(iy0,jy0);
1126             dz00             = _mm256_sub_pd(iz0,jz0);
1127             dx10             = _mm256_sub_pd(ix1,jx0);
1128             dy10             = _mm256_sub_pd(iy1,jy0);
1129             dz10             = _mm256_sub_pd(iz1,jz0);
1130             dx20             = _mm256_sub_pd(ix2,jx0);
1131             dy20             = _mm256_sub_pd(iy2,jy0);
1132             dz20             = _mm256_sub_pd(iz2,jz0);
1133             dx30             = _mm256_sub_pd(ix3,jx0);
1134             dy30             = _mm256_sub_pd(iy3,jy0);
1135             dz30             = _mm256_sub_pd(iz3,jz0);
1136
1137             /* Calculate squared distance and things based on it */
1138             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1139             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1140             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1141             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
1142
1143             rinv10           = avx256_invsqrt_d(rsq10);
1144             rinv20           = avx256_invsqrt_d(rsq20);
1145             rinv30           = avx256_invsqrt_d(rsq30);
1146
1147             rinvsq00         = avx256_inv_d(rsq00);
1148             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1149             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1150             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
1151
1152             /* Load parameters for j particles */
1153             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1154                                                                  charge+jnrC+0,charge+jnrD+0);
1155             vdwjidx0A        = 2*vdwtype[jnrA+0];
1156             vdwjidx0B        = 2*vdwtype[jnrB+0];
1157             vdwjidx0C        = 2*vdwtype[jnrC+0];
1158             vdwjidx0D        = 2*vdwtype[jnrD+0];
1159
1160             fjx0             = _mm256_setzero_pd();
1161             fjy0             = _mm256_setzero_pd();
1162             fjz0             = _mm256_setzero_pd();
1163
1164             /**************************
1165              * CALCULATE INTERACTIONS *
1166              **************************/
1167
1168             /* Compute parameters for interactions between i and j atoms */
1169             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1170                                             vdwioffsetptr0+vdwjidx0B,
1171                                             vdwioffsetptr0+vdwjidx0C,
1172                                             vdwioffsetptr0+vdwjidx0D,
1173                                             &c6_00,&c12_00);
1174
1175             /* LENNARD-JONES DISPERSION/REPULSION */
1176
1177             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1178             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1179
1180             fscal            = fvdw;
1181
1182             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1183
1184             /* Calculate temporary vectorial force */
1185             tx               = _mm256_mul_pd(fscal,dx00);
1186             ty               = _mm256_mul_pd(fscal,dy00);
1187             tz               = _mm256_mul_pd(fscal,dz00);
1188
1189             /* Update vectorial force */
1190             fix0             = _mm256_add_pd(fix0,tx);
1191             fiy0             = _mm256_add_pd(fiy0,ty);
1192             fiz0             = _mm256_add_pd(fiz0,tz);
1193
1194             fjx0             = _mm256_add_pd(fjx0,tx);
1195             fjy0             = _mm256_add_pd(fjy0,ty);
1196             fjz0             = _mm256_add_pd(fjz0,tz);
1197
1198             /**************************
1199              * CALCULATE INTERACTIONS *
1200              **************************/
1201
1202             r10              = _mm256_mul_pd(rsq10,rinv10);
1203             r10              = _mm256_andnot_pd(dummy_mask,r10);
1204
1205             /* Compute parameters for interactions between i and j atoms */
1206             qq10             = _mm256_mul_pd(iq1,jq0);
1207
1208             /* EWALD ELECTROSTATICS */
1209
1210             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1211             ewrt             = _mm256_mul_pd(r10,ewtabscale);
1212             ewitab           = _mm256_cvttpd_epi32(ewrt);
1213             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1214             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1215                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1216                                             &ewtabF,&ewtabFn);
1217             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1218             felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1219
1220             fscal            = felec;
1221
1222             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1223
1224             /* Calculate temporary vectorial force */
1225             tx               = _mm256_mul_pd(fscal,dx10);
1226             ty               = _mm256_mul_pd(fscal,dy10);
1227             tz               = _mm256_mul_pd(fscal,dz10);
1228
1229             /* Update vectorial force */
1230             fix1             = _mm256_add_pd(fix1,tx);
1231             fiy1             = _mm256_add_pd(fiy1,ty);
1232             fiz1             = _mm256_add_pd(fiz1,tz);
1233
1234             fjx0             = _mm256_add_pd(fjx0,tx);
1235             fjy0             = _mm256_add_pd(fjy0,ty);
1236             fjz0             = _mm256_add_pd(fjz0,tz);
1237
1238             /**************************
1239              * CALCULATE INTERACTIONS *
1240              **************************/
1241
1242             r20              = _mm256_mul_pd(rsq20,rinv20);
1243             r20              = _mm256_andnot_pd(dummy_mask,r20);
1244
1245             /* Compute parameters for interactions between i and j atoms */
1246             qq20             = _mm256_mul_pd(iq2,jq0);
1247
1248             /* EWALD ELECTROSTATICS */
1249
1250             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1251             ewrt             = _mm256_mul_pd(r20,ewtabscale);
1252             ewitab           = _mm256_cvttpd_epi32(ewrt);
1253             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1254             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1255                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1256                                             &ewtabF,&ewtabFn);
1257             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1258             felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1259
1260             fscal            = felec;
1261
1262             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1263
1264             /* Calculate temporary vectorial force */
1265             tx               = _mm256_mul_pd(fscal,dx20);
1266             ty               = _mm256_mul_pd(fscal,dy20);
1267             tz               = _mm256_mul_pd(fscal,dz20);
1268
1269             /* Update vectorial force */
1270             fix2             = _mm256_add_pd(fix2,tx);
1271             fiy2             = _mm256_add_pd(fiy2,ty);
1272             fiz2             = _mm256_add_pd(fiz2,tz);
1273
1274             fjx0             = _mm256_add_pd(fjx0,tx);
1275             fjy0             = _mm256_add_pd(fjy0,ty);
1276             fjz0             = _mm256_add_pd(fjz0,tz);
1277
1278             /**************************
1279              * CALCULATE INTERACTIONS *
1280              **************************/
1281
1282             r30              = _mm256_mul_pd(rsq30,rinv30);
1283             r30              = _mm256_andnot_pd(dummy_mask,r30);
1284
1285             /* Compute parameters for interactions between i and j atoms */
1286             qq30             = _mm256_mul_pd(iq3,jq0);
1287
1288             /* EWALD ELECTROSTATICS */
1289
1290             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1291             ewrt             = _mm256_mul_pd(r30,ewtabscale);
1292             ewitab           = _mm256_cvttpd_epi32(ewrt);
1293             eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1294             gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1295                                             ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1296                                             &ewtabF,&ewtabFn);
1297             felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1298             felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
1299
1300             fscal            = felec;
1301
1302             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1303
1304             /* Calculate temporary vectorial force */
1305             tx               = _mm256_mul_pd(fscal,dx30);
1306             ty               = _mm256_mul_pd(fscal,dy30);
1307             tz               = _mm256_mul_pd(fscal,dz30);
1308
1309             /* Update vectorial force */
1310             fix3             = _mm256_add_pd(fix3,tx);
1311             fiy3             = _mm256_add_pd(fiy3,ty);
1312             fiz3             = _mm256_add_pd(fiz3,tz);
1313
1314             fjx0             = _mm256_add_pd(fjx0,tx);
1315             fjy0             = _mm256_add_pd(fjy0,ty);
1316             fjz0             = _mm256_add_pd(fjz0,tz);
1317
1318             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1319             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1320             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1321             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1322
1323             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1324
1325             /* Inner loop uses 141 flops */
1326         }
1327
1328         /* End of innermost loop */
1329
1330         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1331                                                  f+i_coord_offset,fshift+i_shift_offset);
1332
1333         /* Increment number of inner iterations */
1334         inneriter                  += j_index_end - j_index_start;
1335
1336         /* Outer loop uses 24 flops */
1337     }
1338
1339     /* Increment number of outer iterations */
1340     outeriter        += nri;
1341
1342     /* Update outer/inner flops */
1343
1344     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*141);
1345 }