made errors during GPU detection non-fatal
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_avx_256_double.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_double
38  * Electrostatics interaction: CubicSplineTable
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water4-Particle
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
65     real             rcutoff_scalar;
66     real             *shiftvec,*fshift,*x,*f;
67     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68     real             scratch[4*DIM];
69     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70     real *           vdwioffsetptr0;
71     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72     real *           vdwioffsetptr1;
73     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74     real *           vdwioffsetptr2;
75     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76     real *           vdwioffsetptr3;
77     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
79     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
81     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
83     __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
84     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
85     real             *charge;
86     int              nvdwtype;
87     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
88     int              *vdwtype;
89     real             *vdwparam;
90     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
91     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
92     __m128i          vfitab;
93     __m128i          ifour       = _mm_set1_epi32(4);
94     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
95     real             *vftab;
96     __m256d          dummy_mask,cutoff_mask;
97     __m128           tmpmask0,tmpmask1;
98     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
99     __m256d          one     = _mm256_set1_pd(1.0);
100     __m256d          two     = _mm256_set1_pd(2.0);
101     x                = xx[0];
102     f                = ff[0];
103
104     nri              = nlist->nri;
105     iinr             = nlist->iinr;
106     jindex           = nlist->jindex;
107     jjnr             = nlist->jjnr;
108     shiftidx         = nlist->shift;
109     gid              = nlist->gid;
110     shiftvec         = fr->shift_vec[0];
111     fshift           = fr->fshift[0];
112     facel            = _mm256_set1_pd(fr->epsfac);
113     charge           = mdatoms->chargeA;
114     nvdwtype         = fr->ntype;
115     vdwparam         = fr->nbfp;
116     vdwtype          = mdatoms->typeA;
117
118     vftab            = kernel_data->table_elec->data;
119     vftabscale       = _mm256_set1_pd(kernel_data->table_elec->scale);
120
121     /* Setup water-specific parameters */
122     inr              = nlist->iinr[0];
123     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
124     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
125     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
126     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
127
128     /* Avoid stupid compiler warnings */
129     jnrA = jnrB = jnrC = jnrD = 0;
130     j_coord_offsetA = 0;
131     j_coord_offsetB = 0;
132     j_coord_offsetC = 0;
133     j_coord_offsetD = 0;
134
135     outeriter        = 0;
136     inneriter        = 0;
137
138     for(iidx=0;iidx<4*DIM;iidx++)
139     {
140         scratch[iidx] = 0.0;
141     }
142
143     /* Start outer loop over neighborlists */
144     for(iidx=0; iidx<nri; iidx++)
145     {
146         /* Load shift vector for this list */
147         i_shift_offset   = DIM*shiftidx[iidx];
148
149         /* Load limits for loop over neighbors */
150         j_index_start    = jindex[iidx];
151         j_index_end      = jindex[iidx+1];
152
153         /* Get outer coordinate index */
154         inr              = iinr[iidx];
155         i_coord_offset   = DIM*inr;
156
157         /* Load i particle coords and add shift vector */
158         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
159                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
160
161         fix0             = _mm256_setzero_pd();
162         fiy0             = _mm256_setzero_pd();
163         fiz0             = _mm256_setzero_pd();
164         fix1             = _mm256_setzero_pd();
165         fiy1             = _mm256_setzero_pd();
166         fiz1             = _mm256_setzero_pd();
167         fix2             = _mm256_setzero_pd();
168         fiy2             = _mm256_setzero_pd();
169         fiz2             = _mm256_setzero_pd();
170         fix3             = _mm256_setzero_pd();
171         fiy3             = _mm256_setzero_pd();
172         fiz3             = _mm256_setzero_pd();
173
174         /* Reset potential sums */
175         velecsum         = _mm256_setzero_pd();
176         vvdwsum          = _mm256_setzero_pd();
177
178         /* Start inner kernel loop */
179         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
180         {
181
182             /* Get j neighbor index, and coordinate index */
183             jnrA             = jjnr[jidx];
184             jnrB             = jjnr[jidx+1];
185             jnrC             = jjnr[jidx+2];
186             jnrD             = jjnr[jidx+3];
187             j_coord_offsetA  = DIM*jnrA;
188             j_coord_offsetB  = DIM*jnrB;
189             j_coord_offsetC  = DIM*jnrC;
190             j_coord_offsetD  = DIM*jnrD;
191
192             /* load j atom coordinates */
193             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
194                                                  x+j_coord_offsetC,x+j_coord_offsetD,
195                                                  &jx0,&jy0,&jz0);
196
197             /* Calculate displacement vector */
198             dx00             = _mm256_sub_pd(ix0,jx0);
199             dy00             = _mm256_sub_pd(iy0,jy0);
200             dz00             = _mm256_sub_pd(iz0,jz0);
201             dx10             = _mm256_sub_pd(ix1,jx0);
202             dy10             = _mm256_sub_pd(iy1,jy0);
203             dz10             = _mm256_sub_pd(iz1,jz0);
204             dx20             = _mm256_sub_pd(ix2,jx0);
205             dy20             = _mm256_sub_pd(iy2,jy0);
206             dz20             = _mm256_sub_pd(iz2,jz0);
207             dx30             = _mm256_sub_pd(ix3,jx0);
208             dy30             = _mm256_sub_pd(iy3,jy0);
209             dz30             = _mm256_sub_pd(iz3,jz0);
210
211             /* Calculate squared distance and things based on it */
212             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
213             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
214             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
215             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
216
217             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
218             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
219             rinv30           = gmx_mm256_invsqrt_pd(rsq30);
220
221             rinvsq00         = gmx_mm256_inv_pd(rsq00);
222
223             /* Load parameters for j particles */
224             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
225                                                                  charge+jnrC+0,charge+jnrD+0);
226             vdwjidx0A        = 2*vdwtype[jnrA+0];
227             vdwjidx0B        = 2*vdwtype[jnrB+0];
228             vdwjidx0C        = 2*vdwtype[jnrC+0];
229             vdwjidx0D        = 2*vdwtype[jnrD+0];
230
231             fjx0             = _mm256_setzero_pd();
232             fjy0             = _mm256_setzero_pd();
233             fjz0             = _mm256_setzero_pd();
234
235             /**************************
236              * CALCULATE INTERACTIONS *
237              **************************/
238
239             /* Compute parameters for interactions between i and j atoms */
240             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
241                                             vdwioffsetptr0+vdwjidx0B,
242                                             vdwioffsetptr0+vdwjidx0C,
243                                             vdwioffsetptr0+vdwjidx0D,
244                                             &c6_00,&c12_00);
245
246             /* LENNARD-JONES DISPERSION/REPULSION */
247
248             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
249             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
250             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
251             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
252             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
253
254             /* Update potential sum for this i atom from the interaction with this j atom. */
255             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
256
257             fscal            = fvdw;
258
259             /* Calculate temporary vectorial force */
260             tx               = _mm256_mul_pd(fscal,dx00);
261             ty               = _mm256_mul_pd(fscal,dy00);
262             tz               = _mm256_mul_pd(fscal,dz00);
263
264             /* Update vectorial force */
265             fix0             = _mm256_add_pd(fix0,tx);
266             fiy0             = _mm256_add_pd(fiy0,ty);
267             fiz0             = _mm256_add_pd(fiz0,tz);
268
269             fjx0             = _mm256_add_pd(fjx0,tx);
270             fjy0             = _mm256_add_pd(fjy0,ty);
271             fjz0             = _mm256_add_pd(fjz0,tz);
272
273             /**************************
274              * CALCULATE INTERACTIONS *
275              **************************/
276
277             r10              = _mm256_mul_pd(rsq10,rinv10);
278
279             /* Compute parameters for interactions between i and j atoms */
280             qq10             = _mm256_mul_pd(iq1,jq0);
281
282             /* Calculate table index by multiplying r with table scale and truncate to integer */
283             rt               = _mm256_mul_pd(r10,vftabscale);
284             vfitab           = _mm256_cvttpd_epi32(rt);
285             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
286             vfitab           = _mm_slli_epi32(vfitab,2);
287
288             /* CUBIC SPLINE TABLE ELECTROSTATICS */
289             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
290             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
291             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
292             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
293             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
294             Heps             = _mm256_mul_pd(vfeps,H);
295             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
296             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
297             velec            = _mm256_mul_pd(qq10,VV);
298             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
299             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
300
301             /* Update potential sum for this i atom from the interaction with this j atom. */
302             velecsum         = _mm256_add_pd(velecsum,velec);
303
304             fscal            = felec;
305
306             /* Calculate temporary vectorial force */
307             tx               = _mm256_mul_pd(fscal,dx10);
308             ty               = _mm256_mul_pd(fscal,dy10);
309             tz               = _mm256_mul_pd(fscal,dz10);
310
311             /* Update vectorial force */
312             fix1             = _mm256_add_pd(fix1,tx);
313             fiy1             = _mm256_add_pd(fiy1,ty);
314             fiz1             = _mm256_add_pd(fiz1,tz);
315
316             fjx0             = _mm256_add_pd(fjx0,tx);
317             fjy0             = _mm256_add_pd(fjy0,ty);
318             fjz0             = _mm256_add_pd(fjz0,tz);
319
320             /**************************
321              * CALCULATE INTERACTIONS *
322              **************************/
323
324             r20              = _mm256_mul_pd(rsq20,rinv20);
325
326             /* Compute parameters for interactions between i and j atoms */
327             qq20             = _mm256_mul_pd(iq2,jq0);
328
329             /* Calculate table index by multiplying r with table scale and truncate to integer */
330             rt               = _mm256_mul_pd(r20,vftabscale);
331             vfitab           = _mm256_cvttpd_epi32(rt);
332             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
333             vfitab           = _mm_slli_epi32(vfitab,2);
334
335             /* CUBIC SPLINE TABLE ELECTROSTATICS */
336             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
337             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
338             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
339             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
340             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
341             Heps             = _mm256_mul_pd(vfeps,H);
342             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
343             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
344             velec            = _mm256_mul_pd(qq20,VV);
345             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
346             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
347
348             /* Update potential sum for this i atom from the interaction with this j atom. */
349             velecsum         = _mm256_add_pd(velecsum,velec);
350
351             fscal            = felec;
352
353             /* Calculate temporary vectorial force */
354             tx               = _mm256_mul_pd(fscal,dx20);
355             ty               = _mm256_mul_pd(fscal,dy20);
356             tz               = _mm256_mul_pd(fscal,dz20);
357
358             /* Update vectorial force */
359             fix2             = _mm256_add_pd(fix2,tx);
360             fiy2             = _mm256_add_pd(fiy2,ty);
361             fiz2             = _mm256_add_pd(fiz2,tz);
362
363             fjx0             = _mm256_add_pd(fjx0,tx);
364             fjy0             = _mm256_add_pd(fjy0,ty);
365             fjz0             = _mm256_add_pd(fjz0,tz);
366
367             /**************************
368              * CALCULATE INTERACTIONS *
369              **************************/
370
371             r30              = _mm256_mul_pd(rsq30,rinv30);
372
373             /* Compute parameters for interactions between i and j atoms */
374             qq30             = _mm256_mul_pd(iq3,jq0);
375
376             /* Calculate table index by multiplying r with table scale and truncate to integer */
377             rt               = _mm256_mul_pd(r30,vftabscale);
378             vfitab           = _mm256_cvttpd_epi32(rt);
379             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
380             vfitab           = _mm_slli_epi32(vfitab,2);
381
382             /* CUBIC SPLINE TABLE ELECTROSTATICS */
383             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
384             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
385             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
386             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
387             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
388             Heps             = _mm256_mul_pd(vfeps,H);
389             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
390             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
391             velec            = _mm256_mul_pd(qq30,VV);
392             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
393             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq30,FF),_mm256_mul_pd(vftabscale,rinv30)));
394
395             /* Update potential sum for this i atom from the interaction with this j atom. */
396             velecsum         = _mm256_add_pd(velecsum,velec);
397
398             fscal            = felec;
399
400             /* Calculate temporary vectorial force */
401             tx               = _mm256_mul_pd(fscal,dx30);
402             ty               = _mm256_mul_pd(fscal,dy30);
403             tz               = _mm256_mul_pd(fscal,dz30);
404
405             /* Update vectorial force */
406             fix3             = _mm256_add_pd(fix3,tx);
407             fiy3             = _mm256_add_pd(fiy3,ty);
408             fiz3             = _mm256_add_pd(fiz3,tz);
409
410             fjx0             = _mm256_add_pd(fjx0,tx);
411             fjy0             = _mm256_add_pd(fjy0,ty);
412             fjz0             = _mm256_add_pd(fjz0,tz);
413
414             fjptrA             = f+j_coord_offsetA;
415             fjptrB             = f+j_coord_offsetB;
416             fjptrC             = f+j_coord_offsetC;
417             fjptrD             = f+j_coord_offsetD;
418
419             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
420
421             /* Inner loop uses 164 flops */
422         }
423
424         if(jidx<j_index_end)
425         {
426
427             /* Get j neighbor index, and coordinate index */
428             jnrlistA         = jjnr[jidx];
429             jnrlistB         = jjnr[jidx+1];
430             jnrlistC         = jjnr[jidx+2];
431             jnrlistD         = jjnr[jidx+3];
432             /* The sign of each jjnr element is negative for non-real atoms.
433              * The mask is 0xFFFFFFFF for dummy entries and 0x0 for real ones,
434              * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
435              */
436             tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
437
438             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
439             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
440             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
441
442             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
443             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
444             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
445             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
446             j_coord_offsetA  = DIM*jnrA;
447             j_coord_offsetB  = DIM*jnrB;
448             j_coord_offsetC  = DIM*jnrC;
449             j_coord_offsetD  = DIM*jnrD;
450
451             /* load j atom coordinates */
452             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
453                                                  x+j_coord_offsetC,x+j_coord_offsetD,
454                                                  &jx0,&jy0,&jz0);
455
456             /* Calculate displacement vector */
457             dx00             = _mm256_sub_pd(ix0,jx0);
458             dy00             = _mm256_sub_pd(iy0,jy0);
459             dz00             = _mm256_sub_pd(iz0,jz0);
460             dx10             = _mm256_sub_pd(ix1,jx0);
461             dy10             = _mm256_sub_pd(iy1,jy0);
462             dz10             = _mm256_sub_pd(iz1,jz0);
463             dx20             = _mm256_sub_pd(ix2,jx0);
464             dy20             = _mm256_sub_pd(iy2,jy0);
465             dz20             = _mm256_sub_pd(iz2,jz0);
466             dx30             = _mm256_sub_pd(ix3,jx0);
467             dy30             = _mm256_sub_pd(iy3,jy0);
468             dz30             = _mm256_sub_pd(iz3,jz0);
469
470             /* Calculate squared distance and things based on it */
471             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
472             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
473             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
474             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
475
476             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
477             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
478             rinv30           = gmx_mm256_invsqrt_pd(rsq30);
479
480             rinvsq00         = gmx_mm256_inv_pd(rsq00);
481
482             /* Load parameters for j particles */
483             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
484                                                                  charge+jnrC+0,charge+jnrD+0);
485             vdwjidx0A        = 2*vdwtype[jnrA+0];
486             vdwjidx0B        = 2*vdwtype[jnrB+0];
487             vdwjidx0C        = 2*vdwtype[jnrC+0];
488             vdwjidx0D        = 2*vdwtype[jnrD+0];
489
490             fjx0             = _mm256_setzero_pd();
491             fjy0             = _mm256_setzero_pd();
492             fjz0             = _mm256_setzero_pd();
493
494             /**************************
495              * CALCULATE INTERACTIONS *
496              **************************/
497
498             /* Compute parameters for interactions between i and j atoms */
499             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
500                                             vdwioffsetptr0+vdwjidx0B,
501                                             vdwioffsetptr0+vdwjidx0C,
502                                             vdwioffsetptr0+vdwjidx0D,
503                                             &c6_00,&c12_00);
504
505             /* LENNARD-JONES DISPERSION/REPULSION */
506
507             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
508             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
509             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
510             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
511             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
512
513             /* Update potential sum for this i atom from the interaction with this j atom. */
514             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
515             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
516
517             fscal            = fvdw;
518
519             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
520
521             /* Calculate temporary vectorial force */
522             tx               = _mm256_mul_pd(fscal,dx00);
523             ty               = _mm256_mul_pd(fscal,dy00);
524             tz               = _mm256_mul_pd(fscal,dz00);
525
526             /* Update vectorial force */
527             fix0             = _mm256_add_pd(fix0,tx);
528             fiy0             = _mm256_add_pd(fiy0,ty);
529             fiz0             = _mm256_add_pd(fiz0,tz);
530
531             fjx0             = _mm256_add_pd(fjx0,tx);
532             fjy0             = _mm256_add_pd(fjy0,ty);
533             fjz0             = _mm256_add_pd(fjz0,tz);
534
535             /**************************
536              * CALCULATE INTERACTIONS *
537              **************************/
538
539             r10              = _mm256_mul_pd(rsq10,rinv10);
540             r10              = _mm256_andnot_pd(dummy_mask,r10);
541
542             /* Compute parameters for interactions between i and j atoms */
543             qq10             = _mm256_mul_pd(iq1,jq0);
544
545             /* Calculate table index by multiplying r with table scale and truncate to integer */
546             rt               = _mm256_mul_pd(r10,vftabscale);
547             vfitab           = _mm256_cvttpd_epi32(rt);
548             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
549             vfitab           = _mm_slli_epi32(vfitab,2);
550
551             /* CUBIC SPLINE TABLE ELECTROSTATICS */
552             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
553             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
554             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
555             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
556             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
557             Heps             = _mm256_mul_pd(vfeps,H);
558             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
559             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
560             velec            = _mm256_mul_pd(qq10,VV);
561             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
562             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
563
564             /* Update potential sum for this i atom from the interaction with this j atom. */
565             velec            = _mm256_andnot_pd(dummy_mask,velec);
566             velecsum         = _mm256_add_pd(velecsum,velec);
567
568             fscal            = felec;
569
570             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
571
572             /* Calculate temporary vectorial force */
573             tx               = _mm256_mul_pd(fscal,dx10);
574             ty               = _mm256_mul_pd(fscal,dy10);
575             tz               = _mm256_mul_pd(fscal,dz10);
576
577             /* Update vectorial force */
578             fix1             = _mm256_add_pd(fix1,tx);
579             fiy1             = _mm256_add_pd(fiy1,ty);
580             fiz1             = _mm256_add_pd(fiz1,tz);
581
582             fjx0             = _mm256_add_pd(fjx0,tx);
583             fjy0             = _mm256_add_pd(fjy0,ty);
584             fjz0             = _mm256_add_pd(fjz0,tz);
585
586             /**************************
587              * CALCULATE INTERACTIONS *
588              **************************/
589
590             r20              = _mm256_mul_pd(rsq20,rinv20);
591             r20              = _mm256_andnot_pd(dummy_mask,r20);
592
593             /* Compute parameters for interactions between i and j atoms */
594             qq20             = _mm256_mul_pd(iq2,jq0);
595
596             /* Calculate table index by multiplying r with table scale and truncate to integer */
597             rt               = _mm256_mul_pd(r20,vftabscale);
598             vfitab           = _mm256_cvttpd_epi32(rt);
599             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
600             vfitab           = _mm_slli_epi32(vfitab,2);
601
602             /* CUBIC SPLINE TABLE ELECTROSTATICS */
603             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
604             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
605             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
606             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
607             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
608             Heps             = _mm256_mul_pd(vfeps,H);
609             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
610             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
611             velec            = _mm256_mul_pd(qq20,VV);
612             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
613             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
614
615             /* Update potential sum for this i atom from the interaction with this j atom. */
616             velec            = _mm256_andnot_pd(dummy_mask,velec);
617             velecsum         = _mm256_add_pd(velecsum,velec);
618
619             fscal            = felec;
620
621             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
622
623             /* Calculate temporary vectorial force */
624             tx               = _mm256_mul_pd(fscal,dx20);
625             ty               = _mm256_mul_pd(fscal,dy20);
626             tz               = _mm256_mul_pd(fscal,dz20);
627
628             /* Update vectorial force */
629             fix2             = _mm256_add_pd(fix2,tx);
630             fiy2             = _mm256_add_pd(fiy2,ty);
631             fiz2             = _mm256_add_pd(fiz2,tz);
632
633             fjx0             = _mm256_add_pd(fjx0,tx);
634             fjy0             = _mm256_add_pd(fjy0,ty);
635             fjz0             = _mm256_add_pd(fjz0,tz);
636
637             /**************************
638              * CALCULATE INTERACTIONS *
639              **************************/
640
641             r30              = _mm256_mul_pd(rsq30,rinv30);
642             r30              = _mm256_andnot_pd(dummy_mask,r30);
643
644             /* Compute parameters for interactions between i and j atoms */
645             qq30             = _mm256_mul_pd(iq3,jq0);
646
647             /* Calculate table index by multiplying r with table scale and truncate to integer */
648             rt               = _mm256_mul_pd(r30,vftabscale);
649             vfitab           = _mm256_cvttpd_epi32(rt);
650             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
651             vfitab           = _mm_slli_epi32(vfitab,2);
652
653             /* CUBIC SPLINE TABLE ELECTROSTATICS */
654             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
655             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
656             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
657             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
658             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
659             Heps             = _mm256_mul_pd(vfeps,H);
660             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
661             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
662             velec            = _mm256_mul_pd(qq30,VV);
663             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
664             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq30,FF),_mm256_mul_pd(vftabscale,rinv30)));
665
666             /* Update potential sum for this i atom from the interaction with this j atom. */
667             velec            = _mm256_andnot_pd(dummy_mask,velec);
668             velecsum         = _mm256_add_pd(velecsum,velec);
669
670             fscal            = felec;
671
672             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
673
674             /* Calculate temporary vectorial force */
675             tx               = _mm256_mul_pd(fscal,dx30);
676             ty               = _mm256_mul_pd(fscal,dy30);
677             tz               = _mm256_mul_pd(fscal,dz30);
678
679             /* Update vectorial force */
680             fix3             = _mm256_add_pd(fix3,tx);
681             fiy3             = _mm256_add_pd(fiy3,ty);
682             fiz3             = _mm256_add_pd(fiz3,tz);
683
684             fjx0             = _mm256_add_pd(fjx0,tx);
685             fjy0             = _mm256_add_pd(fjy0,ty);
686             fjz0             = _mm256_add_pd(fjz0,tz);
687
688             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
689             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
690             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
691             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
692
693             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
694
695             /* Inner loop uses 167 flops */
696         }
697
698         /* End of innermost loop */
699
700         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
701                                                  f+i_coord_offset,fshift+i_shift_offset);
702
703         ggid                        = gid[iidx];
704         /* Update potential energies */
705         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
706         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
707
708         /* Increment number of inner iterations */
709         inneriter                  += j_index_end - j_index_start;
710
711         /* Outer loop uses 26 flops */
712     }
713
714     /* Increment number of outer iterations */
715     outeriter        += nri;
716
717     /* Update outer/inner flops */
718
719     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*167);
720 }
721 /*
722  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_double
723  * Electrostatics interaction: CubicSplineTable
724  * VdW interaction:            LennardJones
725  * Geometry:                   Water4-Particle
726  * Calculate force/pot:        Force
727  */
728 void
729 nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_double
730                     (t_nblist * gmx_restrict                nlist,
731                      rvec * gmx_restrict                    xx,
732                      rvec * gmx_restrict                    ff,
733                      t_forcerec * gmx_restrict              fr,
734                      t_mdatoms * gmx_restrict               mdatoms,
735                      nb_kernel_data_t * gmx_restrict        kernel_data,
736                      t_nrnb * gmx_restrict                  nrnb)
737 {
738     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
739      * just 0 for non-waters.
740      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
741      * jnr indices corresponding to data put in the four positions in the SIMD register.
742      */
743     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
744     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
745     int              jnrA,jnrB,jnrC,jnrD;
746     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
747     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
748     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
749     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
750     real             rcutoff_scalar;
751     real             *shiftvec,*fshift,*x,*f;
752     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
753     real             scratch[4*DIM];
754     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
755     real *           vdwioffsetptr0;
756     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
757     real *           vdwioffsetptr1;
758     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
759     real *           vdwioffsetptr2;
760     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
761     real *           vdwioffsetptr3;
762     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
763     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
764     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
765     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
766     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
767     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
768     __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
769     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
770     real             *charge;
771     int              nvdwtype;
772     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
773     int              *vdwtype;
774     real             *vdwparam;
775     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
776     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
777     __m128i          vfitab;
778     __m128i          ifour       = _mm_set1_epi32(4);
779     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
780     real             *vftab;
781     __m256d          dummy_mask,cutoff_mask;
782     __m128           tmpmask0,tmpmask1;
783     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
784     __m256d          one     = _mm256_set1_pd(1.0);
785     __m256d          two     = _mm256_set1_pd(2.0);
786     x                = xx[0];
787     f                = ff[0];
788
789     nri              = nlist->nri;
790     iinr             = nlist->iinr;
791     jindex           = nlist->jindex;
792     jjnr             = nlist->jjnr;
793     shiftidx         = nlist->shift;
794     gid              = nlist->gid;
795     shiftvec         = fr->shift_vec[0];
796     fshift           = fr->fshift[0];
797     facel            = _mm256_set1_pd(fr->epsfac);
798     charge           = mdatoms->chargeA;
799     nvdwtype         = fr->ntype;
800     vdwparam         = fr->nbfp;
801     vdwtype          = mdatoms->typeA;
802
803     vftab            = kernel_data->table_elec->data;
804     vftabscale       = _mm256_set1_pd(kernel_data->table_elec->scale);
805
806     /* Setup water-specific parameters */
807     inr              = nlist->iinr[0];
808     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
809     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
810     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
811     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
812
813     /* Avoid stupid compiler warnings */
814     jnrA = jnrB = jnrC = jnrD = 0;
815     j_coord_offsetA = 0;
816     j_coord_offsetB = 0;
817     j_coord_offsetC = 0;
818     j_coord_offsetD = 0;
819
820     outeriter        = 0;
821     inneriter        = 0;
822
823     for(iidx=0;iidx<4*DIM;iidx++)
824     {
825         scratch[iidx] = 0.0;
826     }
827
828     /* Start outer loop over neighborlists */
829     for(iidx=0; iidx<nri; iidx++)
830     {
831         /* Load shift vector for this list */
832         i_shift_offset   = DIM*shiftidx[iidx];
833
834         /* Load limits for loop over neighbors */
835         j_index_start    = jindex[iidx];
836         j_index_end      = jindex[iidx+1];
837
838         /* Get outer coordinate index */
839         inr              = iinr[iidx];
840         i_coord_offset   = DIM*inr;
841
842         /* Load i particle coords and add shift vector */
843         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
844                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
845
846         fix0             = _mm256_setzero_pd();
847         fiy0             = _mm256_setzero_pd();
848         fiz0             = _mm256_setzero_pd();
849         fix1             = _mm256_setzero_pd();
850         fiy1             = _mm256_setzero_pd();
851         fiz1             = _mm256_setzero_pd();
852         fix2             = _mm256_setzero_pd();
853         fiy2             = _mm256_setzero_pd();
854         fiz2             = _mm256_setzero_pd();
855         fix3             = _mm256_setzero_pd();
856         fiy3             = _mm256_setzero_pd();
857         fiz3             = _mm256_setzero_pd();
858
859         /* Start inner kernel loop */
860         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
861         {
862
863             /* Get j neighbor index, and coordinate index */
864             jnrA             = jjnr[jidx];
865             jnrB             = jjnr[jidx+1];
866             jnrC             = jjnr[jidx+2];
867             jnrD             = jjnr[jidx+3];
868             j_coord_offsetA  = DIM*jnrA;
869             j_coord_offsetB  = DIM*jnrB;
870             j_coord_offsetC  = DIM*jnrC;
871             j_coord_offsetD  = DIM*jnrD;
872
873             /* load j atom coordinates */
874             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
875                                                  x+j_coord_offsetC,x+j_coord_offsetD,
876                                                  &jx0,&jy0,&jz0);
877
878             /* Calculate displacement vector */
879             dx00             = _mm256_sub_pd(ix0,jx0);
880             dy00             = _mm256_sub_pd(iy0,jy0);
881             dz00             = _mm256_sub_pd(iz0,jz0);
882             dx10             = _mm256_sub_pd(ix1,jx0);
883             dy10             = _mm256_sub_pd(iy1,jy0);
884             dz10             = _mm256_sub_pd(iz1,jz0);
885             dx20             = _mm256_sub_pd(ix2,jx0);
886             dy20             = _mm256_sub_pd(iy2,jy0);
887             dz20             = _mm256_sub_pd(iz2,jz0);
888             dx30             = _mm256_sub_pd(ix3,jx0);
889             dy30             = _mm256_sub_pd(iy3,jy0);
890             dz30             = _mm256_sub_pd(iz3,jz0);
891
892             /* Calculate squared distance and things based on it */
893             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
894             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
895             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
896             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
897
898             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
899             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
900             rinv30           = gmx_mm256_invsqrt_pd(rsq30);
901
902             rinvsq00         = gmx_mm256_inv_pd(rsq00);
903
904             /* Load parameters for j particles */
905             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
906                                                                  charge+jnrC+0,charge+jnrD+0);
907             vdwjidx0A        = 2*vdwtype[jnrA+0];
908             vdwjidx0B        = 2*vdwtype[jnrB+0];
909             vdwjidx0C        = 2*vdwtype[jnrC+0];
910             vdwjidx0D        = 2*vdwtype[jnrD+0];
911
912             fjx0             = _mm256_setzero_pd();
913             fjy0             = _mm256_setzero_pd();
914             fjz0             = _mm256_setzero_pd();
915
916             /**************************
917              * CALCULATE INTERACTIONS *
918              **************************/
919
920             /* Compute parameters for interactions between i and j atoms */
921             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
922                                             vdwioffsetptr0+vdwjidx0B,
923                                             vdwioffsetptr0+vdwjidx0C,
924                                             vdwioffsetptr0+vdwjidx0D,
925                                             &c6_00,&c12_00);
926
927             /* LENNARD-JONES DISPERSION/REPULSION */
928
929             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
930             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
931
932             fscal            = fvdw;
933
934             /* Calculate temporary vectorial force */
935             tx               = _mm256_mul_pd(fscal,dx00);
936             ty               = _mm256_mul_pd(fscal,dy00);
937             tz               = _mm256_mul_pd(fscal,dz00);
938
939             /* Update vectorial force */
940             fix0             = _mm256_add_pd(fix0,tx);
941             fiy0             = _mm256_add_pd(fiy0,ty);
942             fiz0             = _mm256_add_pd(fiz0,tz);
943
944             fjx0             = _mm256_add_pd(fjx0,tx);
945             fjy0             = _mm256_add_pd(fjy0,ty);
946             fjz0             = _mm256_add_pd(fjz0,tz);
947
948             /**************************
949              * CALCULATE INTERACTIONS *
950              **************************/
951
952             r10              = _mm256_mul_pd(rsq10,rinv10);
953
954             /* Compute parameters for interactions between i and j atoms */
955             qq10             = _mm256_mul_pd(iq1,jq0);
956
957             /* Calculate table index by multiplying r with table scale and truncate to integer */
958             rt               = _mm256_mul_pd(r10,vftabscale);
959             vfitab           = _mm256_cvttpd_epi32(rt);
960             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
961             vfitab           = _mm_slli_epi32(vfitab,2);
962
963             /* CUBIC SPLINE TABLE ELECTROSTATICS */
964             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
965             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
966             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
967             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
968             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
969             Heps             = _mm256_mul_pd(vfeps,H);
970             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
971             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
972             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
973
974             fscal            = felec;
975
976             /* Calculate temporary vectorial force */
977             tx               = _mm256_mul_pd(fscal,dx10);
978             ty               = _mm256_mul_pd(fscal,dy10);
979             tz               = _mm256_mul_pd(fscal,dz10);
980
981             /* Update vectorial force */
982             fix1             = _mm256_add_pd(fix1,tx);
983             fiy1             = _mm256_add_pd(fiy1,ty);
984             fiz1             = _mm256_add_pd(fiz1,tz);
985
986             fjx0             = _mm256_add_pd(fjx0,tx);
987             fjy0             = _mm256_add_pd(fjy0,ty);
988             fjz0             = _mm256_add_pd(fjz0,tz);
989
990             /**************************
991              * CALCULATE INTERACTIONS *
992              **************************/
993
994             r20              = _mm256_mul_pd(rsq20,rinv20);
995
996             /* Compute parameters for interactions between i and j atoms */
997             qq20             = _mm256_mul_pd(iq2,jq0);
998
999             /* Calculate table index by multiplying r with table scale and truncate to integer */
1000             rt               = _mm256_mul_pd(r20,vftabscale);
1001             vfitab           = _mm256_cvttpd_epi32(rt);
1002             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1003             vfitab           = _mm_slli_epi32(vfitab,2);
1004
1005             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1006             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1007             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1008             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1009             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1010             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1011             Heps             = _mm256_mul_pd(vfeps,H);
1012             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1013             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1014             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1015
1016             fscal            = felec;
1017
1018             /* Calculate temporary vectorial force */
1019             tx               = _mm256_mul_pd(fscal,dx20);
1020             ty               = _mm256_mul_pd(fscal,dy20);
1021             tz               = _mm256_mul_pd(fscal,dz20);
1022
1023             /* Update vectorial force */
1024             fix2             = _mm256_add_pd(fix2,tx);
1025             fiy2             = _mm256_add_pd(fiy2,ty);
1026             fiz2             = _mm256_add_pd(fiz2,tz);
1027
1028             fjx0             = _mm256_add_pd(fjx0,tx);
1029             fjy0             = _mm256_add_pd(fjy0,ty);
1030             fjz0             = _mm256_add_pd(fjz0,tz);
1031
1032             /**************************
1033              * CALCULATE INTERACTIONS *
1034              **************************/
1035
1036             r30              = _mm256_mul_pd(rsq30,rinv30);
1037
1038             /* Compute parameters for interactions between i and j atoms */
1039             qq30             = _mm256_mul_pd(iq3,jq0);
1040
1041             /* Calculate table index by multiplying r with table scale and truncate to integer */
1042             rt               = _mm256_mul_pd(r30,vftabscale);
1043             vfitab           = _mm256_cvttpd_epi32(rt);
1044             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1045             vfitab           = _mm_slli_epi32(vfitab,2);
1046
1047             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1048             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1049             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1050             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1051             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1052             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1053             Heps             = _mm256_mul_pd(vfeps,H);
1054             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1055             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1056             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq30,FF),_mm256_mul_pd(vftabscale,rinv30)));
1057
1058             fscal            = felec;
1059
1060             /* Calculate temporary vectorial force */
1061             tx               = _mm256_mul_pd(fscal,dx30);
1062             ty               = _mm256_mul_pd(fscal,dy30);
1063             tz               = _mm256_mul_pd(fscal,dz30);
1064
1065             /* Update vectorial force */
1066             fix3             = _mm256_add_pd(fix3,tx);
1067             fiy3             = _mm256_add_pd(fiy3,ty);
1068             fiz3             = _mm256_add_pd(fiz3,tz);
1069
1070             fjx0             = _mm256_add_pd(fjx0,tx);
1071             fjy0             = _mm256_add_pd(fjy0,ty);
1072             fjz0             = _mm256_add_pd(fjz0,tz);
1073
1074             fjptrA             = f+j_coord_offsetA;
1075             fjptrB             = f+j_coord_offsetB;
1076             fjptrC             = f+j_coord_offsetC;
1077             fjptrD             = f+j_coord_offsetD;
1078
1079             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1080
1081             /* Inner loop uses 147 flops */
1082         }
1083
1084         if(jidx<j_index_end)
1085         {
1086
1087             /* Get j neighbor index, and coordinate index */
1088             jnrlistA         = jjnr[jidx];
1089             jnrlistB         = jjnr[jidx+1];
1090             jnrlistC         = jjnr[jidx+2];
1091             jnrlistD         = jjnr[jidx+3];
1092             /* Sign of each element will be negative for non-real atoms.
1093              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1094              * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1095              */
1096             tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1097
1098             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1099             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1100             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1101
1102             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1103             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1104             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1105             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1106             j_coord_offsetA  = DIM*jnrA;
1107             j_coord_offsetB  = DIM*jnrB;
1108             j_coord_offsetC  = DIM*jnrC;
1109             j_coord_offsetD  = DIM*jnrD;
1110
1111             /* load j atom coordinates */
1112             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1113                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1114                                                  &jx0,&jy0,&jz0);
1115
1116             /* Calculate displacement vector */
1117             dx00             = _mm256_sub_pd(ix0,jx0);
1118             dy00             = _mm256_sub_pd(iy0,jy0);
1119             dz00             = _mm256_sub_pd(iz0,jz0);
1120             dx10             = _mm256_sub_pd(ix1,jx0);
1121             dy10             = _mm256_sub_pd(iy1,jy0);
1122             dz10             = _mm256_sub_pd(iz1,jz0);
1123             dx20             = _mm256_sub_pd(ix2,jx0);
1124             dy20             = _mm256_sub_pd(iy2,jy0);
1125             dz20             = _mm256_sub_pd(iz2,jz0);
1126             dx30             = _mm256_sub_pd(ix3,jx0);
1127             dy30             = _mm256_sub_pd(iy3,jy0);
1128             dz30             = _mm256_sub_pd(iz3,jz0);
1129
1130             /* Calculate squared distance and things based on it */
1131             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1132             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1133             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1134             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
1135
1136             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
1137             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
1138             rinv30           = gmx_mm256_invsqrt_pd(rsq30);
1139
1140             rinvsq00         = gmx_mm256_inv_pd(rsq00);
1141
1142             /* Load parameters for j particles */
1143             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1144                                                                  charge+jnrC+0,charge+jnrD+0);
1145             vdwjidx0A        = 2*vdwtype[jnrA+0];
1146             vdwjidx0B        = 2*vdwtype[jnrB+0];
1147             vdwjidx0C        = 2*vdwtype[jnrC+0];
1148             vdwjidx0D        = 2*vdwtype[jnrD+0];
1149
1150             fjx0             = _mm256_setzero_pd();
1151             fjy0             = _mm256_setzero_pd();
1152             fjz0             = _mm256_setzero_pd();
1153
1154             /**************************
1155              * CALCULATE INTERACTIONS *
1156              **************************/
1157
1158             /* Compute parameters for interactions between i and j atoms */
1159             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1160                                             vdwioffsetptr0+vdwjidx0B,
1161                                             vdwioffsetptr0+vdwjidx0C,
1162                                             vdwioffsetptr0+vdwjidx0D,
1163                                             &c6_00,&c12_00);
1164
1165             /* LENNARD-JONES DISPERSION/REPULSION */
1166
1167             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1168             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1169
1170             fscal            = fvdw;
1171
1172             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1173
1174             /* Calculate temporary vectorial force */
1175             tx               = _mm256_mul_pd(fscal,dx00);
1176             ty               = _mm256_mul_pd(fscal,dy00);
1177             tz               = _mm256_mul_pd(fscal,dz00);
1178
1179             /* Update vectorial force */
1180             fix0             = _mm256_add_pd(fix0,tx);
1181             fiy0             = _mm256_add_pd(fiy0,ty);
1182             fiz0             = _mm256_add_pd(fiz0,tz);
1183
1184             fjx0             = _mm256_add_pd(fjx0,tx);
1185             fjy0             = _mm256_add_pd(fjy0,ty);
1186             fjz0             = _mm256_add_pd(fjz0,tz);
1187
1188             /**************************
1189              * CALCULATE INTERACTIONS *
1190              **************************/
1191
1192             r10              = _mm256_mul_pd(rsq10,rinv10);
1193             r10              = _mm256_andnot_pd(dummy_mask,r10);
1194
1195             /* Compute parameters for interactions between i and j atoms */
1196             qq10             = _mm256_mul_pd(iq1,jq0);
1197
1198             /* Calculate table index by multiplying r with table scale and truncate to integer */
1199             rt               = _mm256_mul_pd(r10,vftabscale);
1200             vfitab           = _mm256_cvttpd_epi32(rt);
1201             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1202             vfitab           = _mm_slli_epi32(vfitab,2);
1203
1204             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1205             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1206             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1207             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1208             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1209             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1210             Heps             = _mm256_mul_pd(vfeps,H);
1211             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1212             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1213             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
1214
1215             fscal            = felec;
1216
1217             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1218
1219             /* Calculate temporary vectorial force */
1220             tx               = _mm256_mul_pd(fscal,dx10);
1221             ty               = _mm256_mul_pd(fscal,dy10);
1222             tz               = _mm256_mul_pd(fscal,dz10);
1223
1224             /* Update vectorial force */
1225             fix1             = _mm256_add_pd(fix1,tx);
1226             fiy1             = _mm256_add_pd(fiy1,ty);
1227             fiz1             = _mm256_add_pd(fiz1,tz);
1228
1229             fjx0             = _mm256_add_pd(fjx0,tx);
1230             fjy0             = _mm256_add_pd(fjy0,ty);
1231             fjz0             = _mm256_add_pd(fjz0,tz);
1232
1233             /**************************
1234              * CALCULATE INTERACTIONS *
1235              **************************/
1236
1237             r20              = _mm256_mul_pd(rsq20,rinv20);
1238             r20              = _mm256_andnot_pd(dummy_mask,r20);
1239
1240             /* Compute parameters for interactions between i and j atoms */
1241             qq20             = _mm256_mul_pd(iq2,jq0);
1242
1243             /* Calculate table index by multiplying r with table scale and truncate to integer */
1244             rt               = _mm256_mul_pd(r20,vftabscale);
1245             vfitab           = _mm256_cvttpd_epi32(rt);
1246             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1247             vfitab           = _mm_slli_epi32(vfitab,2);
1248
1249             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1250             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1251             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1252             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1253             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1254             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1255             Heps             = _mm256_mul_pd(vfeps,H);
1256             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1257             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1258             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1259
1260             fscal            = felec;
1261
1262             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1263
1264             /* Calculate temporary vectorial force */
1265             tx               = _mm256_mul_pd(fscal,dx20);
1266             ty               = _mm256_mul_pd(fscal,dy20);
1267             tz               = _mm256_mul_pd(fscal,dz20);
1268
1269             /* Update vectorial force */
1270             fix2             = _mm256_add_pd(fix2,tx);
1271             fiy2             = _mm256_add_pd(fiy2,ty);
1272             fiz2             = _mm256_add_pd(fiz2,tz);
1273
1274             fjx0             = _mm256_add_pd(fjx0,tx);
1275             fjy0             = _mm256_add_pd(fjy0,ty);
1276             fjz0             = _mm256_add_pd(fjz0,tz);
1277
1278             /**************************
1279              * CALCULATE INTERACTIONS *
1280              **************************/
1281
1282             r30              = _mm256_mul_pd(rsq30,rinv30);
1283             r30              = _mm256_andnot_pd(dummy_mask,r30);
1284
1285             /* Compute parameters for interactions between i and j atoms */
1286             qq30             = _mm256_mul_pd(iq3,jq0);
1287
1288             /* Calculate table index by multiplying r with table scale and truncate to integer */
1289             rt               = _mm256_mul_pd(r30,vftabscale);
1290             vfitab           = _mm256_cvttpd_epi32(rt);
1291             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1292             vfitab           = _mm_slli_epi32(vfitab,2);
1293
1294             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1295             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1296             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1297             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1298             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1299             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1300             Heps             = _mm256_mul_pd(vfeps,H);
1301             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1302             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1303             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq30,FF),_mm256_mul_pd(vftabscale,rinv30)));
1304
1305             fscal            = felec;
1306
1307             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1308
1309             /* Calculate temporary vectorial force */
1310             tx               = _mm256_mul_pd(fscal,dx30);
1311             ty               = _mm256_mul_pd(fscal,dy30);
1312             tz               = _mm256_mul_pd(fscal,dz30);
1313
1314             /* Update vectorial force */
1315             fix3             = _mm256_add_pd(fix3,tx);
1316             fiy3             = _mm256_add_pd(fiy3,ty);
1317             fiz3             = _mm256_add_pd(fiz3,tz);
1318
1319             fjx0             = _mm256_add_pd(fjx0,tx);
1320             fjy0             = _mm256_add_pd(fjy0,ty);
1321             fjz0             = _mm256_add_pd(fjz0,tz);
1322
1323             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1324             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1325             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1326             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1327
1328             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1329
1330             /* Inner loop uses 150 flops */
1331         }
1332
1333         /* End of innermost loop */
1334
1335         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1336                                                  f+i_coord_offset,fshift+i_shift_offset);
1337
1338         /* Increment number of inner iterations */
1339         inneriter                  += j_index_end - j_index_start;
1340
1341         /* Outer loop uses 24 flops */
1342     }
1343
1344     /* Increment number of outer iterations */
1345     outeriter        += nri;
1346
1347     /* Update outer/inner flops */
1348
1349     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*150);
1350 }