Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_avx_256_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_single
38  * Electrostatics interaction: ReactionField
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. the eight different
56      * jnr indices corresponding to data put in the eight positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrE,jnrF,jnrG,jnrH;
62     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
67     real             rcutoff_scalar;
68     real             *shiftvec,*fshift,*x,*f;
69     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
70     real             scratch[4*DIM];
71     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72     real *           vdwioffsetptr0;
73     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74     real *           vdwioffsetptr1;
75     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76     real *           vdwioffsetptr2;
77     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78     real *           vdwioffsetptr3;
79     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
80     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
81     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
82     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
83     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
84     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
85     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
86     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
87     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
88     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
89     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
90     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
91     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
92     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
93     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
94     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
95     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
96     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
97     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
98     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
99     real             *charge;
100     int              nvdwtype;
101     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102     int              *vdwtype;
103     real             *vdwparam;
104     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
105     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
106     __m256           dummy_mask,cutoff_mask;
107     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
108     __m256           one     = _mm256_set1_ps(1.0);
109     __m256           two     = _mm256_set1_ps(2.0);
110     x                = xx[0];
111     f                = ff[0];
112
113     nri              = nlist->nri;
114     iinr             = nlist->iinr;
115     jindex           = nlist->jindex;
116     jjnr             = nlist->jjnr;
117     shiftidx         = nlist->shift;
118     gid              = nlist->gid;
119     shiftvec         = fr->shift_vec[0];
120     fshift           = fr->fshift[0];
121     facel            = _mm256_set1_ps(fr->epsfac);
122     charge           = mdatoms->chargeA;
123     krf              = _mm256_set1_ps(fr->ic->k_rf);
124     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
125     crf              = _mm256_set1_ps(fr->ic->c_rf);
126     nvdwtype         = fr->ntype;
127     vdwparam         = fr->nbfp;
128     vdwtype          = mdatoms->typeA;
129
130     /* Setup water-specific parameters */
131     inr              = nlist->iinr[0];
132     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
133     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
134     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
135     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
136
137     jq1              = _mm256_set1_ps(charge[inr+1]);
138     jq2              = _mm256_set1_ps(charge[inr+2]);
139     jq3              = _mm256_set1_ps(charge[inr+3]);
140     vdwjidx0A        = 2*vdwtype[inr+0];
141     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
142     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
143     qq11             = _mm256_mul_ps(iq1,jq1);
144     qq12             = _mm256_mul_ps(iq1,jq2);
145     qq13             = _mm256_mul_ps(iq1,jq3);
146     qq21             = _mm256_mul_ps(iq2,jq1);
147     qq22             = _mm256_mul_ps(iq2,jq2);
148     qq23             = _mm256_mul_ps(iq2,jq3);
149     qq31             = _mm256_mul_ps(iq3,jq1);
150     qq32             = _mm256_mul_ps(iq3,jq2);
151     qq33             = _mm256_mul_ps(iq3,jq3);
152
153     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
154     rcutoff_scalar   = fr->rcoulomb;
155     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
156     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
157
158     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
159     rvdw             = _mm256_set1_ps(fr->rvdw);
160
161     /* Zero-initialize the j indices and coordinate offsets to avoid compiler warnings */
162     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
163     j_coord_offsetA = 0;
164     j_coord_offsetB = 0;
165     j_coord_offsetC = 0;
166     j_coord_offsetD = 0;
167     j_coord_offsetE = 0;
168     j_coord_offsetF = 0;
169     j_coord_offsetG = 0;
170     j_coord_offsetH = 0;
171
172     outeriter        = 0;
173     inneriter        = 0;
174
175     for(iidx=0;iidx<4*DIM;iidx++)
176     {
177         scratch[iidx] = 0.0;
178     }
179
180     /* Start outer loop over neighborlists */
181     for(iidx=0; iidx<nri; iidx++)
182     {
183         /* Load shift vector for this list */
184         i_shift_offset   = DIM*shiftidx[iidx];
185
186         /* Load limits for loop over neighbors */
187         j_index_start    = jindex[iidx];
188         j_index_end      = jindex[iidx+1];
189
190         /* Get outer coordinate index */
191         inr              = iinr[iidx];
192         i_coord_offset   = DIM*inr;
193
194         /* Load i particle coords and add shift vector */
195         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
196                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
197
198         fix0             = _mm256_setzero_ps();
199         fiy0             = _mm256_setzero_ps();
200         fiz0             = _mm256_setzero_ps();
201         fix1             = _mm256_setzero_ps();
202         fiy1             = _mm256_setzero_ps();
203         fiz1             = _mm256_setzero_ps();
204         fix2             = _mm256_setzero_ps();
205         fiy2             = _mm256_setzero_ps();
206         fiz2             = _mm256_setzero_ps();
207         fix3             = _mm256_setzero_ps();
208         fiy3             = _mm256_setzero_ps();
209         fiz3             = _mm256_setzero_ps();
210
211         /* Reset potential sums */
212         velecsum         = _mm256_setzero_ps();
213         vvdwsum          = _mm256_setzero_ps();
214
215         /* Start inner kernel loop */
216         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
217         {
218
219             /* Get j neighbor index, and coordinate index */
220             jnrA             = jjnr[jidx];
221             jnrB             = jjnr[jidx+1];
222             jnrC             = jjnr[jidx+2];
223             jnrD             = jjnr[jidx+3];
224             jnrE             = jjnr[jidx+4];
225             jnrF             = jjnr[jidx+5];
226             jnrG             = jjnr[jidx+6];
227             jnrH             = jjnr[jidx+7];
228             j_coord_offsetA  = DIM*jnrA;
229             j_coord_offsetB  = DIM*jnrB;
230             j_coord_offsetC  = DIM*jnrC;
231             j_coord_offsetD  = DIM*jnrD;
232             j_coord_offsetE  = DIM*jnrE;
233             j_coord_offsetF  = DIM*jnrF;
234             j_coord_offsetG  = DIM*jnrG;
235             j_coord_offsetH  = DIM*jnrH;
236
237             /* load j atom coordinates */
238             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
239                                                  x+j_coord_offsetC,x+j_coord_offsetD,
240                                                  x+j_coord_offsetE,x+j_coord_offsetF,
241                                                  x+j_coord_offsetG,x+j_coord_offsetH,
242                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
243                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
244
245             /* Calculate displacement vector */
246             dx00             = _mm256_sub_ps(ix0,jx0);
247             dy00             = _mm256_sub_ps(iy0,jy0);
248             dz00             = _mm256_sub_ps(iz0,jz0);
249             dx11             = _mm256_sub_ps(ix1,jx1);
250             dy11             = _mm256_sub_ps(iy1,jy1);
251             dz11             = _mm256_sub_ps(iz1,jz1);
252             dx12             = _mm256_sub_ps(ix1,jx2);
253             dy12             = _mm256_sub_ps(iy1,jy2);
254             dz12             = _mm256_sub_ps(iz1,jz2);
255             dx13             = _mm256_sub_ps(ix1,jx3);
256             dy13             = _mm256_sub_ps(iy1,jy3);
257             dz13             = _mm256_sub_ps(iz1,jz3);
258             dx21             = _mm256_sub_ps(ix2,jx1);
259             dy21             = _mm256_sub_ps(iy2,jy1);
260             dz21             = _mm256_sub_ps(iz2,jz1);
261             dx22             = _mm256_sub_ps(ix2,jx2);
262             dy22             = _mm256_sub_ps(iy2,jy2);
263             dz22             = _mm256_sub_ps(iz2,jz2);
264             dx23             = _mm256_sub_ps(ix2,jx3);
265             dy23             = _mm256_sub_ps(iy2,jy3);
266             dz23             = _mm256_sub_ps(iz2,jz3);
267             dx31             = _mm256_sub_ps(ix3,jx1);
268             dy31             = _mm256_sub_ps(iy3,jy1);
269             dz31             = _mm256_sub_ps(iz3,jz1);
270             dx32             = _mm256_sub_ps(ix3,jx2);
271             dy32             = _mm256_sub_ps(iy3,jy2);
272             dz32             = _mm256_sub_ps(iz3,jz2);
273             dx33             = _mm256_sub_ps(ix3,jx3);
274             dy33             = _mm256_sub_ps(iy3,jy3);
275             dz33             = _mm256_sub_ps(iz3,jz3);
276
277             /* Calculate squared distance and things based on it */
278             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
279             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
280             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
281             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
282             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
283             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
284             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
285             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
286             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
287             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
288
289             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
290             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
291             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
292             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
293             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
294             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
295             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
296             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
297             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
298
299             rinvsq00         = gmx_mm256_inv_ps(rsq00);
300             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
301             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
302             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
303             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
304             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
305             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
306             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
307             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
308             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
309
310             fjx0             = _mm256_setzero_ps();
311             fjy0             = _mm256_setzero_ps();
312             fjz0             = _mm256_setzero_ps();
313             fjx1             = _mm256_setzero_ps();
314             fjy1             = _mm256_setzero_ps();
315             fjz1             = _mm256_setzero_ps();
316             fjx2             = _mm256_setzero_ps();
317             fjy2             = _mm256_setzero_ps();
318             fjz2             = _mm256_setzero_ps();
319             fjx3             = _mm256_setzero_ps();
320             fjy3             = _mm256_setzero_ps();
321             fjz3             = _mm256_setzero_ps();
322
323             /**************************
324              * CALCULATE INTERACTIONS *
325              **************************/
326
327             if (gmx_mm256_any_lt(rsq00,rcutoff2))
328             {
329
330             /* LENNARD-JONES DISPERSION/REPULSION */
331
332             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
333             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
334             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
335             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
336                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
337             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
338
339             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
340
341             /* Update potential sum for this i atom from the interaction with this j atom. */
342             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
343             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
344
345             fscal            = fvdw;
346
347             fscal            = _mm256_and_ps(fscal,cutoff_mask);
348
349             /* Calculate temporary vectorial force */
350             tx               = _mm256_mul_ps(fscal,dx00);
351             ty               = _mm256_mul_ps(fscal,dy00);
352             tz               = _mm256_mul_ps(fscal,dz00);
353
354             /* Update vectorial force */
355             fix0             = _mm256_add_ps(fix0,tx);
356             fiy0             = _mm256_add_ps(fiy0,ty);
357             fiz0             = _mm256_add_ps(fiz0,tz);
358
359             fjx0             = _mm256_add_ps(fjx0,tx);
360             fjy0             = _mm256_add_ps(fjy0,ty);
361             fjz0             = _mm256_add_ps(fjz0,tz);
362
363             }
364
365             /**************************
366              * CALCULATE INTERACTIONS *
367              **************************/
368
369             if (gmx_mm256_any_lt(rsq11,rcutoff2))
370             {
371
372             /* REACTION-FIELD ELECTROSTATICS */
373             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
374             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
375
376             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
377
378             /* Update potential sum for this i atom from the interaction with this j atom. */
379             velec            = _mm256_and_ps(velec,cutoff_mask);
380             velecsum         = _mm256_add_ps(velecsum,velec);
381
382             fscal            = felec;
383
384             fscal            = _mm256_and_ps(fscal,cutoff_mask);
385
386             /* Calculate temporary vectorial force */
387             tx               = _mm256_mul_ps(fscal,dx11);
388             ty               = _mm256_mul_ps(fscal,dy11);
389             tz               = _mm256_mul_ps(fscal,dz11);
390
391             /* Update vectorial force */
392             fix1             = _mm256_add_ps(fix1,tx);
393             fiy1             = _mm256_add_ps(fiy1,ty);
394             fiz1             = _mm256_add_ps(fiz1,tz);
395
396             fjx1             = _mm256_add_ps(fjx1,tx);
397             fjy1             = _mm256_add_ps(fjy1,ty);
398             fjz1             = _mm256_add_ps(fjz1,tz);
399
400             }
401
402             /**************************
403              * CALCULATE INTERACTIONS *
404              **************************/
405
406             if (gmx_mm256_any_lt(rsq12,rcutoff2))
407             {
408
409             /* REACTION-FIELD ELECTROSTATICS */
410             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
411             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
412
413             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
414
415             /* Update potential sum for this i atom from the interaction with this j atom. */
416             velec            = _mm256_and_ps(velec,cutoff_mask);
417             velecsum         = _mm256_add_ps(velecsum,velec);
418
419             fscal            = felec;
420
421             fscal            = _mm256_and_ps(fscal,cutoff_mask);
422
423             /* Calculate temporary vectorial force */
424             tx               = _mm256_mul_ps(fscal,dx12);
425             ty               = _mm256_mul_ps(fscal,dy12);
426             tz               = _mm256_mul_ps(fscal,dz12);
427
428             /* Update vectorial force */
429             fix1             = _mm256_add_ps(fix1,tx);
430             fiy1             = _mm256_add_ps(fiy1,ty);
431             fiz1             = _mm256_add_ps(fiz1,tz);
432
433             fjx2             = _mm256_add_ps(fjx2,tx);
434             fjy2             = _mm256_add_ps(fjy2,ty);
435             fjz2             = _mm256_add_ps(fjz2,tz);
436
437             }
438
439             /**************************
440              * CALCULATE INTERACTIONS *
441              **************************/
442
443             if (gmx_mm256_any_lt(rsq13,rcutoff2))
444             {
445
446             /* REACTION-FIELD ELECTROSTATICS */
447             velec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
448             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
449
450             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
451
452             /* Update potential sum for this i atom from the interaction with this j atom. */
453             velec            = _mm256_and_ps(velec,cutoff_mask);
454             velecsum         = _mm256_add_ps(velecsum,velec);
455
456             fscal            = felec;
457
458             fscal            = _mm256_and_ps(fscal,cutoff_mask);
459
460             /* Calculate temporary vectorial force */
461             tx               = _mm256_mul_ps(fscal,dx13);
462             ty               = _mm256_mul_ps(fscal,dy13);
463             tz               = _mm256_mul_ps(fscal,dz13);
464
465             /* Update vectorial force */
466             fix1             = _mm256_add_ps(fix1,tx);
467             fiy1             = _mm256_add_ps(fiy1,ty);
468             fiz1             = _mm256_add_ps(fiz1,tz);
469
470             fjx3             = _mm256_add_ps(fjx3,tx);
471             fjy3             = _mm256_add_ps(fjy3,ty);
472             fjz3             = _mm256_add_ps(fjz3,tz);
473
474             }
475
476             /**************************
477              * CALCULATE INTERACTIONS *
478              **************************/
479
480             if (gmx_mm256_any_lt(rsq21,rcutoff2))
481             {
482
483             /* REACTION-FIELD ELECTROSTATICS */
484             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
485             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
486
487             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
488
489             /* Update potential sum for this i atom from the interaction with this j atom. */
490             velec            = _mm256_and_ps(velec,cutoff_mask);
491             velecsum         = _mm256_add_ps(velecsum,velec);
492
493             fscal            = felec;
494
495             fscal            = _mm256_and_ps(fscal,cutoff_mask);
496
497             /* Calculate temporary vectorial force */
498             tx               = _mm256_mul_ps(fscal,dx21);
499             ty               = _mm256_mul_ps(fscal,dy21);
500             tz               = _mm256_mul_ps(fscal,dz21);
501
502             /* Update vectorial force */
503             fix2             = _mm256_add_ps(fix2,tx);
504             fiy2             = _mm256_add_ps(fiy2,ty);
505             fiz2             = _mm256_add_ps(fiz2,tz);
506
507             fjx1             = _mm256_add_ps(fjx1,tx);
508             fjy1             = _mm256_add_ps(fjy1,ty);
509             fjz1             = _mm256_add_ps(fjz1,tz);
510
511             }
512
513             /**************************
514              * CALCULATE INTERACTIONS *
515              **************************/
516
517             if (gmx_mm256_any_lt(rsq22,rcutoff2))
518             {
519
520             /* REACTION-FIELD ELECTROSTATICS */
521             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
522             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
523
524             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
525
526             /* Update potential sum for this i atom from the interaction with this j atom. */
527             velec            = _mm256_and_ps(velec,cutoff_mask);
528             velecsum         = _mm256_add_ps(velecsum,velec);
529
530             fscal            = felec;
531
532             fscal            = _mm256_and_ps(fscal,cutoff_mask);
533
534             /* Calculate temporary vectorial force */
535             tx               = _mm256_mul_ps(fscal,dx22);
536             ty               = _mm256_mul_ps(fscal,dy22);
537             tz               = _mm256_mul_ps(fscal,dz22);
538
539             /* Update vectorial force */
540             fix2             = _mm256_add_ps(fix2,tx);
541             fiy2             = _mm256_add_ps(fiy2,ty);
542             fiz2             = _mm256_add_ps(fiz2,tz);
543
544             fjx2             = _mm256_add_ps(fjx2,tx);
545             fjy2             = _mm256_add_ps(fjy2,ty);
546             fjz2             = _mm256_add_ps(fjz2,tz);
547
548             }
549
550             /**************************
551              * CALCULATE INTERACTIONS *
552              **************************/
553
554             if (gmx_mm256_any_lt(rsq23,rcutoff2))
555             {
556
557             /* REACTION-FIELD ELECTROSTATICS */
558             velec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
559             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
560
561             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
562
563             /* Update potential sum for this i atom from the interaction with this j atom. */
564             velec            = _mm256_and_ps(velec,cutoff_mask);
565             velecsum         = _mm256_add_ps(velecsum,velec);
566
567             fscal            = felec;
568
569             fscal            = _mm256_and_ps(fscal,cutoff_mask);
570
571             /* Calculate temporary vectorial force */
572             tx               = _mm256_mul_ps(fscal,dx23);
573             ty               = _mm256_mul_ps(fscal,dy23);
574             tz               = _mm256_mul_ps(fscal,dz23);
575
576             /* Update vectorial force */
577             fix2             = _mm256_add_ps(fix2,tx);
578             fiy2             = _mm256_add_ps(fiy2,ty);
579             fiz2             = _mm256_add_ps(fiz2,tz);
580
581             fjx3             = _mm256_add_ps(fjx3,tx);
582             fjy3             = _mm256_add_ps(fjy3,ty);
583             fjz3             = _mm256_add_ps(fjz3,tz);
584
585             }
586
587             /**************************
588              * CALCULATE INTERACTIONS *
589              **************************/
590
591             if (gmx_mm256_any_lt(rsq31,rcutoff2))
592             {
593
594             /* REACTION-FIELD ELECTROSTATICS */
595             velec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
596             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
597
598             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
599
600             /* Update potential sum for this i atom from the interaction with this j atom. */
601             velec            = _mm256_and_ps(velec,cutoff_mask);
602             velecsum         = _mm256_add_ps(velecsum,velec);
603
604             fscal            = felec;
605
606             fscal            = _mm256_and_ps(fscal,cutoff_mask);
607
608             /* Calculate temporary vectorial force */
609             tx               = _mm256_mul_ps(fscal,dx31);
610             ty               = _mm256_mul_ps(fscal,dy31);
611             tz               = _mm256_mul_ps(fscal,dz31);
612
613             /* Update vectorial force */
614             fix3             = _mm256_add_ps(fix3,tx);
615             fiy3             = _mm256_add_ps(fiy3,ty);
616             fiz3             = _mm256_add_ps(fiz3,tz);
617
618             fjx1             = _mm256_add_ps(fjx1,tx);
619             fjy1             = _mm256_add_ps(fjy1,ty);
620             fjz1             = _mm256_add_ps(fjz1,tz);
621
622             }
623
624             /**************************
625              * CALCULATE INTERACTIONS *
626              **************************/
627
628             if (gmx_mm256_any_lt(rsq32,rcutoff2))
629             {
630
631             /* REACTION-FIELD ELECTROSTATICS */
632             velec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
633             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
634
635             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
636
637             /* Update potential sum for this i atom from the interaction with this j atom. */
638             velec            = _mm256_and_ps(velec,cutoff_mask);
639             velecsum         = _mm256_add_ps(velecsum,velec);
640
641             fscal            = felec;
642
643             fscal            = _mm256_and_ps(fscal,cutoff_mask);
644
645             /* Calculate temporary vectorial force */
646             tx               = _mm256_mul_ps(fscal,dx32);
647             ty               = _mm256_mul_ps(fscal,dy32);
648             tz               = _mm256_mul_ps(fscal,dz32);
649
650             /* Update vectorial force */
651             fix3             = _mm256_add_ps(fix3,tx);
652             fiy3             = _mm256_add_ps(fiy3,ty);
653             fiz3             = _mm256_add_ps(fiz3,tz);
654
655             fjx2             = _mm256_add_ps(fjx2,tx);
656             fjy2             = _mm256_add_ps(fjy2,ty);
657             fjz2             = _mm256_add_ps(fjz2,tz);
658
659             }
660
661             /**************************
662              * CALCULATE INTERACTIONS *
663              **************************/
664
665             if (gmx_mm256_any_lt(rsq33,rcutoff2))
666             {
667
668             /* REACTION-FIELD ELECTROSTATICS */
669             velec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
670             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
671
672             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
673
674             /* Update potential sum for this i atom from the interaction with this j atom. */
675             velec            = _mm256_and_ps(velec,cutoff_mask);
676             velecsum         = _mm256_add_ps(velecsum,velec);
677
678             fscal            = felec;
679
680             fscal            = _mm256_and_ps(fscal,cutoff_mask);
681
682             /* Calculate temporary vectorial force */
683             tx               = _mm256_mul_ps(fscal,dx33);
684             ty               = _mm256_mul_ps(fscal,dy33);
685             tz               = _mm256_mul_ps(fscal,dz33);
686
687             /* Update vectorial force */
688             fix3             = _mm256_add_ps(fix3,tx);
689             fiy3             = _mm256_add_ps(fiy3,ty);
690             fiz3             = _mm256_add_ps(fiz3,tz);
691
692             fjx3             = _mm256_add_ps(fjx3,tx);
693             fjy3             = _mm256_add_ps(fjy3,ty);
694             fjz3             = _mm256_add_ps(fjz3,tz);
695
696             }
697
698             fjptrA             = f+j_coord_offsetA;
699             fjptrB             = f+j_coord_offsetB;
700             fjptrC             = f+j_coord_offsetC;
701             fjptrD             = f+j_coord_offsetD;
702             fjptrE             = f+j_coord_offsetE;
703             fjptrF             = f+j_coord_offsetF;
704             fjptrG             = f+j_coord_offsetG;
705             fjptrH             = f+j_coord_offsetH;
706
707             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
708                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
709                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
710
711             /* Inner loop uses 368 flops */
712         }
713
714         if(jidx<j_index_end)
715         {
716
717             /* Get j neighbor index, and coordinate index */
718             jnrlistA         = jjnr[jidx];
719             jnrlistB         = jjnr[jidx+1];
720             jnrlistC         = jjnr[jidx+2];
721             jnrlistD         = jjnr[jidx+3];
722             jnrlistE         = jjnr[jidx+4];
723             jnrlistF         = jjnr[jidx+5];
724             jnrlistG         = jjnr[jidx+6];
725             jnrlistH         = jjnr[jidx+7];
726             /* Sign of each element will be negative for non-real atoms.
727              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
728              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
729              */
730             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
731                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
732                                             
733             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
734             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
735             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
736             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
737             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
738             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
739             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
740             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
741             j_coord_offsetA  = DIM*jnrA;
742             j_coord_offsetB  = DIM*jnrB;
743             j_coord_offsetC  = DIM*jnrC;
744             j_coord_offsetD  = DIM*jnrD;
745             j_coord_offsetE  = DIM*jnrE;
746             j_coord_offsetF  = DIM*jnrF;
747             j_coord_offsetG  = DIM*jnrG;
748             j_coord_offsetH  = DIM*jnrH;
749
750             /* load j atom coordinates */
751             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
752                                                  x+j_coord_offsetC,x+j_coord_offsetD,
753                                                  x+j_coord_offsetE,x+j_coord_offsetF,
754                                                  x+j_coord_offsetG,x+j_coord_offsetH,
755                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
756                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
757
758             /* Calculate displacement vector */
759             dx00             = _mm256_sub_ps(ix0,jx0);
760             dy00             = _mm256_sub_ps(iy0,jy0);
761             dz00             = _mm256_sub_ps(iz0,jz0);
762             dx11             = _mm256_sub_ps(ix1,jx1);
763             dy11             = _mm256_sub_ps(iy1,jy1);
764             dz11             = _mm256_sub_ps(iz1,jz1);
765             dx12             = _mm256_sub_ps(ix1,jx2);
766             dy12             = _mm256_sub_ps(iy1,jy2);
767             dz12             = _mm256_sub_ps(iz1,jz2);
768             dx13             = _mm256_sub_ps(ix1,jx3);
769             dy13             = _mm256_sub_ps(iy1,jy3);
770             dz13             = _mm256_sub_ps(iz1,jz3);
771             dx21             = _mm256_sub_ps(ix2,jx1);
772             dy21             = _mm256_sub_ps(iy2,jy1);
773             dz21             = _mm256_sub_ps(iz2,jz1);
774             dx22             = _mm256_sub_ps(ix2,jx2);
775             dy22             = _mm256_sub_ps(iy2,jy2);
776             dz22             = _mm256_sub_ps(iz2,jz2);
777             dx23             = _mm256_sub_ps(ix2,jx3);
778             dy23             = _mm256_sub_ps(iy2,jy3);
779             dz23             = _mm256_sub_ps(iz2,jz3);
780             dx31             = _mm256_sub_ps(ix3,jx1);
781             dy31             = _mm256_sub_ps(iy3,jy1);
782             dz31             = _mm256_sub_ps(iz3,jz1);
783             dx32             = _mm256_sub_ps(ix3,jx2);
784             dy32             = _mm256_sub_ps(iy3,jy2);
785             dz32             = _mm256_sub_ps(iz3,jz2);
786             dx33             = _mm256_sub_ps(ix3,jx3);
787             dy33             = _mm256_sub_ps(iy3,jy3);
788             dz33             = _mm256_sub_ps(iz3,jz3);
789
790             /* Calculate squared distance and things based on it */
791             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
792             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
793             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
794             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
795             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
796             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
797             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
798             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
799             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
800             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
801
802             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
803             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
804             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
805             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
806             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
807             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
808             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
809             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
810             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
811
812             rinvsq00         = gmx_mm256_inv_ps(rsq00);
813             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
814             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
815             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
816             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
817             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
818             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
819             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
820             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
821             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
822
823             fjx0             = _mm256_setzero_ps();
824             fjy0             = _mm256_setzero_ps();
825             fjz0             = _mm256_setzero_ps();
826             fjx1             = _mm256_setzero_ps();
827             fjy1             = _mm256_setzero_ps();
828             fjz1             = _mm256_setzero_ps();
829             fjx2             = _mm256_setzero_ps();
830             fjy2             = _mm256_setzero_ps();
831             fjz2             = _mm256_setzero_ps();
832             fjx3             = _mm256_setzero_ps();
833             fjy3             = _mm256_setzero_ps();
834             fjz3             = _mm256_setzero_ps();
835
836             /**************************
837              * CALCULATE INTERACTIONS *
838              **************************/
839
840             if (gmx_mm256_any_lt(rsq00,rcutoff2))
841             {
842
843             /* LENNARD-JONES DISPERSION/REPULSION */
844
845             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
846             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
847             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
848             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
849                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
850             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
851
852             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
853
854             /* Update potential sum for this i atom from the interaction with this j atom. */
855             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
856             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
857             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
858
859             fscal            = fvdw;
860
861             fscal            = _mm256_and_ps(fscal,cutoff_mask);
862
863             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
864
865             /* Calculate temporary vectorial force */
866             tx               = _mm256_mul_ps(fscal,dx00);
867             ty               = _mm256_mul_ps(fscal,dy00);
868             tz               = _mm256_mul_ps(fscal,dz00);
869
870             /* Update vectorial force */
871             fix0             = _mm256_add_ps(fix0,tx);
872             fiy0             = _mm256_add_ps(fiy0,ty);
873             fiz0             = _mm256_add_ps(fiz0,tz);
874
875             fjx0             = _mm256_add_ps(fjx0,tx);
876             fjy0             = _mm256_add_ps(fjy0,ty);
877             fjz0             = _mm256_add_ps(fjz0,tz);
878
879             }
880
881             /**************************
882              * CALCULATE INTERACTIONS *
883              **************************/
884
885             if (gmx_mm256_any_lt(rsq11,rcutoff2))
886             {
887
888             /* REACTION-FIELD ELECTROSTATICS */
889             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
890             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
891
892             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
893
894             /* Update potential sum for this i atom from the interaction with this j atom. */
895             velec            = _mm256_and_ps(velec,cutoff_mask);
896             velec            = _mm256_andnot_ps(dummy_mask,velec);
897             velecsum         = _mm256_add_ps(velecsum,velec);
898
899             fscal            = felec;
900
901             fscal            = _mm256_and_ps(fscal,cutoff_mask);
902
903             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
904
905             /* Calculate temporary vectorial force */
906             tx               = _mm256_mul_ps(fscal,dx11);
907             ty               = _mm256_mul_ps(fscal,dy11);
908             tz               = _mm256_mul_ps(fscal,dz11);
909
910             /* Update vectorial force */
911             fix1             = _mm256_add_ps(fix1,tx);
912             fiy1             = _mm256_add_ps(fiy1,ty);
913             fiz1             = _mm256_add_ps(fiz1,tz);
914
915             fjx1             = _mm256_add_ps(fjx1,tx);
916             fjy1             = _mm256_add_ps(fjy1,ty);
917             fjz1             = _mm256_add_ps(fjz1,tz);
918
919             }
920
921             /**************************
922              * CALCULATE INTERACTIONS *
923              **************************/
924
925             if (gmx_mm256_any_lt(rsq12,rcutoff2))
926             {
927
928             /* REACTION-FIELD ELECTROSTATICS */
929             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
930             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
931
932             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
933
934             /* Update potential sum for this i atom from the interaction with this j atom. */
935             velec            = _mm256_and_ps(velec,cutoff_mask);
936             velec            = _mm256_andnot_ps(dummy_mask,velec);
937             velecsum         = _mm256_add_ps(velecsum,velec);
938
939             fscal            = felec;
940
941             fscal            = _mm256_and_ps(fscal,cutoff_mask);
942
943             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
944
945             /* Calculate temporary vectorial force */
946             tx               = _mm256_mul_ps(fscal,dx12);
947             ty               = _mm256_mul_ps(fscal,dy12);
948             tz               = _mm256_mul_ps(fscal,dz12);
949
950             /* Update vectorial force */
951             fix1             = _mm256_add_ps(fix1,tx);
952             fiy1             = _mm256_add_ps(fiy1,ty);
953             fiz1             = _mm256_add_ps(fiz1,tz);
954
955             fjx2             = _mm256_add_ps(fjx2,tx);
956             fjy2             = _mm256_add_ps(fjy2,ty);
957             fjz2             = _mm256_add_ps(fjz2,tz);
958
959             }
960
961             /**************************
962              * CALCULATE INTERACTIONS *
963              **************************/
964
965             if (gmx_mm256_any_lt(rsq13,rcutoff2))
966             {
967
968             /* REACTION-FIELD ELECTROSTATICS */
969             velec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
970             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
971
972             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
973
974             /* Update potential sum for this i atom from the interaction with this j atom. */
975             velec            = _mm256_and_ps(velec,cutoff_mask);
976             velec            = _mm256_andnot_ps(dummy_mask,velec);
977             velecsum         = _mm256_add_ps(velecsum,velec);
978
979             fscal            = felec;
980
981             fscal            = _mm256_and_ps(fscal,cutoff_mask);
982
983             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
984
985             /* Calculate temporary vectorial force */
986             tx               = _mm256_mul_ps(fscal,dx13);
987             ty               = _mm256_mul_ps(fscal,dy13);
988             tz               = _mm256_mul_ps(fscal,dz13);
989
990             /* Update vectorial force */
991             fix1             = _mm256_add_ps(fix1,tx);
992             fiy1             = _mm256_add_ps(fiy1,ty);
993             fiz1             = _mm256_add_ps(fiz1,tz);
994
995             fjx3             = _mm256_add_ps(fjx3,tx);
996             fjy3             = _mm256_add_ps(fjy3,ty);
997             fjz3             = _mm256_add_ps(fjz3,tz);
998
999             }
1000
1001             /**************************
1002              * CALCULATE INTERACTIONS *
1003              **************************/
1004
1005             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1006             {
1007
1008             /* REACTION-FIELD ELECTROSTATICS */
1009             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
1010             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1011
1012             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1013
1014             /* Update potential sum for this i atom from the interaction with this j atom. */
1015             velec            = _mm256_and_ps(velec,cutoff_mask);
1016             velec            = _mm256_andnot_ps(dummy_mask,velec);
1017             velecsum         = _mm256_add_ps(velecsum,velec);
1018
1019             fscal            = felec;
1020
1021             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1022
1023             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1024
1025             /* Calculate temporary vectorial force */
1026             tx               = _mm256_mul_ps(fscal,dx21);
1027             ty               = _mm256_mul_ps(fscal,dy21);
1028             tz               = _mm256_mul_ps(fscal,dz21);
1029
1030             /* Update vectorial force */
1031             fix2             = _mm256_add_ps(fix2,tx);
1032             fiy2             = _mm256_add_ps(fiy2,ty);
1033             fiz2             = _mm256_add_ps(fiz2,tz);
1034
1035             fjx1             = _mm256_add_ps(fjx1,tx);
1036             fjy1             = _mm256_add_ps(fjy1,ty);
1037             fjz1             = _mm256_add_ps(fjz1,tz);
1038
1039             }
1040
1041             /**************************
1042              * CALCULATE INTERACTIONS *
1043              **************************/
1044
1045             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1046             {
1047
1048             /* REACTION-FIELD ELECTROSTATICS */
1049             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
1050             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1051
1052             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1053
1054             /* Update potential sum for this i atom from the interaction with this j atom. */
1055             velec            = _mm256_and_ps(velec,cutoff_mask);
1056             velec            = _mm256_andnot_ps(dummy_mask,velec);
1057             velecsum         = _mm256_add_ps(velecsum,velec);
1058
1059             fscal            = felec;
1060
1061             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1062
1063             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1064
1065             /* Calculate temporary vectorial force */
1066             tx               = _mm256_mul_ps(fscal,dx22);
1067             ty               = _mm256_mul_ps(fscal,dy22);
1068             tz               = _mm256_mul_ps(fscal,dz22);
1069
1070             /* Update vectorial force */
1071             fix2             = _mm256_add_ps(fix2,tx);
1072             fiy2             = _mm256_add_ps(fiy2,ty);
1073             fiz2             = _mm256_add_ps(fiz2,tz);
1074
1075             fjx2             = _mm256_add_ps(fjx2,tx);
1076             fjy2             = _mm256_add_ps(fjy2,ty);
1077             fjz2             = _mm256_add_ps(fjz2,tz);
1078
1079             }
1080
1081             /**************************
1082              * CALCULATE INTERACTIONS *
1083              **************************/
1084
1085             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1086             {
1087
1088             /* REACTION-FIELD ELECTROSTATICS */
1089             velec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
1090             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1091
1092             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1093
1094             /* Update potential sum for this i atom from the interaction with this j atom. */
1095             velec            = _mm256_and_ps(velec,cutoff_mask);
1096             velec            = _mm256_andnot_ps(dummy_mask,velec);
1097             velecsum         = _mm256_add_ps(velecsum,velec);
1098
1099             fscal            = felec;
1100
1101             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1102
1103             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1104
1105             /* Calculate temporary vectorial force */
1106             tx               = _mm256_mul_ps(fscal,dx23);
1107             ty               = _mm256_mul_ps(fscal,dy23);
1108             tz               = _mm256_mul_ps(fscal,dz23);
1109
1110             /* Update vectorial force */
1111             fix2             = _mm256_add_ps(fix2,tx);
1112             fiy2             = _mm256_add_ps(fiy2,ty);
1113             fiz2             = _mm256_add_ps(fiz2,tz);
1114
1115             fjx3             = _mm256_add_ps(fjx3,tx);
1116             fjy3             = _mm256_add_ps(fjy3,ty);
1117             fjz3             = _mm256_add_ps(fjz3,tz);
1118
1119             }
1120
1121             /**************************
1122              * CALCULATE INTERACTIONS *
1123              **************************/
1124
1125             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1126             {
1127
1128             /* REACTION-FIELD ELECTROSTATICS */
1129             velec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
1130             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1131
1132             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1133
1134             /* Update potential sum for this i atom from the interaction with this j atom. */
1135             velec            = _mm256_and_ps(velec,cutoff_mask);
1136             velec            = _mm256_andnot_ps(dummy_mask,velec);
1137             velecsum         = _mm256_add_ps(velecsum,velec);
1138
1139             fscal            = felec;
1140
1141             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1142
1143             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1144
1145             /* Calculate temporary vectorial force */
1146             tx               = _mm256_mul_ps(fscal,dx31);
1147             ty               = _mm256_mul_ps(fscal,dy31);
1148             tz               = _mm256_mul_ps(fscal,dz31);
1149
1150             /* Update vectorial force */
1151             fix3             = _mm256_add_ps(fix3,tx);
1152             fiy3             = _mm256_add_ps(fiy3,ty);
1153             fiz3             = _mm256_add_ps(fiz3,tz);
1154
1155             fjx1             = _mm256_add_ps(fjx1,tx);
1156             fjy1             = _mm256_add_ps(fjy1,ty);
1157             fjz1             = _mm256_add_ps(fjz1,tz);
1158
1159             }
1160
1161             /**************************
1162              * CALCULATE INTERACTIONS *
1163              **************************/
1164
1165             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1166             {
1167
1168             /* REACTION-FIELD ELECTROSTATICS */
1169             velec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
1170             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1171
1172             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1173
1174             /* Update potential sum for this i atom from the interaction with this j atom. */
1175             velec            = _mm256_and_ps(velec,cutoff_mask);
1176             velec            = _mm256_andnot_ps(dummy_mask,velec);
1177             velecsum         = _mm256_add_ps(velecsum,velec);
1178
1179             fscal            = felec;
1180
1181             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1182
1183             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1184
1185             /* Calculate temporary vectorial force */
1186             tx               = _mm256_mul_ps(fscal,dx32);
1187             ty               = _mm256_mul_ps(fscal,dy32);
1188             tz               = _mm256_mul_ps(fscal,dz32);
1189
1190             /* Update vectorial force */
1191             fix3             = _mm256_add_ps(fix3,tx);
1192             fiy3             = _mm256_add_ps(fiy3,ty);
1193             fiz3             = _mm256_add_ps(fiz3,tz);
1194
1195             fjx2             = _mm256_add_ps(fjx2,tx);
1196             fjy2             = _mm256_add_ps(fjy2,ty);
1197             fjz2             = _mm256_add_ps(fjz2,tz);
1198
1199             }
1200
1201             /**************************
1202              * CALCULATE INTERACTIONS *
1203              **************************/
1204
1205             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1206             {
1207
1208             /* REACTION-FIELD ELECTROSTATICS */
1209             velec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
1210             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1211
1212             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1213
1214             /* Update potential sum for this i atom from the interaction with this j atom. */
1215             velec            = _mm256_and_ps(velec,cutoff_mask);
1216             velec            = _mm256_andnot_ps(dummy_mask,velec);
1217             velecsum         = _mm256_add_ps(velecsum,velec);
1218
1219             fscal            = felec;
1220
1221             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1222
1223             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1224
1225             /* Calculate temporary vectorial force */
1226             tx               = _mm256_mul_ps(fscal,dx33);
1227             ty               = _mm256_mul_ps(fscal,dy33);
1228             tz               = _mm256_mul_ps(fscal,dz33);
1229
1230             /* Update vectorial force */
1231             fix3             = _mm256_add_ps(fix3,tx);
1232             fiy3             = _mm256_add_ps(fiy3,ty);
1233             fiz3             = _mm256_add_ps(fiz3,tz);
1234
1235             fjx3             = _mm256_add_ps(fjx3,tx);
1236             fjy3             = _mm256_add_ps(fjy3,ty);
1237             fjz3             = _mm256_add_ps(fjz3,tz);
1238
1239             }
1240
1241             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1242             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1243             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1244             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1245             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1246             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1247             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1248             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1249
1250             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1251                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1252                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1253
1254             /* Inner loop uses 368 flops */
1255         }
1256
1257         /* End of innermost loop */
1258
1259         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1260                                                  f+i_coord_offset,fshift+i_shift_offset);
1261
1262         ggid                        = gid[iidx];
1263         /* Update potential energies */
1264         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1265         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1266
1267         /* Increment number of inner iterations */
1268         inneriter                  += j_index_end - j_index_start;
1269
1270         /* Outer loop uses 26 flops */
1271     }
1272
1273     /* Increment number of outer iterations */
1274     outeriter        += nri;
1275
1276     /* Update outer/inner flops */
1277
1278     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*368);
1279 }
1280 /*
1281  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_single
1282  * Electrostatics interaction: ReactionField
1283  * VdW interaction:            LennardJones
1284  * Geometry:                   Water4-Water4
1285  * Calculate force/pot:        Force
1286  */
1287 void
1288 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_single
1289                     (t_nblist * gmx_restrict                nlist,
1290                      rvec * gmx_restrict                    xx,
1291                      rvec * gmx_restrict                    ff,
1292                      t_forcerec * gmx_restrict              fr,
1293                      t_mdatoms * gmx_restrict               mdatoms,
1294                      nb_kernel_data_t * gmx_restrict        kernel_data,
1295                      t_nrnb * gmx_restrict                  nrnb)
1296 {
1297     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1298      * just 0 for non-waters.
1299      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1300      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1301      */
1302     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1303     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1304     int              jnrA,jnrB,jnrC,jnrD;
1305     int              jnrE,jnrF,jnrG,jnrH;
1306     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1307     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1308     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1309     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1310     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1311     real             rcutoff_scalar;
1312     real             *shiftvec,*fshift,*x,*f;
1313     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1314     real             scratch[4*DIM];
1315     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1316     real *           vdwioffsetptr0;
1317     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1318     real *           vdwioffsetptr1;
1319     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1320     real *           vdwioffsetptr2;
1321     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1322     real *           vdwioffsetptr3;
1323     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1324     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1325     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1326     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1327     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1328     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1329     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1330     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1331     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1332     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1333     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1334     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1335     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1336     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1337     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1338     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1339     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1340     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1341     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1342     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1343     real             *charge;
1344     int              nvdwtype;
1345     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1346     int              *vdwtype;
1347     real             *vdwparam;
1348     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1349     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1350     __m256           dummy_mask,cutoff_mask;
1351     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1352     __m256           one     = _mm256_set1_ps(1.0);
1353     __m256           two     = _mm256_set1_ps(2.0);
1354     x                = xx[0];
1355     f                = ff[0];
1356
1357     nri              = nlist->nri;
1358     iinr             = nlist->iinr;
1359     jindex           = nlist->jindex;
1360     jjnr             = nlist->jjnr;
1361     shiftidx         = nlist->shift;
1362     gid              = nlist->gid;
1363     shiftvec         = fr->shift_vec[0];
1364     fshift           = fr->fshift[0];
1365     facel            = _mm256_set1_ps(fr->epsfac);
1366     charge           = mdatoms->chargeA;
1367     krf              = _mm256_set1_ps(fr->ic->k_rf);
1368     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
1369     crf              = _mm256_set1_ps(fr->ic->c_rf);
1370     nvdwtype         = fr->ntype;
1371     vdwparam         = fr->nbfp;
1372     vdwtype          = mdatoms->typeA;
1373
1374     /* Setup water-specific parameters */
1375     inr              = nlist->iinr[0];
1376     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1377     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1378     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1379     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1380
1381     jq1              = _mm256_set1_ps(charge[inr+1]);
1382     jq2              = _mm256_set1_ps(charge[inr+2]);
1383     jq3              = _mm256_set1_ps(charge[inr+3]);
1384     vdwjidx0A        = 2*vdwtype[inr+0];
1385     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1386     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1387     qq11             = _mm256_mul_ps(iq1,jq1);
1388     qq12             = _mm256_mul_ps(iq1,jq2);
1389     qq13             = _mm256_mul_ps(iq1,jq3);
1390     qq21             = _mm256_mul_ps(iq2,jq1);
1391     qq22             = _mm256_mul_ps(iq2,jq2);
1392     qq23             = _mm256_mul_ps(iq2,jq3);
1393     qq31             = _mm256_mul_ps(iq3,jq1);
1394     qq32             = _mm256_mul_ps(iq3,jq2);
1395     qq33             = _mm256_mul_ps(iq3,jq3);
1396
1397     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1398     rcutoff_scalar   = fr->rcoulomb;
1399     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1400     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1401
1402     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
1403     rvdw             = _mm256_set1_ps(fr->rvdw);
1404
1405     /* Avoid stupid compiler warnings */
1406     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1407     j_coord_offsetA = 0;
1408     j_coord_offsetB = 0;
1409     j_coord_offsetC = 0;
1410     j_coord_offsetD = 0;
1411     j_coord_offsetE = 0;
1412     j_coord_offsetF = 0;
1413     j_coord_offsetG = 0;
1414     j_coord_offsetH = 0;
1415
1416     outeriter        = 0;
1417     inneriter        = 0;
1418
1419     for(iidx=0;iidx<4*DIM;iidx++)
1420     {
1421         scratch[iidx] = 0.0;
1422     }
1423
1424     /* Start outer loop over neighborlists */
1425     for(iidx=0; iidx<nri; iidx++)
1426     {
1427         /* Load shift vector for this list */
1428         i_shift_offset   = DIM*shiftidx[iidx];
1429
1430         /* Load limits for loop over neighbors */
1431         j_index_start    = jindex[iidx];
1432         j_index_end      = jindex[iidx+1];
1433
1434         /* Get outer coordinate index */
1435         inr              = iinr[iidx];
1436         i_coord_offset   = DIM*inr;
1437
1438         /* Load i particle coords and add shift vector */
1439         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1440                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1441
1442         fix0             = _mm256_setzero_ps();
1443         fiy0             = _mm256_setzero_ps();
1444         fiz0             = _mm256_setzero_ps();
1445         fix1             = _mm256_setzero_ps();
1446         fiy1             = _mm256_setzero_ps();
1447         fiz1             = _mm256_setzero_ps();
1448         fix2             = _mm256_setzero_ps();
1449         fiy2             = _mm256_setzero_ps();
1450         fiz2             = _mm256_setzero_ps();
1451         fix3             = _mm256_setzero_ps();
1452         fiy3             = _mm256_setzero_ps();
1453         fiz3             = _mm256_setzero_ps();
1454
1455         /* Start inner kernel loop */
1456         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1457         {
1458
1459             /* Get j neighbor index, and coordinate index */
1460             jnrA             = jjnr[jidx];
1461             jnrB             = jjnr[jidx+1];
1462             jnrC             = jjnr[jidx+2];
1463             jnrD             = jjnr[jidx+3];
1464             jnrE             = jjnr[jidx+4];
1465             jnrF             = jjnr[jidx+5];
1466             jnrG             = jjnr[jidx+6];
1467             jnrH             = jjnr[jidx+7];
1468             j_coord_offsetA  = DIM*jnrA;
1469             j_coord_offsetB  = DIM*jnrB;
1470             j_coord_offsetC  = DIM*jnrC;
1471             j_coord_offsetD  = DIM*jnrD;
1472             j_coord_offsetE  = DIM*jnrE;
1473             j_coord_offsetF  = DIM*jnrF;
1474             j_coord_offsetG  = DIM*jnrG;
1475             j_coord_offsetH  = DIM*jnrH;
1476
1477             /* load j atom coordinates */
1478             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1479                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1480                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1481                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1482                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1483                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1484
1485             /* Calculate displacement vector */
1486             dx00             = _mm256_sub_ps(ix0,jx0);
1487             dy00             = _mm256_sub_ps(iy0,jy0);
1488             dz00             = _mm256_sub_ps(iz0,jz0);
1489             dx11             = _mm256_sub_ps(ix1,jx1);
1490             dy11             = _mm256_sub_ps(iy1,jy1);
1491             dz11             = _mm256_sub_ps(iz1,jz1);
1492             dx12             = _mm256_sub_ps(ix1,jx2);
1493             dy12             = _mm256_sub_ps(iy1,jy2);
1494             dz12             = _mm256_sub_ps(iz1,jz2);
1495             dx13             = _mm256_sub_ps(ix1,jx3);
1496             dy13             = _mm256_sub_ps(iy1,jy3);
1497             dz13             = _mm256_sub_ps(iz1,jz3);
1498             dx21             = _mm256_sub_ps(ix2,jx1);
1499             dy21             = _mm256_sub_ps(iy2,jy1);
1500             dz21             = _mm256_sub_ps(iz2,jz1);
1501             dx22             = _mm256_sub_ps(ix2,jx2);
1502             dy22             = _mm256_sub_ps(iy2,jy2);
1503             dz22             = _mm256_sub_ps(iz2,jz2);
1504             dx23             = _mm256_sub_ps(ix2,jx3);
1505             dy23             = _mm256_sub_ps(iy2,jy3);
1506             dz23             = _mm256_sub_ps(iz2,jz3);
1507             dx31             = _mm256_sub_ps(ix3,jx1);
1508             dy31             = _mm256_sub_ps(iy3,jy1);
1509             dz31             = _mm256_sub_ps(iz3,jz1);
1510             dx32             = _mm256_sub_ps(ix3,jx2);
1511             dy32             = _mm256_sub_ps(iy3,jy2);
1512             dz32             = _mm256_sub_ps(iz3,jz2);
1513             dx33             = _mm256_sub_ps(ix3,jx3);
1514             dy33             = _mm256_sub_ps(iy3,jy3);
1515             dz33             = _mm256_sub_ps(iz3,jz3);
1516
1517             /* Calculate squared distance and things based on it */
1518             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1519             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1520             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1521             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1522             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1523             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1524             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1525             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1526             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1527             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1528
1529             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1530             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1531             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1532             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1533             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1534             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1535             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1536             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1537             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1538
1539             rinvsq00         = gmx_mm256_inv_ps(rsq00);
1540             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1541             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1542             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1543             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1544             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1545             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1546             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1547             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1548             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1549
1550             fjx0             = _mm256_setzero_ps();
1551             fjy0             = _mm256_setzero_ps();
1552             fjz0             = _mm256_setzero_ps();
1553             fjx1             = _mm256_setzero_ps();
1554             fjy1             = _mm256_setzero_ps();
1555             fjz1             = _mm256_setzero_ps();
1556             fjx2             = _mm256_setzero_ps();
1557             fjy2             = _mm256_setzero_ps();
1558             fjz2             = _mm256_setzero_ps();
1559             fjx3             = _mm256_setzero_ps();
1560             fjy3             = _mm256_setzero_ps();
1561             fjz3             = _mm256_setzero_ps();
1562
1563             /**************************
1564              * CALCULATE INTERACTIONS *
1565              **************************/
1566
1567             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1568             {
1569
1570             /* LENNARD-JONES DISPERSION/REPULSION */
1571
1572             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1573             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1574
1575             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1576
1577             fscal            = fvdw;
1578
1579             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1580
1581             /* Calculate temporary vectorial force */
1582             tx               = _mm256_mul_ps(fscal,dx00);
1583             ty               = _mm256_mul_ps(fscal,dy00);
1584             tz               = _mm256_mul_ps(fscal,dz00);
1585
1586             /* Update vectorial force */
1587             fix0             = _mm256_add_ps(fix0,tx);
1588             fiy0             = _mm256_add_ps(fiy0,ty);
1589             fiz0             = _mm256_add_ps(fiz0,tz);
1590
1591             fjx0             = _mm256_add_ps(fjx0,tx);
1592             fjy0             = _mm256_add_ps(fjy0,ty);
1593             fjz0             = _mm256_add_ps(fjz0,tz);
1594
1595             }
1596
1597             /**************************
1598              * CALCULATE INTERACTIONS *
1599              **************************/
1600
1601             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1602             {
1603
1604             /* REACTION-FIELD ELECTROSTATICS */
1605             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1606
1607             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1608
1609             fscal            = felec;
1610
1611             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1612
1613             /* Calculate temporary vectorial force */
1614             tx               = _mm256_mul_ps(fscal,dx11);
1615             ty               = _mm256_mul_ps(fscal,dy11);
1616             tz               = _mm256_mul_ps(fscal,dz11);
1617
1618             /* Update vectorial force */
1619             fix1             = _mm256_add_ps(fix1,tx);
1620             fiy1             = _mm256_add_ps(fiy1,ty);
1621             fiz1             = _mm256_add_ps(fiz1,tz);
1622
1623             fjx1             = _mm256_add_ps(fjx1,tx);
1624             fjy1             = _mm256_add_ps(fjy1,ty);
1625             fjz1             = _mm256_add_ps(fjz1,tz);
1626
1627             }
1628
1629             /**************************
1630              * CALCULATE INTERACTIONS *
1631              **************************/
1632
1633             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1634             {
1635
1636             /* REACTION-FIELD ELECTROSTATICS */
1637             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1638
1639             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1640
1641             fscal            = felec;
1642
1643             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1644
1645             /* Calculate temporary vectorial force */
1646             tx               = _mm256_mul_ps(fscal,dx12);
1647             ty               = _mm256_mul_ps(fscal,dy12);
1648             tz               = _mm256_mul_ps(fscal,dz12);
1649
1650             /* Update vectorial force */
1651             fix1             = _mm256_add_ps(fix1,tx);
1652             fiy1             = _mm256_add_ps(fiy1,ty);
1653             fiz1             = _mm256_add_ps(fiz1,tz);
1654
1655             fjx2             = _mm256_add_ps(fjx2,tx);
1656             fjy2             = _mm256_add_ps(fjy2,ty);
1657             fjz2             = _mm256_add_ps(fjz2,tz);
1658
1659             }
1660
1661             /**************************
1662              * CALCULATE INTERACTIONS *
1663              **************************/
1664
1665             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1666             {
1667
1668             /* REACTION-FIELD ELECTROSTATICS */
1669             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1670
1671             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1672
1673             fscal            = felec;
1674
1675             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1676
1677             /* Calculate temporary vectorial force */
1678             tx               = _mm256_mul_ps(fscal,dx13);
1679             ty               = _mm256_mul_ps(fscal,dy13);
1680             tz               = _mm256_mul_ps(fscal,dz13);
1681
1682             /* Update vectorial force */
1683             fix1             = _mm256_add_ps(fix1,tx);
1684             fiy1             = _mm256_add_ps(fiy1,ty);
1685             fiz1             = _mm256_add_ps(fiz1,tz);
1686
1687             fjx3             = _mm256_add_ps(fjx3,tx);
1688             fjy3             = _mm256_add_ps(fjy3,ty);
1689             fjz3             = _mm256_add_ps(fjz3,tz);
1690
1691             }
1692
1693             /**************************
1694              * CALCULATE INTERACTIONS *
1695              **************************/
1696
1697             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1698             {
1699
1700             /* REACTION-FIELD ELECTROSTATICS */
1701             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1702
1703             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1704
1705             fscal            = felec;
1706
1707             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1708
1709             /* Calculate temporary vectorial force */
1710             tx               = _mm256_mul_ps(fscal,dx21);
1711             ty               = _mm256_mul_ps(fscal,dy21);
1712             tz               = _mm256_mul_ps(fscal,dz21);
1713
1714             /* Update vectorial force */
1715             fix2             = _mm256_add_ps(fix2,tx);
1716             fiy2             = _mm256_add_ps(fiy2,ty);
1717             fiz2             = _mm256_add_ps(fiz2,tz);
1718
1719             fjx1             = _mm256_add_ps(fjx1,tx);
1720             fjy1             = _mm256_add_ps(fjy1,ty);
1721             fjz1             = _mm256_add_ps(fjz1,tz);
1722
1723             }
1724
1725             /**************************
1726              * CALCULATE INTERACTIONS *
1727              **************************/
1728
1729             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1730             {
1731
1732             /* REACTION-FIELD ELECTROSTATICS */
1733             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1734
1735             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1736
1737             fscal            = felec;
1738
1739             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1740
1741             /* Calculate temporary vectorial force */
1742             tx               = _mm256_mul_ps(fscal,dx22);
1743             ty               = _mm256_mul_ps(fscal,dy22);
1744             tz               = _mm256_mul_ps(fscal,dz22);
1745
1746             /* Update vectorial force */
1747             fix2             = _mm256_add_ps(fix2,tx);
1748             fiy2             = _mm256_add_ps(fiy2,ty);
1749             fiz2             = _mm256_add_ps(fiz2,tz);
1750
1751             fjx2             = _mm256_add_ps(fjx2,tx);
1752             fjy2             = _mm256_add_ps(fjy2,ty);
1753             fjz2             = _mm256_add_ps(fjz2,tz);
1754
1755             }
1756
1757             /**************************
1758              * CALCULATE INTERACTIONS *
1759              **************************/
1760
1761             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1762             {
1763
1764             /* REACTION-FIELD ELECTROSTATICS */
1765             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1766
1767             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1768
1769             fscal            = felec;
1770
1771             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1772
1773             /* Calculate temporary vectorial force */
1774             tx               = _mm256_mul_ps(fscal,dx23);
1775             ty               = _mm256_mul_ps(fscal,dy23);
1776             tz               = _mm256_mul_ps(fscal,dz23);
1777
1778             /* Update vectorial force */
1779             fix2             = _mm256_add_ps(fix2,tx);
1780             fiy2             = _mm256_add_ps(fiy2,ty);
1781             fiz2             = _mm256_add_ps(fiz2,tz);
1782
1783             fjx3             = _mm256_add_ps(fjx3,tx);
1784             fjy3             = _mm256_add_ps(fjy3,ty);
1785             fjz3             = _mm256_add_ps(fjz3,tz);
1786
1787             }
1788
1789             /**************************
1790              * CALCULATE INTERACTIONS *
1791              **************************/
1792
1793             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1794             {
1795
1796             /* REACTION-FIELD ELECTROSTATICS */
1797             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1798
1799             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1800
1801             fscal            = felec;
1802
1803             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1804
1805             /* Calculate temporary vectorial force */
1806             tx               = _mm256_mul_ps(fscal,dx31);
1807             ty               = _mm256_mul_ps(fscal,dy31);
1808             tz               = _mm256_mul_ps(fscal,dz31);
1809
1810             /* Update vectorial force */
1811             fix3             = _mm256_add_ps(fix3,tx);
1812             fiy3             = _mm256_add_ps(fiy3,ty);
1813             fiz3             = _mm256_add_ps(fiz3,tz);
1814
1815             fjx1             = _mm256_add_ps(fjx1,tx);
1816             fjy1             = _mm256_add_ps(fjy1,ty);
1817             fjz1             = _mm256_add_ps(fjz1,tz);
1818
1819             }
1820
1821             /**************************
1822              * CALCULATE INTERACTIONS *
1823              **************************/
1824
1825             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1826             {
1827
1828             /* REACTION-FIELD ELECTROSTATICS */
1829             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1830
1831             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1832
1833             fscal            = felec;
1834
1835             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1836
1837             /* Calculate temporary vectorial force */
1838             tx               = _mm256_mul_ps(fscal,dx32);
1839             ty               = _mm256_mul_ps(fscal,dy32);
1840             tz               = _mm256_mul_ps(fscal,dz32);
1841
1842             /* Update vectorial force */
1843             fix3             = _mm256_add_ps(fix3,tx);
1844             fiy3             = _mm256_add_ps(fiy3,ty);
1845             fiz3             = _mm256_add_ps(fiz3,tz);
1846
1847             fjx2             = _mm256_add_ps(fjx2,tx);
1848             fjy2             = _mm256_add_ps(fjy2,ty);
1849             fjz2             = _mm256_add_ps(fjz2,tz);
1850
1851             }
1852
1853             /**************************
1854              * CALCULATE INTERACTIONS *
1855              **************************/
1856
1857             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1858             {
1859
1860             /* REACTION-FIELD ELECTROSTATICS */
1861             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1862
1863             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1864
1865             fscal            = felec;
1866
1867             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1868
1869             /* Calculate temporary vectorial force */
1870             tx               = _mm256_mul_ps(fscal,dx33);
1871             ty               = _mm256_mul_ps(fscal,dy33);
1872             tz               = _mm256_mul_ps(fscal,dz33);
1873
1874             /* Update vectorial force */
1875             fix3             = _mm256_add_ps(fix3,tx);
1876             fiy3             = _mm256_add_ps(fiy3,ty);
1877             fiz3             = _mm256_add_ps(fiz3,tz);
1878
1879             fjx3             = _mm256_add_ps(fjx3,tx);
1880             fjy3             = _mm256_add_ps(fjy3,ty);
1881             fjz3             = _mm256_add_ps(fjz3,tz);
1882
1883             }
1884
1885             fjptrA             = f+j_coord_offsetA;
1886             fjptrB             = f+j_coord_offsetB;
1887             fjptrC             = f+j_coord_offsetC;
1888             fjptrD             = f+j_coord_offsetD;
1889             fjptrE             = f+j_coord_offsetE;
1890             fjptrF             = f+j_coord_offsetF;
1891             fjptrG             = f+j_coord_offsetG;
1892             fjptrH             = f+j_coord_offsetH;
1893
1894             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1895                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1896                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1897
1898             /* Inner loop uses 303 flops */
1899         }
1900
1901         if(jidx<j_index_end)
1902         {
1903
1904             /* Get j neighbor index, and coordinate index */
1905             jnrlistA         = jjnr[jidx];
1906             jnrlistB         = jjnr[jidx+1];
1907             jnrlistC         = jjnr[jidx+2];
1908             jnrlistD         = jjnr[jidx+3];
1909             jnrlistE         = jjnr[jidx+4];
1910             jnrlistF         = jjnr[jidx+5];
1911             jnrlistG         = jjnr[jidx+6];
1912             jnrlistH         = jjnr[jidx+7];
1913             /* Sign of each element will be negative for non-real atoms.
1914              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1915              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1916              */
1917             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1918                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1919                                             
1920             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1921             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1922             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1923             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1924             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1925             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1926             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1927             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1928             j_coord_offsetA  = DIM*jnrA;
1929             j_coord_offsetB  = DIM*jnrB;
1930             j_coord_offsetC  = DIM*jnrC;
1931             j_coord_offsetD  = DIM*jnrD;
1932             j_coord_offsetE  = DIM*jnrE;
1933             j_coord_offsetF  = DIM*jnrF;
1934             j_coord_offsetG  = DIM*jnrG;
1935             j_coord_offsetH  = DIM*jnrH;
1936
1937             /* load j atom coordinates */
1938             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1939                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1940                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1941                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1942                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1943                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1944
1945             /* Calculate displacement vector */
1946             dx00             = _mm256_sub_ps(ix0,jx0);
1947             dy00             = _mm256_sub_ps(iy0,jy0);
1948             dz00             = _mm256_sub_ps(iz0,jz0);
1949             dx11             = _mm256_sub_ps(ix1,jx1);
1950             dy11             = _mm256_sub_ps(iy1,jy1);
1951             dz11             = _mm256_sub_ps(iz1,jz1);
1952             dx12             = _mm256_sub_ps(ix1,jx2);
1953             dy12             = _mm256_sub_ps(iy1,jy2);
1954             dz12             = _mm256_sub_ps(iz1,jz2);
1955             dx13             = _mm256_sub_ps(ix1,jx3);
1956             dy13             = _mm256_sub_ps(iy1,jy3);
1957             dz13             = _mm256_sub_ps(iz1,jz3);
1958             dx21             = _mm256_sub_ps(ix2,jx1);
1959             dy21             = _mm256_sub_ps(iy2,jy1);
1960             dz21             = _mm256_sub_ps(iz2,jz1);
1961             dx22             = _mm256_sub_ps(ix2,jx2);
1962             dy22             = _mm256_sub_ps(iy2,jy2);
1963             dz22             = _mm256_sub_ps(iz2,jz2);
1964             dx23             = _mm256_sub_ps(ix2,jx3);
1965             dy23             = _mm256_sub_ps(iy2,jy3);
1966             dz23             = _mm256_sub_ps(iz2,jz3);
1967             dx31             = _mm256_sub_ps(ix3,jx1);
1968             dy31             = _mm256_sub_ps(iy3,jy1);
1969             dz31             = _mm256_sub_ps(iz3,jz1);
1970             dx32             = _mm256_sub_ps(ix3,jx2);
1971             dy32             = _mm256_sub_ps(iy3,jy2);
1972             dz32             = _mm256_sub_ps(iz3,jz2);
1973             dx33             = _mm256_sub_ps(ix3,jx3);
1974             dy33             = _mm256_sub_ps(iy3,jy3);
1975             dz33             = _mm256_sub_ps(iz3,jz3);
1976
1977             /* Calculate squared distance and things based on it */
1978             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1979             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1980             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1981             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1982             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1983             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1984             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1985             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1986             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1987             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1988
1989             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1990             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1991             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1992             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1993             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1994             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1995             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1996             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1997             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1998
1999             rinvsq00         = gmx_mm256_inv_ps(rsq00);
2000             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
2001             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
2002             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
2003             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
2004             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
2005             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
2006             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
2007             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
2008             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
2009
2010             fjx0             = _mm256_setzero_ps();
2011             fjy0             = _mm256_setzero_ps();
2012             fjz0             = _mm256_setzero_ps();
2013             fjx1             = _mm256_setzero_ps();
2014             fjy1             = _mm256_setzero_ps();
2015             fjz1             = _mm256_setzero_ps();
2016             fjx2             = _mm256_setzero_ps();
2017             fjy2             = _mm256_setzero_ps();
2018             fjz2             = _mm256_setzero_ps();
2019             fjx3             = _mm256_setzero_ps();
2020             fjy3             = _mm256_setzero_ps();
2021             fjz3             = _mm256_setzero_ps();
2022
2023             /**************************
2024              * CALCULATE INTERACTIONS *
2025              **************************/
2026
2027             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2028             {
2029
2030             /* LENNARD-JONES DISPERSION/REPULSION */
2031
2032             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2033             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
2034
2035             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2036
2037             fscal            = fvdw;
2038
2039             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2040
2041             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2042
2043             /* Calculate temporary vectorial force */
2044             tx               = _mm256_mul_ps(fscal,dx00);
2045             ty               = _mm256_mul_ps(fscal,dy00);
2046             tz               = _mm256_mul_ps(fscal,dz00);
2047
2048             /* Update vectorial force */
2049             fix0             = _mm256_add_ps(fix0,tx);
2050             fiy0             = _mm256_add_ps(fiy0,ty);
2051             fiz0             = _mm256_add_ps(fiz0,tz);
2052
2053             fjx0             = _mm256_add_ps(fjx0,tx);
2054             fjy0             = _mm256_add_ps(fjy0,ty);
2055             fjz0             = _mm256_add_ps(fjz0,tz);
2056
2057             }
2058
2059             /**************************
2060              * CALCULATE INTERACTIONS *
2061              **************************/
2062
2063             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2064             {
2065
2066             /* REACTION-FIELD ELECTROSTATICS */
2067             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
2068
2069             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2070
2071             fscal            = felec;
2072
2073             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2074
2075             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2076
2077             /* Calculate temporary vectorial force */
2078             tx               = _mm256_mul_ps(fscal,dx11);
2079             ty               = _mm256_mul_ps(fscal,dy11);
2080             tz               = _mm256_mul_ps(fscal,dz11);
2081
2082             /* Update vectorial force */
2083             fix1             = _mm256_add_ps(fix1,tx);
2084             fiy1             = _mm256_add_ps(fiy1,ty);
2085             fiz1             = _mm256_add_ps(fiz1,tz);
2086
2087             fjx1             = _mm256_add_ps(fjx1,tx);
2088             fjy1             = _mm256_add_ps(fjy1,ty);
2089             fjz1             = _mm256_add_ps(fjz1,tz);
2090
2091             }
2092
2093             /**************************
2094              * CALCULATE INTERACTIONS *
2095              **************************/
2096
2097             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2098             {
2099
2100             /* REACTION-FIELD ELECTROSTATICS */
2101             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
2102
2103             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2104
2105             fscal            = felec;
2106
2107             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2108
2109             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2110
2111             /* Calculate temporary vectorial force */
2112             tx               = _mm256_mul_ps(fscal,dx12);
2113             ty               = _mm256_mul_ps(fscal,dy12);
2114             tz               = _mm256_mul_ps(fscal,dz12);
2115
2116             /* Update vectorial force */
2117             fix1             = _mm256_add_ps(fix1,tx);
2118             fiy1             = _mm256_add_ps(fiy1,ty);
2119             fiz1             = _mm256_add_ps(fiz1,tz);
2120
2121             fjx2             = _mm256_add_ps(fjx2,tx);
2122             fjy2             = _mm256_add_ps(fjy2,ty);
2123             fjz2             = _mm256_add_ps(fjz2,tz);
2124
2125             }
2126
2127             /**************************
2128              * CALCULATE INTERACTIONS *
2129              **************************/
2130
2131             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2132             {
2133
2134             /* REACTION-FIELD ELECTROSTATICS */
2135             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
2136
2137             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2138
2139             fscal            = felec;
2140
2141             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2142
2143             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2144
2145             /* Calculate temporary vectorial force */
2146             tx               = _mm256_mul_ps(fscal,dx13);
2147             ty               = _mm256_mul_ps(fscal,dy13);
2148             tz               = _mm256_mul_ps(fscal,dz13);
2149
2150             /* Update vectorial force */
2151             fix1             = _mm256_add_ps(fix1,tx);
2152             fiy1             = _mm256_add_ps(fiy1,ty);
2153             fiz1             = _mm256_add_ps(fiz1,tz);
2154
2155             fjx3             = _mm256_add_ps(fjx3,tx);
2156             fjy3             = _mm256_add_ps(fjy3,ty);
2157             fjz3             = _mm256_add_ps(fjz3,tz);
2158
2159             }
2160
2161             /**************************
2162              * CALCULATE INTERACTIONS *
2163              **************************/
2164
2165             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2166             {
2167
2168             /* REACTION-FIELD ELECTROSTATICS */
2169             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
2170
2171             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2172
2173             fscal            = felec;
2174
2175             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2176
2177             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2178
2179             /* Calculate temporary vectorial force */
2180             tx               = _mm256_mul_ps(fscal,dx21);
2181             ty               = _mm256_mul_ps(fscal,dy21);
2182             tz               = _mm256_mul_ps(fscal,dz21);
2183
2184             /* Update vectorial force */
2185             fix2             = _mm256_add_ps(fix2,tx);
2186             fiy2             = _mm256_add_ps(fiy2,ty);
2187             fiz2             = _mm256_add_ps(fiz2,tz);
2188
2189             fjx1             = _mm256_add_ps(fjx1,tx);
2190             fjy1             = _mm256_add_ps(fjy1,ty);
2191             fjz1             = _mm256_add_ps(fjz1,tz);
2192
2193             }
2194
2195             /**************************
2196              * CALCULATE INTERACTIONS *
2197              **************************/
2198
2199             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2200             {
2201
2202             /* REACTION-FIELD ELECTROSTATICS */
2203             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
2204
2205             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2206
2207             fscal            = felec;
2208
2209             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2210
2211             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2212
2213             /* Calculate temporary vectorial force */
2214             tx               = _mm256_mul_ps(fscal,dx22);
2215             ty               = _mm256_mul_ps(fscal,dy22);
2216             tz               = _mm256_mul_ps(fscal,dz22);
2217
2218             /* Update vectorial force */
2219             fix2             = _mm256_add_ps(fix2,tx);
2220             fiy2             = _mm256_add_ps(fiy2,ty);
2221             fiz2             = _mm256_add_ps(fiz2,tz);
2222
2223             fjx2             = _mm256_add_ps(fjx2,tx);
2224             fjy2             = _mm256_add_ps(fjy2,ty);
2225             fjz2             = _mm256_add_ps(fjz2,tz);
2226
2227             }
2228
2229             /**************************
2230              * CALCULATE INTERACTIONS *
2231              **************************/
2232
2233             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2234             {
2235
2236             /* REACTION-FIELD ELECTROSTATICS */
2237             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
2238
2239             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2240
2241             fscal            = felec;
2242
2243             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2244
2245             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2246
2247             /* Calculate temporary vectorial force */
2248             tx               = _mm256_mul_ps(fscal,dx23);
2249             ty               = _mm256_mul_ps(fscal,dy23);
2250             tz               = _mm256_mul_ps(fscal,dz23);
2251
2252             /* Update vectorial force */
2253             fix2             = _mm256_add_ps(fix2,tx);
2254             fiy2             = _mm256_add_ps(fiy2,ty);
2255             fiz2             = _mm256_add_ps(fiz2,tz);
2256
2257             fjx3             = _mm256_add_ps(fjx3,tx);
2258             fjy3             = _mm256_add_ps(fjy3,ty);
2259             fjz3             = _mm256_add_ps(fjz3,tz);
2260
2261             }
2262
2263             /**************************
2264              * CALCULATE INTERACTIONS *
2265              **************************/
2266
2267             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2268             {
2269
2270             /* REACTION-FIELD ELECTROSTATICS */
2271             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
2272
2273             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2274
2275             fscal            = felec;
2276
2277             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2278
2279             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2280
2281             /* Calculate temporary vectorial force */
2282             tx               = _mm256_mul_ps(fscal,dx31);
2283             ty               = _mm256_mul_ps(fscal,dy31);
2284             tz               = _mm256_mul_ps(fscal,dz31);
2285
2286             /* Update vectorial force */
2287             fix3             = _mm256_add_ps(fix3,tx);
2288             fiy3             = _mm256_add_ps(fiy3,ty);
2289             fiz3             = _mm256_add_ps(fiz3,tz);
2290
2291             fjx1             = _mm256_add_ps(fjx1,tx);
2292             fjy1             = _mm256_add_ps(fjy1,ty);
2293             fjz1             = _mm256_add_ps(fjz1,tz);
2294
2295             }
2296
2297             /**************************
2298              * CALCULATE INTERACTIONS *
2299              **************************/
2300
2301             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2302             {
2303
2304             /* REACTION-FIELD ELECTROSTATICS */
2305             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
2306
2307             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2308
2309             fscal            = felec;
2310
2311             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2312
2313             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2314
2315             /* Calculate temporary vectorial force */
2316             tx               = _mm256_mul_ps(fscal,dx32);
2317             ty               = _mm256_mul_ps(fscal,dy32);
2318             tz               = _mm256_mul_ps(fscal,dz32);
2319
2320             /* Update vectorial force */
2321             fix3             = _mm256_add_ps(fix3,tx);
2322             fiy3             = _mm256_add_ps(fiy3,ty);
2323             fiz3             = _mm256_add_ps(fiz3,tz);
2324
2325             fjx2             = _mm256_add_ps(fjx2,tx);
2326             fjy2             = _mm256_add_ps(fjy2,ty);
2327             fjz2             = _mm256_add_ps(fjz2,tz);
2328
2329             }
2330
2331             /**************************
2332              * CALCULATE INTERACTIONS *
2333              **************************/
2334
2335             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2336             {
2337
2338             /* REACTION-FIELD ELECTROSTATICS */
2339             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
2340
2341             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2342
2343             fscal            = felec;
2344
2345             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2346
2347             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2348
2349             /* Calculate temporary vectorial force */
2350             tx               = _mm256_mul_ps(fscal,dx33);
2351             ty               = _mm256_mul_ps(fscal,dy33);
2352             tz               = _mm256_mul_ps(fscal,dz33);
2353
2354             /* Update vectorial force */
2355             fix3             = _mm256_add_ps(fix3,tx);
2356             fiy3             = _mm256_add_ps(fiy3,ty);
2357             fiz3             = _mm256_add_ps(fiz3,tz);
2358
2359             fjx3             = _mm256_add_ps(fjx3,tx);
2360             fjy3             = _mm256_add_ps(fjy3,ty);
2361             fjz3             = _mm256_add_ps(fjz3,tz);
2362
2363             }
2364
2365             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2366             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2367             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2368             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2369             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2370             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2371             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2372             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2373
2374             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2375                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2376                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2377
2378             /* Inner loop uses 303 flops */
2379         }
2380
2381         /* End of innermost loop */
2382
2383         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2384                                                  f+i_coord_offset,fshift+i_shift_offset);
2385
2386         /* Increment number of inner iterations */
2387         inneriter                  += j_index_end - j_index_start;
2388
2389         /* Outer loop uses 24 flops */
2390     }
2391
2392     /* Increment number of outer iterations */
2393     outeriter        += nri;
2394
2395     /* Update outer/inner flops */
2396
2397     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
2398 }