23f2fb891547683c76d8ad22a17f9d19b9405979
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecRF_VdwCSTab_GeomW4W4_avx_256_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_256_single
38  * Electrostatics interaction: ReactionField
39  * VdW interaction:            CubicSplineTable
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_256_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. the eight different
56      * jnr indices whose data is put in the eight positions of the 256-bit SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrE,jnrF,jnrG,jnrH;
62     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
67     real             rcutoff_scalar;
68     real             *shiftvec,*fshift,*x,*f;
69     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
70     real             scratch[4*DIM];
71     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72     real *           vdwioffsetptr0;
73     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74     real *           vdwioffsetptr1;
75     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76     real *           vdwioffsetptr2;
77     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78     real *           vdwioffsetptr3;
79     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
80     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
81     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
82     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
83     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
84     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
85     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
86     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
87     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
88     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
89     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
90     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
91     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
92     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
93     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
94     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
95     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
96     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
97     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
98     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
99     real             *charge;
100     int              nvdwtype;
101     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102     int              *vdwtype;
103     real             *vdwparam;
104     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
105     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
106     __m256i          vfitab;
107     __m128i          vfitab_lo,vfitab_hi;
108     __m128i          ifour       = _mm_set1_epi32(4);
109     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
110     real             *vftab;
111     __m256           dummy_mask,cutoff_mask;
112     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
113     __m256           one     = _mm256_set1_ps(1.0);
114     __m256           two     = _mm256_set1_ps(2.0);
115     x                = xx[0];
116     f                = ff[0];
117
118     nri              = nlist->nri;
119     iinr             = nlist->iinr;
120     jindex           = nlist->jindex;
121     jjnr             = nlist->jjnr;
122     shiftidx         = nlist->shift;
123     gid              = nlist->gid;
124     shiftvec         = fr->shift_vec[0];
125     fshift           = fr->fshift[0];
126     facel            = _mm256_set1_ps(fr->epsfac);
127     charge           = mdatoms->chargeA;
128     krf              = _mm256_set1_ps(fr->ic->k_rf);
129     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
130     crf              = _mm256_set1_ps(fr->ic->c_rf);
131     nvdwtype         = fr->ntype;
132     vdwparam         = fr->nbfp;
133     vdwtype          = mdatoms->typeA;
134
135     vftab            = kernel_data->table_vdw->data;
136     vftabscale       = _mm256_set1_ps(kernel_data->table_vdw->scale);
137
138     /* Setup water-specific parameters */
139     inr              = nlist->iinr[0];
140     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
141     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
142     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
143     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
144
145     jq1              = _mm256_set1_ps(charge[inr+1]);
146     jq2              = _mm256_set1_ps(charge[inr+2]);
147     jq3              = _mm256_set1_ps(charge[inr+3]);
148     vdwjidx0A        = 2*vdwtype[inr+0];
149     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
150     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
151     qq11             = _mm256_mul_ps(iq1,jq1);
152     qq12             = _mm256_mul_ps(iq1,jq2);
153     qq13             = _mm256_mul_ps(iq1,jq3);
154     qq21             = _mm256_mul_ps(iq2,jq1);
155     qq22             = _mm256_mul_ps(iq2,jq2);
156     qq23             = _mm256_mul_ps(iq2,jq3);
157     qq31             = _mm256_mul_ps(iq3,jq1);
158     qq32             = _mm256_mul_ps(iq3,jq2);
159     qq33             = _mm256_mul_ps(iq3,jq3);
160
161     /* Avoid stupid compiler warnings */
162     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
163     j_coord_offsetA = 0;
164     j_coord_offsetB = 0;
165     j_coord_offsetC = 0;
166     j_coord_offsetD = 0;
167     j_coord_offsetE = 0;
168     j_coord_offsetF = 0;
169     j_coord_offsetG = 0;
170     j_coord_offsetH = 0;
171
172     outeriter        = 0;
173     inneriter        = 0;
174
175     for(iidx=0;iidx<4*DIM;iidx++)
176     {
177         scratch[iidx] = 0.0;
178     }
179
180     /* Start outer loop over neighborlists */
181     for(iidx=0; iidx<nri; iidx++)
182     {
183         /* Load shift vector for this list */
184         i_shift_offset   = DIM*shiftidx[iidx];
185
186         /* Load limits for loop over neighbors */
187         j_index_start    = jindex[iidx];
188         j_index_end      = jindex[iidx+1];
189
190         /* Get outer coordinate index */
191         inr              = iinr[iidx];
192         i_coord_offset   = DIM*inr;
193
194         /* Load i particle coords and add shift vector */
195         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
196                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
197
198         fix0             = _mm256_setzero_ps();
199         fiy0             = _mm256_setzero_ps();
200         fiz0             = _mm256_setzero_ps();
201         fix1             = _mm256_setzero_ps();
202         fiy1             = _mm256_setzero_ps();
203         fiz1             = _mm256_setzero_ps();
204         fix2             = _mm256_setzero_ps();
205         fiy2             = _mm256_setzero_ps();
206         fiz2             = _mm256_setzero_ps();
207         fix3             = _mm256_setzero_ps();
208         fiy3             = _mm256_setzero_ps();
209         fiz3             = _mm256_setzero_ps();
210
211         /* Reset potential sums */
212         velecsum         = _mm256_setzero_ps();
213         vvdwsum          = _mm256_setzero_ps();
214
215         /* Start inner kernel loop */
216         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
217         {
218
219             /* Get j neighbor index, and coordinate index */
220             jnrA             = jjnr[jidx];
221             jnrB             = jjnr[jidx+1];
222             jnrC             = jjnr[jidx+2];
223             jnrD             = jjnr[jidx+3];
224             jnrE             = jjnr[jidx+4];
225             jnrF             = jjnr[jidx+5];
226             jnrG             = jjnr[jidx+6];
227             jnrH             = jjnr[jidx+7];
228             j_coord_offsetA  = DIM*jnrA;
229             j_coord_offsetB  = DIM*jnrB;
230             j_coord_offsetC  = DIM*jnrC;
231             j_coord_offsetD  = DIM*jnrD;
232             j_coord_offsetE  = DIM*jnrE;
233             j_coord_offsetF  = DIM*jnrF;
234             j_coord_offsetG  = DIM*jnrG;
235             j_coord_offsetH  = DIM*jnrH;
236
237             /* load j atom coordinates */
238             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
239                                                  x+j_coord_offsetC,x+j_coord_offsetD,
240                                                  x+j_coord_offsetE,x+j_coord_offsetF,
241                                                  x+j_coord_offsetG,x+j_coord_offsetH,
242                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
243                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
244
245             /* Calculate displacement vector */
246             dx00             = _mm256_sub_ps(ix0,jx0);
247             dy00             = _mm256_sub_ps(iy0,jy0);
248             dz00             = _mm256_sub_ps(iz0,jz0);
249             dx11             = _mm256_sub_ps(ix1,jx1);
250             dy11             = _mm256_sub_ps(iy1,jy1);
251             dz11             = _mm256_sub_ps(iz1,jz1);
252             dx12             = _mm256_sub_ps(ix1,jx2);
253             dy12             = _mm256_sub_ps(iy1,jy2);
254             dz12             = _mm256_sub_ps(iz1,jz2);
255             dx13             = _mm256_sub_ps(ix1,jx3);
256             dy13             = _mm256_sub_ps(iy1,jy3);
257             dz13             = _mm256_sub_ps(iz1,jz3);
258             dx21             = _mm256_sub_ps(ix2,jx1);
259             dy21             = _mm256_sub_ps(iy2,jy1);
260             dz21             = _mm256_sub_ps(iz2,jz1);
261             dx22             = _mm256_sub_ps(ix2,jx2);
262             dy22             = _mm256_sub_ps(iy2,jy2);
263             dz22             = _mm256_sub_ps(iz2,jz2);
264             dx23             = _mm256_sub_ps(ix2,jx3);
265             dy23             = _mm256_sub_ps(iy2,jy3);
266             dz23             = _mm256_sub_ps(iz2,jz3);
267             dx31             = _mm256_sub_ps(ix3,jx1);
268             dy31             = _mm256_sub_ps(iy3,jy1);
269             dz31             = _mm256_sub_ps(iz3,jz1);
270             dx32             = _mm256_sub_ps(ix3,jx2);
271             dy32             = _mm256_sub_ps(iy3,jy2);
272             dz32             = _mm256_sub_ps(iz3,jz2);
273             dx33             = _mm256_sub_ps(ix3,jx3);
274             dy33             = _mm256_sub_ps(iy3,jy3);
275             dz33             = _mm256_sub_ps(iz3,jz3);
276
277             /* Calculate squared distance and things based on it */
278             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
279             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
280             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
281             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
282             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
283             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
284             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
285             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
286             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
287             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
288
289             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
290             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
291             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
292             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
293             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
294             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
295             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
296             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
297             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
298             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
299
300             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
301             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
302             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
303             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
304             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
305             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
306             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
307             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
308             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
309
310             fjx0             = _mm256_setzero_ps();
311             fjy0             = _mm256_setzero_ps();
312             fjz0             = _mm256_setzero_ps();
313             fjx1             = _mm256_setzero_ps();
314             fjy1             = _mm256_setzero_ps();
315             fjz1             = _mm256_setzero_ps();
316             fjx2             = _mm256_setzero_ps();
317             fjy2             = _mm256_setzero_ps();
318             fjz2             = _mm256_setzero_ps();
319             fjx3             = _mm256_setzero_ps();
320             fjy3             = _mm256_setzero_ps();
321             fjz3             = _mm256_setzero_ps();
322
323             /**************************
324              * CALCULATE INTERACTIONS *
325              **************************/
326
327             r00              = _mm256_mul_ps(rsq00,rinv00);
328
329             /* Calculate table index by multiplying r with table scale and truncate to integer */
330             rt               = _mm256_mul_ps(r00,vftabscale);
331             vfitab           = _mm256_cvttps_epi32(rt);
332             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
333             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
334             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
335             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
336             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
337             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
338
339             /* CUBIC SPLINE TABLE DISPERSION */
340             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
341                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
342             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
343                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
344             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
345                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
346             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
347                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
348             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
349             Heps             = _mm256_mul_ps(vfeps,H);
350             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
351             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
352             vvdw6            = _mm256_mul_ps(c6_00,VV);
353             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
354             fvdw6            = _mm256_mul_ps(c6_00,FF);
355
356             /* CUBIC SPLINE TABLE REPULSION */
357             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
358             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
359             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
360                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
361             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
362                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
363             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
364                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
365             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
366                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
367             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
368             Heps             = _mm256_mul_ps(vfeps,H);
369             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
370             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
371             vvdw12           = _mm256_mul_ps(c12_00,VV);
372             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
373             fvdw12           = _mm256_mul_ps(c12_00,FF);
374             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
375             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
376
377             /* Update potential sum for this i atom from the interaction with this j atom. */
378             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
379
380             fscal            = fvdw;
381
382             /* Calculate temporary vectorial force */
383             tx               = _mm256_mul_ps(fscal,dx00);
384             ty               = _mm256_mul_ps(fscal,dy00);
385             tz               = _mm256_mul_ps(fscal,dz00);
386
387             /* Update vectorial force */
388             fix0             = _mm256_add_ps(fix0,tx);
389             fiy0             = _mm256_add_ps(fiy0,ty);
390             fiz0             = _mm256_add_ps(fiz0,tz);
391
392             fjx0             = _mm256_add_ps(fjx0,tx);
393             fjy0             = _mm256_add_ps(fjy0,ty);
394             fjz0             = _mm256_add_ps(fjz0,tz);
395
396             /**************************
397              * CALCULATE INTERACTIONS *
398              **************************/
399
400             /* REACTION-FIELD ELECTROSTATICS */
401             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
402             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
403
404             /* Update potential sum for this i atom from the interaction with this j atom. */
405             velecsum         = _mm256_add_ps(velecsum,velec);
406
407             fscal            = felec;
408
409             /* Calculate temporary vectorial force */
410             tx               = _mm256_mul_ps(fscal,dx11);
411             ty               = _mm256_mul_ps(fscal,dy11);
412             tz               = _mm256_mul_ps(fscal,dz11);
413
414             /* Update vectorial force */
415             fix1             = _mm256_add_ps(fix1,tx);
416             fiy1             = _mm256_add_ps(fiy1,ty);
417             fiz1             = _mm256_add_ps(fiz1,tz);
418
419             fjx1             = _mm256_add_ps(fjx1,tx);
420             fjy1             = _mm256_add_ps(fjy1,ty);
421             fjz1             = _mm256_add_ps(fjz1,tz);
422
423             /**************************
424              * CALCULATE INTERACTIONS *
425              **************************/
426
427             /* REACTION-FIELD ELECTROSTATICS */
428             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
429             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
430
431             /* Update potential sum for this i atom from the interaction with this j atom. */
432             velecsum         = _mm256_add_ps(velecsum,velec);
433
434             fscal            = felec;
435
436             /* Calculate temporary vectorial force */
437             tx               = _mm256_mul_ps(fscal,dx12);
438             ty               = _mm256_mul_ps(fscal,dy12);
439             tz               = _mm256_mul_ps(fscal,dz12);
440
441             /* Update vectorial force */
442             fix1             = _mm256_add_ps(fix1,tx);
443             fiy1             = _mm256_add_ps(fiy1,ty);
444             fiz1             = _mm256_add_ps(fiz1,tz);
445
446             fjx2             = _mm256_add_ps(fjx2,tx);
447             fjy2             = _mm256_add_ps(fjy2,ty);
448             fjz2             = _mm256_add_ps(fjz2,tz);
449
450             /**************************
451              * CALCULATE INTERACTIONS *
452              **************************/
453
454             /* REACTION-FIELD ELECTROSTATICS */
455             velec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
456             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
457
458             /* Update potential sum for this i atom from the interaction with this j atom. */
459             velecsum         = _mm256_add_ps(velecsum,velec);
460
461             fscal            = felec;
462
463             /* Calculate temporary vectorial force */
464             tx               = _mm256_mul_ps(fscal,dx13);
465             ty               = _mm256_mul_ps(fscal,dy13);
466             tz               = _mm256_mul_ps(fscal,dz13);
467
468             /* Update vectorial force */
469             fix1             = _mm256_add_ps(fix1,tx);
470             fiy1             = _mm256_add_ps(fiy1,ty);
471             fiz1             = _mm256_add_ps(fiz1,tz);
472
473             fjx3             = _mm256_add_ps(fjx3,tx);
474             fjy3             = _mm256_add_ps(fjy3,ty);
475             fjz3             = _mm256_add_ps(fjz3,tz);
476
477             /**************************
478              * CALCULATE INTERACTIONS *
479              **************************/
480
481             /* REACTION-FIELD ELECTROSTATICS */
482             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
483             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
484
485             /* Update potential sum for this i atom from the interaction with this j atom. */
486             velecsum         = _mm256_add_ps(velecsum,velec);
487
488             fscal            = felec;
489
490             /* Calculate temporary vectorial force */
491             tx               = _mm256_mul_ps(fscal,dx21);
492             ty               = _mm256_mul_ps(fscal,dy21);
493             tz               = _mm256_mul_ps(fscal,dz21);
494
495             /* Update vectorial force */
496             fix2             = _mm256_add_ps(fix2,tx);
497             fiy2             = _mm256_add_ps(fiy2,ty);
498             fiz2             = _mm256_add_ps(fiz2,tz);
499
500             fjx1             = _mm256_add_ps(fjx1,tx);
501             fjy1             = _mm256_add_ps(fjy1,ty);
502             fjz1             = _mm256_add_ps(fjz1,tz);
503
504             /**************************
505              * CALCULATE INTERACTIONS *
506              **************************/
507
508             /* REACTION-FIELD ELECTROSTATICS */
509             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
510             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
511
512             /* Update potential sum for this i atom from the interaction with this j atom. */
513             velecsum         = _mm256_add_ps(velecsum,velec);
514
515             fscal            = felec;
516
517             /* Calculate temporary vectorial force */
518             tx               = _mm256_mul_ps(fscal,dx22);
519             ty               = _mm256_mul_ps(fscal,dy22);
520             tz               = _mm256_mul_ps(fscal,dz22);
521
522             /* Update vectorial force */
523             fix2             = _mm256_add_ps(fix2,tx);
524             fiy2             = _mm256_add_ps(fiy2,ty);
525             fiz2             = _mm256_add_ps(fiz2,tz);
526
527             fjx2             = _mm256_add_ps(fjx2,tx);
528             fjy2             = _mm256_add_ps(fjy2,ty);
529             fjz2             = _mm256_add_ps(fjz2,tz);
530
531             /**************************
532              * CALCULATE INTERACTIONS *
533              **************************/
534
535             /* REACTION-FIELD ELECTROSTATICS */
536             velec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
537             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
538
539             /* Update potential sum for this i atom from the interaction with this j atom. */
540             velecsum         = _mm256_add_ps(velecsum,velec);
541
542             fscal            = felec;
543
544             /* Calculate temporary vectorial force */
545             tx               = _mm256_mul_ps(fscal,dx23);
546             ty               = _mm256_mul_ps(fscal,dy23);
547             tz               = _mm256_mul_ps(fscal,dz23);
548
549             /* Update vectorial force */
550             fix2             = _mm256_add_ps(fix2,tx);
551             fiy2             = _mm256_add_ps(fiy2,ty);
552             fiz2             = _mm256_add_ps(fiz2,tz);
553
554             fjx3             = _mm256_add_ps(fjx3,tx);
555             fjy3             = _mm256_add_ps(fjy3,ty);
556             fjz3             = _mm256_add_ps(fjz3,tz);
557
558             /**************************
559              * CALCULATE INTERACTIONS *
560              **************************/
561
562             /* REACTION-FIELD ELECTROSTATICS */
563             velec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
564             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
565
566             /* Update potential sum for this i atom from the interaction with this j atom. */
567             velecsum         = _mm256_add_ps(velecsum,velec);
568
569             fscal            = felec;
570
571             /* Calculate temporary vectorial force */
572             tx               = _mm256_mul_ps(fscal,dx31);
573             ty               = _mm256_mul_ps(fscal,dy31);
574             tz               = _mm256_mul_ps(fscal,dz31);
575
576             /* Update vectorial force */
577             fix3             = _mm256_add_ps(fix3,tx);
578             fiy3             = _mm256_add_ps(fiy3,ty);
579             fiz3             = _mm256_add_ps(fiz3,tz);
580
581             fjx1             = _mm256_add_ps(fjx1,tx);
582             fjy1             = _mm256_add_ps(fjy1,ty);
583             fjz1             = _mm256_add_ps(fjz1,tz);
584
585             /**************************
586              * CALCULATE INTERACTIONS *
587              **************************/
588
589             /* REACTION-FIELD ELECTROSTATICS */
590             velec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
591             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
592
593             /* Update potential sum for this i atom from the interaction with this j atom. */
594             velecsum         = _mm256_add_ps(velecsum,velec);
595
596             fscal            = felec;
597
598             /* Calculate temporary vectorial force */
599             tx               = _mm256_mul_ps(fscal,dx32);
600             ty               = _mm256_mul_ps(fscal,dy32);
601             tz               = _mm256_mul_ps(fscal,dz32);
602
603             /* Update vectorial force */
604             fix3             = _mm256_add_ps(fix3,tx);
605             fiy3             = _mm256_add_ps(fiy3,ty);
606             fiz3             = _mm256_add_ps(fiz3,tz);
607
608             fjx2             = _mm256_add_ps(fjx2,tx);
609             fjy2             = _mm256_add_ps(fjy2,ty);
610             fjz2             = _mm256_add_ps(fjz2,tz);
611
612             /**************************
613              * CALCULATE INTERACTIONS *
614              **************************/
615
616             /* REACTION-FIELD ELECTROSTATICS */
617             velec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
618             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
619
620             /* Update potential sum for this i atom from the interaction with this j atom. */
621             velecsum         = _mm256_add_ps(velecsum,velec);
622
623             fscal            = felec;
624
625             /* Calculate temporary vectorial force */
626             tx               = _mm256_mul_ps(fscal,dx33);
627             ty               = _mm256_mul_ps(fscal,dy33);
628             tz               = _mm256_mul_ps(fscal,dz33);
629
630             /* Update vectorial force */
631             fix3             = _mm256_add_ps(fix3,tx);
632             fiy3             = _mm256_add_ps(fiy3,ty);
633             fiz3             = _mm256_add_ps(fiz3,tz);
634
635             fjx3             = _mm256_add_ps(fjx3,tx);
636             fjy3             = _mm256_add_ps(fjy3,ty);
637             fjz3             = _mm256_add_ps(fjz3,tz);
638
639             fjptrA             = f+j_coord_offsetA;
640             fjptrB             = f+j_coord_offsetB;
641             fjptrC             = f+j_coord_offsetC;
642             fjptrD             = f+j_coord_offsetD;
643             fjptrE             = f+j_coord_offsetE;
644             fjptrF             = f+j_coord_offsetF;
645             fjptrG             = f+j_coord_offsetG;
646             fjptrH             = f+j_coord_offsetH;
647
648             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
649                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
650                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
651
652             /* Inner loop uses 347 flops */
653         }
654
655         if(jidx<j_index_end)
656         {
657
658             /* Get j neighbor index, and coordinate index */
659             jnrlistA         = jjnr[jidx];
660             jnrlistB         = jjnr[jidx+1];
661             jnrlistC         = jjnr[jidx+2];
662             jnrlistD         = jjnr[jidx+3];
663             jnrlistE         = jjnr[jidx+4];
664             jnrlistF         = jjnr[jidx+5];
665             jnrlistG         = jjnr[jidx+6];
666             jnrlistH         = jjnr[jidx+7];
667             /* Sign of each element will be negative for non-real atoms.
668              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
669              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
670              */
671             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
672                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
673                                             
674             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
675             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
676             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
677             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
678             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
679             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
680             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
681             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
682             j_coord_offsetA  = DIM*jnrA;
683             j_coord_offsetB  = DIM*jnrB;
684             j_coord_offsetC  = DIM*jnrC;
685             j_coord_offsetD  = DIM*jnrD;
686             j_coord_offsetE  = DIM*jnrE;
687             j_coord_offsetF  = DIM*jnrF;
688             j_coord_offsetG  = DIM*jnrG;
689             j_coord_offsetH  = DIM*jnrH;
690
691             /* load j atom coordinates */
692             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
693                                                  x+j_coord_offsetC,x+j_coord_offsetD,
694                                                  x+j_coord_offsetE,x+j_coord_offsetF,
695                                                  x+j_coord_offsetG,x+j_coord_offsetH,
696                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
697                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
698
699             /* Calculate displacement vector */
700             dx00             = _mm256_sub_ps(ix0,jx0);
701             dy00             = _mm256_sub_ps(iy0,jy0);
702             dz00             = _mm256_sub_ps(iz0,jz0);
703             dx11             = _mm256_sub_ps(ix1,jx1);
704             dy11             = _mm256_sub_ps(iy1,jy1);
705             dz11             = _mm256_sub_ps(iz1,jz1);
706             dx12             = _mm256_sub_ps(ix1,jx2);
707             dy12             = _mm256_sub_ps(iy1,jy2);
708             dz12             = _mm256_sub_ps(iz1,jz2);
709             dx13             = _mm256_sub_ps(ix1,jx3);
710             dy13             = _mm256_sub_ps(iy1,jy3);
711             dz13             = _mm256_sub_ps(iz1,jz3);
712             dx21             = _mm256_sub_ps(ix2,jx1);
713             dy21             = _mm256_sub_ps(iy2,jy1);
714             dz21             = _mm256_sub_ps(iz2,jz1);
715             dx22             = _mm256_sub_ps(ix2,jx2);
716             dy22             = _mm256_sub_ps(iy2,jy2);
717             dz22             = _mm256_sub_ps(iz2,jz2);
718             dx23             = _mm256_sub_ps(ix2,jx3);
719             dy23             = _mm256_sub_ps(iy2,jy3);
720             dz23             = _mm256_sub_ps(iz2,jz3);
721             dx31             = _mm256_sub_ps(ix3,jx1);
722             dy31             = _mm256_sub_ps(iy3,jy1);
723             dz31             = _mm256_sub_ps(iz3,jz1);
724             dx32             = _mm256_sub_ps(ix3,jx2);
725             dy32             = _mm256_sub_ps(iy3,jy2);
726             dz32             = _mm256_sub_ps(iz3,jz2);
727             dx33             = _mm256_sub_ps(ix3,jx3);
728             dy33             = _mm256_sub_ps(iy3,jy3);
729             dz33             = _mm256_sub_ps(iz3,jz3);
730
731             /* Calculate squared distance and things based on it */
732             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
733             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
734             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
735             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
736             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
737             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
738             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
739             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
740             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
741             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
742
743             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
744             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
745             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
746             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
747             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
748             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
749             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
750             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
751             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
752             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
753
754             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
755             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
756             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
757             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
758             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
759             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
760             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
761             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
762             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
763
764             fjx0             = _mm256_setzero_ps();
765             fjy0             = _mm256_setzero_ps();
766             fjz0             = _mm256_setzero_ps();
767             fjx1             = _mm256_setzero_ps();
768             fjy1             = _mm256_setzero_ps();
769             fjz1             = _mm256_setzero_ps();
770             fjx2             = _mm256_setzero_ps();
771             fjy2             = _mm256_setzero_ps();
772             fjz2             = _mm256_setzero_ps();
773             fjx3             = _mm256_setzero_ps();
774             fjy3             = _mm256_setzero_ps();
775             fjz3             = _mm256_setzero_ps();
776
777             /**************************
778              * CALCULATE INTERACTIONS *
779              **************************/
780
781             r00              = _mm256_mul_ps(rsq00,rinv00);
782             r00              = _mm256_andnot_ps(dummy_mask,r00);
783
784             /* Calculate table index by multiplying r with table scale and truncate to integer */
785             rt               = _mm256_mul_ps(r00,vftabscale);
786             vfitab           = _mm256_cvttps_epi32(rt);
787             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
788             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
789             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
790             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
791             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
792             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
793
794             /* CUBIC SPLINE TABLE DISPERSION */
795             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
796                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
797             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
798                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
799             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
800                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
801             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
802                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
803             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
804             Heps             = _mm256_mul_ps(vfeps,H);
805             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
806             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
807             vvdw6            = _mm256_mul_ps(c6_00,VV);
808             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
809             fvdw6            = _mm256_mul_ps(c6_00,FF);
810
811             /* CUBIC SPLINE TABLE REPULSION */
812             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
813             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
814             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
815                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
816             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
817                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
818             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
819                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
820             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
821                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
822             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
823             Heps             = _mm256_mul_ps(vfeps,H);
824             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
825             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
826             vvdw12           = _mm256_mul_ps(c12_00,VV);
827             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
828             fvdw12           = _mm256_mul_ps(c12_00,FF);
829             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
830             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
831
832             /* Update potential sum for this i atom from the interaction with this j atom. */
833             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
834             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
835
836             fscal            = fvdw;
837
838             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
839
840             /* Calculate temporary vectorial force */
841             tx               = _mm256_mul_ps(fscal,dx00);
842             ty               = _mm256_mul_ps(fscal,dy00);
843             tz               = _mm256_mul_ps(fscal,dz00);
844
845             /* Update vectorial force */
846             fix0             = _mm256_add_ps(fix0,tx);
847             fiy0             = _mm256_add_ps(fiy0,ty);
848             fiz0             = _mm256_add_ps(fiz0,tz);
849
850             fjx0             = _mm256_add_ps(fjx0,tx);
851             fjy0             = _mm256_add_ps(fjy0,ty);
852             fjz0             = _mm256_add_ps(fjz0,tz);
853
854             /**************************
855              * CALCULATE INTERACTIONS *
856              **************************/
857
858             /* REACTION-FIELD ELECTROSTATICS */
859             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
860             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
861
862             /* Update potential sum for this i atom from the interaction with this j atom. */
863             velec            = _mm256_andnot_ps(dummy_mask,velec);
864             velecsum         = _mm256_add_ps(velecsum,velec);
865
866             fscal            = felec;
867
868             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
869
870             /* Calculate temporary vectorial force */
871             tx               = _mm256_mul_ps(fscal,dx11);
872             ty               = _mm256_mul_ps(fscal,dy11);
873             tz               = _mm256_mul_ps(fscal,dz11);
874
875             /* Update vectorial force */
876             fix1             = _mm256_add_ps(fix1,tx);
877             fiy1             = _mm256_add_ps(fiy1,ty);
878             fiz1             = _mm256_add_ps(fiz1,tz);
879
880             fjx1             = _mm256_add_ps(fjx1,tx);
881             fjy1             = _mm256_add_ps(fjy1,ty);
882             fjz1             = _mm256_add_ps(fjz1,tz);
883
884             /**************************
885              * CALCULATE INTERACTIONS *
886              **************************/
887
888             /* REACTION-FIELD ELECTROSTATICS */
889             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
890             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
891
892             /* Update potential sum for this i atom from the interaction with this j atom. */
893             velec            = _mm256_andnot_ps(dummy_mask,velec);
894             velecsum         = _mm256_add_ps(velecsum,velec);
895
896             fscal            = felec;
897
898             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
899
900             /* Calculate temporary vectorial force */
901             tx               = _mm256_mul_ps(fscal,dx12);
902             ty               = _mm256_mul_ps(fscal,dy12);
903             tz               = _mm256_mul_ps(fscal,dz12);
904
905             /* Update vectorial force */
906             fix1             = _mm256_add_ps(fix1,tx);
907             fiy1             = _mm256_add_ps(fiy1,ty);
908             fiz1             = _mm256_add_ps(fiz1,tz);
909
910             fjx2             = _mm256_add_ps(fjx2,tx);
911             fjy2             = _mm256_add_ps(fjy2,ty);
912             fjz2             = _mm256_add_ps(fjz2,tz);
913
914             /**************************
915              * CALCULATE INTERACTIONS *
916              **************************/
917
918             /* REACTION-FIELD ELECTROSTATICS */
919             velec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
920             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
921
922             /* Update potential sum for this i atom from the interaction with this j atom. */
923             velec            = _mm256_andnot_ps(dummy_mask,velec);
924             velecsum         = _mm256_add_ps(velecsum,velec);
925
926             fscal            = felec;
927
928             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
929
930             /* Calculate temporary vectorial force */
931             tx               = _mm256_mul_ps(fscal,dx13);
932             ty               = _mm256_mul_ps(fscal,dy13);
933             tz               = _mm256_mul_ps(fscal,dz13);
934
935             /* Update vectorial force */
936             fix1             = _mm256_add_ps(fix1,tx);
937             fiy1             = _mm256_add_ps(fiy1,ty);
938             fiz1             = _mm256_add_ps(fiz1,tz);
939
940             fjx3             = _mm256_add_ps(fjx3,tx);
941             fjy3             = _mm256_add_ps(fjy3,ty);
942             fjz3             = _mm256_add_ps(fjz3,tz);
943
944             /**************************
945              * CALCULATE INTERACTIONS *
946              **************************/
947
948             /* REACTION-FIELD ELECTROSTATICS */
949             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
950             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
951
952             /* Update potential sum for this i atom from the interaction with this j atom. */
953             velec            = _mm256_andnot_ps(dummy_mask,velec);
954             velecsum         = _mm256_add_ps(velecsum,velec);
955
956             fscal            = felec;
957
958             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
959
960             /* Calculate temporary vectorial force */
961             tx               = _mm256_mul_ps(fscal,dx21);
962             ty               = _mm256_mul_ps(fscal,dy21);
963             tz               = _mm256_mul_ps(fscal,dz21);
964
965             /* Update vectorial force */
966             fix2             = _mm256_add_ps(fix2,tx);
967             fiy2             = _mm256_add_ps(fiy2,ty);
968             fiz2             = _mm256_add_ps(fiz2,tz);
969
970             fjx1             = _mm256_add_ps(fjx1,tx);
971             fjy1             = _mm256_add_ps(fjy1,ty);
972             fjz1             = _mm256_add_ps(fjz1,tz);
973
974             /**************************
975              * CALCULATE INTERACTIONS *
976              **************************/
977
978             /* REACTION-FIELD ELECTROSTATICS */
979             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
980             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
981
982             /* Update potential sum for this i atom from the interaction with this j atom. */
983             velec            = _mm256_andnot_ps(dummy_mask,velec);
984             velecsum         = _mm256_add_ps(velecsum,velec);
985
986             fscal            = felec;
987
988             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
989
990             /* Calculate temporary vectorial force */
991             tx               = _mm256_mul_ps(fscal,dx22);
992             ty               = _mm256_mul_ps(fscal,dy22);
993             tz               = _mm256_mul_ps(fscal,dz22);
994
995             /* Update vectorial force */
996             fix2             = _mm256_add_ps(fix2,tx);
997             fiy2             = _mm256_add_ps(fiy2,ty);
998             fiz2             = _mm256_add_ps(fiz2,tz);
999
1000             fjx2             = _mm256_add_ps(fjx2,tx);
1001             fjy2             = _mm256_add_ps(fjy2,ty);
1002             fjz2             = _mm256_add_ps(fjz2,tz);
1003
1004             /**************************
1005              * CALCULATE INTERACTIONS *
1006              **************************/
1007
1008             /* REACTION-FIELD ELECTROSTATICS */
1009             velec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
1010             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1011
1012             /* Update potential sum for this i atom from the interaction with this j atom. */
1013             velec            = _mm256_andnot_ps(dummy_mask,velec);
1014             velecsum         = _mm256_add_ps(velecsum,velec);
1015
1016             fscal            = felec;
1017
1018             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1019
1020             /* Calculate temporary vectorial force */
1021             tx               = _mm256_mul_ps(fscal,dx23);
1022             ty               = _mm256_mul_ps(fscal,dy23);
1023             tz               = _mm256_mul_ps(fscal,dz23);
1024
1025             /* Update vectorial force */
1026             fix2             = _mm256_add_ps(fix2,tx);
1027             fiy2             = _mm256_add_ps(fiy2,ty);
1028             fiz2             = _mm256_add_ps(fiz2,tz);
1029
1030             fjx3             = _mm256_add_ps(fjx3,tx);
1031             fjy3             = _mm256_add_ps(fjy3,ty);
1032             fjz3             = _mm256_add_ps(fjz3,tz);
1033
1034             /**************************
1035              * CALCULATE INTERACTIONS *
1036              **************************/
1037
1038             /* REACTION-FIELD ELECTROSTATICS */
1039             velec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
1040             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1041
1042             /* Update potential sum for this i atom from the interaction with this j atom. */
1043             velec            = _mm256_andnot_ps(dummy_mask,velec);
1044             velecsum         = _mm256_add_ps(velecsum,velec);
1045
1046             fscal            = felec;
1047
1048             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1049
1050             /* Calculate temporary vectorial force */
1051             tx               = _mm256_mul_ps(fscal,dx31);
1052             ty               = _mm256_mul_ps(fscal,dy31);
1053             tz               = _mm256_mul_ps(fscal,dz31);
1054
1055             /* Update vectorial force */
1056             fix3             = _mm256_add_ps(fix3,tx);
1057             fiy3             = _mm256_add_ps(fiy3,ty);
1058             fiz3             = _mm256_add_ps(fiz3,tz);
1059
1060             fjx1             = _mm256_add_ps(fjx1,tx);
1061             fjy1             = _mm256_add_ps(fjy1,ty);
1062             fjz1             = _mm256_add_ps(fjz1,tz);
1063
1064             /**************************
1065              * CALCULATE INTERACTIONS *
1066              **************************/
1067
1068             /* REACTION-FIELD ELECTROSTATICS */
1069             velec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
1070             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1071
1072             /* Update potential sum for this i atom from the interaction with this j atom. */
1073             velec            = _mm256_andnot_ps(dummy_mask,velec);
1074             velecsum         = _mm256_add_ps(velecsum,velec);
1075
1076             fscal            = felec;
1077
1078             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1079
1080             /* Calculate temporary vectorial force */
1081             tx               = _mm256_mul_ps(fscal,dx32);
1082             ty               = _mm256_mul_ps(fscal,dy32);
1083             tz               = _mm256_mul_ps(fscal,dz32);
1084
1085             /* Update vectorial force */
1086             fix3             = _mm256_add_ps(fix3,tx);
1087             fiy3             = _mm256_add_ps(fiy3,ty);
1088             fiz3             = _mm256_add_ps(fiz3,tz);
1089
1090             fjx2             = _mm256_add_ps(fjx2,tx);
1091             fjy2             = _mm256_add_ps(fjy2,ty);
1092             fjz2             = _mm256_add_ps(fjz2,tz);
1093
1094             /**************************
1095              * CALCULATE INTERACTIONS *
1096              **************************/
1097
1098             /* REACTION-FIELD ELECTROSTATICS */
1099             velec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
1100             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1101
1102             /* Update potential sum for this i atom from the interaction with this j atom. */
1103             velec            = _mm256_andnot_ps(dummy_mask,velec);
1104             velecsum         = _mm256_add_ps(velecsum,velec);
1105
1106             fscal            = felec;
1107
1108             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1109
1110             /* Calculate temporary vectorial force */
1111             tx               = _mm256_mul_ps(fscal,dx33);
1112             ty               = _mm256_mul_ps(fscal,dy33);
1113             tz               = _mm256_mul_ps(fscal,dz33);
1114
1115             /* Update vectorial force */
1116             fix3             = _mm256_add_ps(fix3,tx);
1117             fiy3             = _mm256_add_ps(fiy3,ty);
1118             fiz3             = _mm256_add_ps(fiz3,tz);
1119
1120             fjx3             = _mm256_add_ps(fjx3,tx);
1121             fjy3             = _mm256_add_ps(fjy3,ty);
1122             fjz3             = _mm256_add_ps(fjz3,tz);
1123
1124             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1125             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1126             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1127             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1128             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1129             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1130             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1131             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1132
1133             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1134                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1135                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1136
1137             /* Inner loop uses 348 flops */
1138         }
1139
1140         /* End of innermost loop */
1141
1142         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1143                                                  f+i_coord_offset,fshift+i_shift_offset);
1144
1145         ggid                        = gid[iidx];
1146         /* Update potential energies */
1147         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1148         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1149
1150         /* Increment number of inner iterations */
1151         inneriter                  += j_index_end - j_index_start;
1152
1153         /* Outer loop uses 26 flops */
1154     }
1155
1156     /* Increment number of outer iterations */
1157     outeriter        += nri;
1158
1159     /* Update outer/inner flops */
1160
1161     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*348);
1162 }
1163 /*
1164  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_single
1165  * Electrostatics interaction: ReactionField
1166  * VdW interaction:            CubicSplineTable
1167  * Geometry:                   Water4-Water4
1168  * Calculate force/pot:        Force
1169  */
1170 void
1171 nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_single
1172                     (t_nblist * gmx_restrict                nlist,
1173                      rvec * gmx_restrict                    xx,
1174                      rvec * gmx_restrict                    ff,
1175                      t_forcerec * gmx_restrict              fr,
1176                      t_mdatoms * gmx_restrict               mdatoms,
1177                      nb_kernel_data_t * gmx_restrict        kernel_data,
1178                      t_nrnb * gmx_restrict                  nrnb)
1179 {
1180     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1181      * just 0 for non-waters.
1182      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1183      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1184      */
1185     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1186     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1187     int              jnrA,jnrB,jnrC,jnrD;
1188     int              jnrE,jnrF,jnrG,jnrH;
1189     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1190     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1191     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1192     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1193     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1194     real             rcutoff_scalar;
1195     real             *shiftvec,*fshift,*x,*f;
1196     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1197     real             scratch[4*DIM];
1198     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1199     real *           vdwioffsetptr0;
1200     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1201     real *           vdwioffsetptr1;
1202     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1203     real *           vdwioffsetptr2;
1204     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1205     real *           vdwioffsetptr3;
1206     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1207     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1208     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1209     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1210     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1211     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1212     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1213     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1214     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1215     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1216     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1217     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1218     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1219     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1220     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1221     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1222     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1223     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1224     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1225     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1226     real             *charge;
1227     int              nvdwtype;
1228     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1229     int              *vdwtype;
1230     real             *vdwparam;
1231     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1232     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1233     __m256i          vfitab;
1234     __m128i          vfitab_lo,vfitab_hi;
1235     __m128i          ifour       = _mm_set1_epi32(4);
1236     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1237     real             *vftab;
1238     __m256           dummy_mask,cutoff_mask;
1239     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1240     __m256           one     = _mm256_set1_ps(1.0);
1241     __m256           two     = _mm256_set1_ps(2.0);
1242     x                = xx[0];
1243     f                = ff[0];
1244
1245     nri              = nlist->nri;
1246     iinr             = nlist->iinr;
1247     jindex           = nlist->jindex;
1248     jjnr             = nlist->jjnr;
1249     shiftidx         = nlist->shift;
1250     gid              = nlist->gid;
1251     shiftvec         = fr->shift_vec[0];
1252     fshift           = fr->fshift[0];
1253     facel            = _mm256_set1_ps(fr->epsfac);
1254     charge           = mdatoms->chargeA;
1255     krf              = _mm256_set1_ps(fr->ic->k_rf);
1256     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
1257     crf              = _mm256_set1_ps(fr->ic->c_rf);
1258     nvdwtype         = fr->ntype;
1259     vdwparam         = fr->nbfp;
1260     vdwtype          = mdatoms->typeA;
1261
1262     vftab            = kernel_data->table_vdw->data;
1263     vftabscale       = _mm256_set1_ps(kernel_data->table_vdw->scale);
1264
1265     /* Setup water-specific parameters */
1266     inr              = nlist->iinr[0];
1267     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1268     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1269     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1270     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1271
1272     jq1              = _mm256_set1_ps(charge[inr+1]);
1273     jq2              = _mm256_set1_ps(charge[inr+2]);
1274     jq3              = _mm256_set1_ps(charge[inr+3]);
1275     vdwjidx0A        = 2*vdwtype[inr+0];
1276     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1277     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1278     qq11             = _mm256_mul_ps(iq1,jq1);
1279     qq12             = _mm256_mul_ps(iq1,jq2);
1280     qq13             = _mm256_mul_ps(iq1,jq3);
1281     qq21             = _mm256_mul_ps(iq2,jq1);
1282     qq22             = _mm256_mul_ps(iq2,jq2);
1283     qq23             = _mm256_mul_ps(iq2,jq3);
1284     qq31             = _mm256_mul_ps(iq3,jq1);
1285     qq32             = _mm256_mul_ps(iq3,jq2);
1286     qq33             = _mm256_mul_ps(iq3,jq3);
1287
1288     /* Avoid stupid compiler warnings */
1289     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1290     j_coord_offsetA = 0;
1291     j_coord_offsetB = 0;
1292     j_coord_offsetC = 0;
1293     j_coord_offsetD = 0;
1294     j_coord_offsetE = 0;
1295     j_coord_offsetF = 0;
1296     j_coord_offsetG = 0;
1297     j_coord_offsetH = 0;
1298
1299     outeriter        = 0;
1300     inneriter        = 0;
1301
1302     for(iidx=0;iidx<4*DIM;iidx++)
1303     {
1304         scratch[iidx] = 0.0;
1305     }
1306
1307     /* Start outer loop over neighborlists */
1308     for(iidx=0; iidx<nri; iidx++)
1309     {
1310         /* Load shift vector for this list */
1311         i_shift_offset   = DIM*shiftidx[iidx];
1312
1313         /* Load limits for loop over neighbors */
1314         j_index_start    = jindex[iidx];
1315         j_index_end      = jindex[iidx+1];
1316
1317         /* Get outer coordinate index */
1318         inr              = iinr[iidx];
1319         i_coord_offset   = DIM*inr;
1320
1321         /* Load i particle coords and add shift vector */
1322         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1323                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1324
1325         fix0             = _mm256_setzero_ps();
1326         fiy0             = _mm256_setzero_ps();
1327         fiz0             = _mm256_setzero_ps();
1328         fix1             = _mm256_setzero_ps();
1329         fiy1             = _mm256_setzero_ps();
1330         fiz1             = _mm256_setzero_ps();
1331         fix2             = _mm256_setzero_ps();
1332         fiy2             = _mm256_setzero_ps();
1333         fiz2             = _mm256_setzero_ps();
1334         fix3             = _mm256_setzero_ps();
1335         fiy3             = _mm256_setzero_ps();
1336         fiz3             = _mm256_setzero_ps();
1337
1338         /* Start inner kernel loop */
1339         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1340         {
1341
1342             /* Get j neighbor index, and coordinate index */
1343             jnrA             = jjnr[jidx];
1344             jnrB             = jjnr[jidx+1];
1345             jnrC             = jjnr[jidx+2];
1346             jnrD             = jjnr[jidx+3];
1347             jnrE             = jjnr[jidx+4];
1348             jnrF             = jjnr[jidx+5];
1349             jnrG             = jjnr[jidx+6];
1350             jnrH             = jjnr[jidx+7];
1351             j_coord_offsetA  = DIM*jnrA;
1352             j_coord_offsetB  = DIM*jnrB;
1353             j_coord_offsetC  = DIM*jnrC;
1354             j_coord_offsetD  = DIM*jnrD;
1355             j_coord_offsetE  = DIM*jnrE;
1356             j_coord_offsetF  = DIM*jnrF;
1357             j_coord_offsetG  = DIM*jnrG;
1358             j_coord_offsetH  = DIM*jnrH;
1359
1360             /* load j atom coordinates */
1361             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1362                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1363                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1364                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1365                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1366                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1367
1368             /* Calculate displacement vector */
1369             dx00             = _mm256_sub_ps(ix0,jx0);
1370             dy00             = _mm256_sub_ps(iy0,jy0);
1371             dz00             = _mm256_sub_ps(iz0,jz0);
1372             dx11             = _mm256_sub_ps(ix1,jx1);
1373             dy11             = _mm256_sub_ps(iy1,jy1);
1374             dz11             = _mm256_sub_ps(iz1,jz1);
1375             dx12             = _mm256_sub_ps(ix1,jx2);
1376             dy12             = _mm256_sub_ps(iy1,jy2);
1377             dz12             = _mm256_sub_ps(iz1,jz2);
1378             dx13             = _mm256_sub_ps(ix1,jx3);
1379             dy13             = _mm256_sub_ps(iy1,jy3);
1380             dz13             = _mm256_sub_ps(iz1,jz3);
1381             dx21             = _mm256_sub_ps(ix2,jx1);
1382             dy21             = _mm256_sub_ps(iy2,jy1);
1383             dz21             = _mm256_sub_ps(iz2,jz1);
1384             dx22             = _mm256_sub_ps(ix2,jx2);
1385             dy22             = _mm256_sub_ps(iy2,jy2);
1386             dz22             = _mm256_sub_ps(iz2,jz2);
1387             dx23             = _mm256_sub_ps(ix2,jx3);
1388             dy23             = _mm256_sub_ps(iy2,jy3);
1389             dz23             = _mm256_sub_ps(iz2,jz3);
1390             dx31             = _mm256_sub_ps(ix3,jx1);
1391             dy31             = _mm256_sub_ps(iy3,jy1);
1392             dz31             = _mm256_sub_ps(iz3,jz1);
1393             dx32             = _mm256_sub_ps(ix3,jx2);
1394             dy32             = _mm256_sub_ps(iy3,jy2);
1395             dz32             = _mm256_sub_ps(iz3,jz2);
1396             dx33             = _mm256_sub_ps(ix3,jx3);
1397             dy33             = _mm256_sub_ps(iy3,jy3);
1398             dz33             = _mm256_sub_ps(iz3,jz3);
1399
1400             /* Calculate squared distance and things based on it */
1401             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1402             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1403             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1404             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1405             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1406             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1407             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1408             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1409             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1410             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1411
1412             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1413             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1414             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1415             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1416             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1417             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1418             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1419             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1420             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1421             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1422
1423             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1424             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1425             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1426             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1427             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1428             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1429             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1430             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1431             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1432
1433             fjx0             = _mm256_setzero_ps();
1434             fjy0             = _mm256_setzero_ps();
1435             fjz0             = _mm256_setzero_ps();
1436             fjx1             = _mm256_setzero_ps();
1437             fjy1             = _mm256_setzero_ps();
1438             fjz1             = _mm256_setzero_ps();
1439             fjx2             = _mm256_setzero_ps();
1440             fjy2             = _mm256_setzero_ps();
1441             fjz2             = _mm256_setzero_ps();
1442             fjx3             = _mm256_setzero_ps();
1443             fjy3             = _mm256_setzero_ps();
1444             fjz3             = _mm256_setzero_ps();
1445
1446             /**************************
1447              * CALCULATE INTERACTIONS *
1448              **************************/
1449
1450             r00              = _mm256_mul_ps(rsq00,rinv00);
1451
1452             /* Calculate table index by multiplying r with table scale and truncate to integer */
1453             rt               = _mm256_mul_ps(r00,vftabscale);
1454             vfitab           = _mm256_cvttps_epi32(rt);
1455             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1456             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1457             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1458             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1459             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
1460             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
1461
1462             /* CUBIC SPLINE TABLE DISPERSION */
1463             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1464                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1465             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1466                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1467             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1468                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1469             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1470                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1471             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1472             Heps             = _mm256_mul_ps(vfeps,H);
1473             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1474             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1475             fvdw6            = _mm256_mul_ps(c6_00,FF);
1476
1477             /* CUBIC SPLINE TABLE REPULSION */
1478             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1479             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1480             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1481                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1482             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1483                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1484             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1485                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1486             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1487                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1488             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1489             Heps             = _mm256_mul_ps(vfeps,H);
1490             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1491             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1492             fvdw12           = _mm256_mul_ps(c12_00,FF);
1493             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1494
1495             fscal            = fvdw;
1496
1497             /* Calculate temporary vectorial force */
1498             tx               = _mm256_mul_ps(fscal,dx00);
1499             ty               = _mm256_mul_ps(fscal,dy00);
1500             tz               = _mm256_mul_ps(fscal,dz00);
1501
1502             /* Update vectorial force */
1503             fix0             = _mm256_add_ps(fix0,tx);
1504             fiy0             = _mm256_add_ps(fiy0,ty);
1505             fiz0             = _mm256_add_ps(fiz0,tz);
1506
1507             fjx0             = _mm256_add_ps(fjx0,tx);
1508             fjy0             = _mm256_add_ps(fjy0,ty);
1509             fjz0             = _mm256_add_ps(fjz0,tz);
1510
1511             /**************************
1512              * CALCULATE INTERACTIONS *
1513              **************************/
1514
1515             /* REACTION-FIELD ELECTROSTATICS */
1516             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1517
1518             fscal            = felec;
1519
1520             /* Calculate temporary vectorial force */
1521             tx               = _mm256_mul_ps(fscal,dx11);
1522             ty               = _mm256_mul_ps(fscal,dy11);
1523             tz               = _mm256_mul_ps(fscal,dz11);
1524
1525             /* Update vectorial force */
1526             fix1             = _mm256_add_ps(fix1,tx);
1527             fiy1             = _mm256_add_ps(fiy1,ty);
1528             fiz1             = _mm256_add_ps(fiz1,tz);
1529
1530             fjx1             = _mm256_add_ps(fjx1,tx);
1531             fjy1             = _mm256_add_ps(fjy1,ty);
1532             fjz1             = _mm256_add_ps(fjz1,tz);
1533
1534             /**************************
1535              * CALCULATE INTERACTIONS *
1536              **************************/
1537
1538             /* REACTION-FIELD ELECTROSTATICS */
1539             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1540
1541             fscal            = felec;
1542
1543             /* Calculate temporary vectorial force */
1544             tx               = _mm256_mul_ps(fscal,dx12);
1545             ty               = _mm256_mul_ps(fscal,dy12);
1546             tz               = _mm256_mul_ps(fscal,dz12);
1547
1548             /* Update vectorial force */
1549             fix1             = _mm256_add_ps(fix1,tx);
1550             fiy1             = _mm256_add_ps(fiy1,ty);
1551             fiz1             = _mm256_add_ps(fiz1,tz);
1552
1553             fjx2             = _mm256_add_ps(fjx2,tx);
1554             fjy2             = _mm256_add_ps(fjy2,ty);
1555             fjz2             = _mm256_add_ps(fjz2,tz);
1556
1557             /**************************
1558              * CALCULATE INTERACTIONS *
1559              **************************/
1560
1561             /* REACTION-FIELD ELECTROSTATICS */
1562             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1563
1564             fscal            = felec;
1565
1566             /* Calculate temporary vectorial force */
1567             tx               = _mm256_mul_ps(fscal,dx13);
1568             ty               = _mm256_mul_ps(fscal,dy13);
1569             tz               = _mm256_mul_ps(fscal,dz13);
1570
1571             /* Update vectorial force */
1572             fix1             = _mm256_add_ps(fix1,tx);
1573             fiy1             = _mm256_add_ps(fiy1,ty);
1574             fiz1             = _mm256_add_ps(fiz1,tz);
1575
1576             fjx3             = _mm256_add_ps(fjx3,tx);
1577             fjy3             = _mm256_add_ps(fjy3,ty);
1578             fjz3             = _mm256_add_ps(fjz3,tz);
1579
1580             /**************************
1581              * CALCULATE INTERACTIONS *
1582              **************************/
1583
1584             /* REACTION-FIELD ELECTROSTATICS */
1585             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1586
1587             fscal            = felec;
1588
1589             /* Calculate temporary vectorial force */
1590             tx               = _mm256_mul_ps(fscal,dx21);
1591             ty               = _mm256_mul_ps(fscal,dy21);
1592             tz               = _mm256_mul_ps(fscal,dz21);
1593
1594             /* Update vectorial force */
1595             fix2             = _mm256_add_ps(fix2,tx);
1596             fiy2             = _mm256_add_ps(fiy2,ty);
1597             fiz2             = _mm256_add_ps(fiz2,tz);
1598
1599             fjx1             = _mm256_add_ps(fjx1,tx);
1600             fjy1             = _mm256_add_ps(fjy1,ty);
1601             fjz1             = _mm256_add_ps(fjz1,tz);
1602
1603             /**************************
1604              * CALCULATE INTERACTIONS *
1605              **************************/
1606
1607             /* REACTION-FIELD ELECTROSTATICS */
1608             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1609
1610             fscal            = felec;
1611
1612             /* Calculate temporary vectorial force */
1613             tx               = _mm256_mul_ps(fscal,dx22);
1614             ty               = _mm256_mul_ps(fscal,dy22);
1615             tz               = _mm256_mul_ps(fscal,dz22);
1616
1617             /* Update vectorial force */
1618             fix2             = _mm256_add_ps(fix2,tx);
1619             fiy2             = _mm256_add_ps(fiy2,ty);
1620             fiz2             = _mm256_add_ps(fiz2,tz);
1621
1622             fjx2             = _mm256_add_ps(fjx2,tx);
1623             fjy2             = _mm256_add_ps(fjy2,ty);
1624             fjz2             = _mm256_add_ps(fjz2,tz);
1625
1626             /**************************
1627              * CALCULATE INTERACTIONS *
1628              **************************/
1629
1630             /* REACTION-FIELD ELECTROSTATICS */
1631             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1632
1633             fscal            = felec;
1634
1635             /* Calculate temporary vectorial force */
1636             tx               = _mm256_mul_ps(fscal,dx23);
1637             ty               = _mm256_mul_ps(fscal,dy23);
1638             tz               = _mm256_mul_ps(fscal,dz23);
1639
1640             /* Update vectorial force */
1641             fix2             = _mm256_add_ps(fix2,tx);
1642             fiy2             = _mm256_add_ps(fiy2,ty);
1643             fiz2             = _mm256_add_ps(fiz2,tz);
1644
1645             fjx3             = _mm256_add_ps(fjx3,tx);
1646             fjy3             = _mm256_add_ps(fjy3,ty);
1647             fjz3             = _mm256_add_ps(fjz3,tz);
1648
1649             /**************************
1650              * CALCULATE INTERACTIONS *
1651              **************************/
1652
1653             /* REACTION-FIELD ELECTROSTATICS */
1654             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1655
1656             fscal            = felec;
1657
1658             /* Calculate temporary vectorial force */
1659             tx               = _mm256_mul_ps(fscal,dx31);
1660             ty               = _mm256_mul_ps(fscal,dy31);
1661             tz               = _mm256_mul_ps(fscal,dz31);
1662
1663             /* Update vectorial force */
1664             fix3             = _mm256_add_ps(fix3,tx);
1665             fiy3             = _mm256_add_ps(fiy3,ty);
1666             fiz3             = _mm256_add_ps(fiz3,tz);
1667
1668             fjx1             = _mm256_add_ps(fjx1,tx);
1669             fjy1             = _mm256_add_ps(fjy1,ty);
1670             fjz1             = _mm256_add_ps(fjz1,tz);
1671
1672             /**************************
1673              * CALCULATE INTERACTIONS *
1674              **************************/
1675
1676             /* REACTION-FIELD ELECTROSTATICS */
1677             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1678
1679             fscal            = felec;
1680
1681             /* Calculate temporary vectorial force */
1682             tx               = _mm256_mul_ps(fscal,dx32);
1683             ty               = _mm256_mul_ps(fscal,dy32);
1684             tz               = _mm256_mul_ps(fscal,dz32);
1685
1686             /* Update vectorial force */
1687             fix3             = _mm256_add_ps(fix3,tx);
1688             fiy3             = _mm256_add_ps(fiy3,ty);
1689             fiz3             = _mm256_add_ps(fiz3,tz);
1690
1691             fjx2             = _mm256_add_ps(fjx2,tx);
1692             fjy2             = _mm256_add_ps(fjy2,ty);
1693             fjz2             = _mm256_add_ps(fjz2,tz);
1694
1695             /**************************
1696              * CALCULATE INTERACTIONS *
1697              **************************/
1698
1699             /* REACTION-FIELD ELECTROSTATICS */
1700             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1701
1702             fscal            = felec;
1703
1704             /* Calculate temporary vectorial force */
1705             tx               = _mm256_mul_ps(fscal,dx33);
1706             ty               = _mm256_mul_ps(fscal,dy33);
1707             tz               = _mm256_mul_ps(fscal,dz33);
1708
1709             /* Update vectorial force */
1710             fix3             = _mm256_add_ps(fix3,tx);
1711             fiy3             = _mm256_add_ps(fiy3,ty);
1712             fiz3             = _mm256_add_ps(fiz3,tz);
1713
1714             fjx3             = _mm256_add_ps(fjx3,tx);
1715             fjy3             = _mm256_add_ps(fjy3,ty);
1716             fjz3             = _mm256_add_ps(fjz3,tz);
1717
1718             fjptrA             = f+j_coord_offsetA;
1719             fjptrB             = f+j_coord_offsetB;
1720             fjptrC             = f+j_coord_offsetC;
1721             fjptrD             = f+j_coord_offsetD;
1722             fjptrE             = f+j_coord_offsetE;
1723             fjptrF             = f+j_coord_offsetF;
1724             fjptrG             = f+j_coord_offsetG;
1725             fjptrH             = f+j_coord_offsetH;
1726
1727             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1728                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1729                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1730
1731             /* Inner loop uses 294 flops */
1732         }
1733
1734         if(jidx<j_index_end)
1735         {
1736
1737             /* Get j neighbor index, and coordinate index */
1738             jnrlistA         = jjnr[jidx];
1739             jnrlistB         = jjnr[jidx+1];
1740             jnrlistC         = jjnr[jidx+2];
1741             jnrlistD         = jjnr[jidx+3];
1742             jnrlistE         = jjnr[jidx+4];
1743             jnrlistF         = jjnr[jidx+5];
1744             jnrlistG         = jjnr[jidx+6];
1745             jnrlistH         = jjnr[jidx+7];
1746             /* Sign of each element will be negative for non-real atoms.
1747              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1748              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1749              */
1750             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1751                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1752                                             
1753             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1754             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1755             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1756             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1757             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1758             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1759             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1760             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1761             j_coord_offsetA  = DIM*jnrA;
1762             j_coord_offsetB  = DIM*jnrB;
1763             j_coord_offsetC  = DIM*jnrC;
1764             j_coord_offsetD  = DIM*jnrD;
1765             j_coord_offsetE  = DIM*jnrE;
1766             j_coord_offsetF  = DIM*jnrF;
1767             j_coord_offsetG  = DIM*jnrG;
1768             j_coord_offsetH  = DIM*jnrH;
1769
1770             /* load j atom coordinates */
1771             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1772                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1773                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1774                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1775                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1776                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1777
1778             /* Calculate displacement vector */
1779             dx00             = _mm256_sub_ps(ix0,jx0);
1780             dy00             = _mm256_sub_ps(iy0,jy0);
1781             dz00             = _mm256_sub_ps(iz0,jz0);
1782             dx11             = _mm256_sub_ps(ix1,jx1);
1783             dy11             = _mm256_sub_ps(iy1,jy1);
1784             dz11             = _mm256_sub_ps(iz1,jz1);
1785             dx12             = _mm256_sub_ps(ix1,jx2);
1786             dy12             = _mm256_sub_ps(iy1,jy2);
1787             dz12             = _mm256_sub_ps(iz1,jz2);
1788             dx13             = _mm256_sub_ps(ix1,jx3);
1789             dy13             = _mm256_sub_ps(iy1,jy3);
1790             dz13             = _mm256_sub_ps(iz1,jz3);
1791             dx21             = _mm256_sub_ps(ix2,jx1);
1792             dy21             = _mm256_sub_ps(iy2,jy1);
1793             dz21             = _mm256_sub_ps(iz2,jz1);
1794             dx22             = _mm256_sub_ps(ix2,jx2);
1795             dy22             = _mm256_sub_ps(iy2,jy2);
1796             dz22             = _mm256_sub_ps(iz2,jz2);
1797             dx23             = _mm256_sub_ps(ix2,jx3);
1798             dy23             = _mm256_sub_ps(iy2,jy3);
1799             dz23             = _mm256_sub_ps(iz2,jz3);
1800             dx31             = _mm256_sub_ps(ix3,jx1);
1801             dy31             = _mm256_sub_ps(iy3,jy1);
1802             dz31             = _mm256_sub_ps(iz3,jz1);
1803             dx32             = _mm256_sub_ps(ix3,jx2);
1804             dy32             = _mm256_sub_ps(iy3,jy2);
1805             dz32             = _mm256_sub_ps(iz3,jz2);
1806             dx33             = _mm256_sub_ps(ix3,jx3);
1807             dy33             = _mm256_sub_ps(iy3,jy3);
1808             dz33             = _mm256_sub_ps(iz3,jz3);
1809
1810             /* Calculate squared distance and things based on it */
1811             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1812             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1813             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1814             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1815             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1816             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1817             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1818             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1819             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1820             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1821
1822             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1823             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1824             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1825             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1826             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1827             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1828             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1829             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1830             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1831             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1832
1833             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1834             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1835             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1836             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1837             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1838             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1839             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1840             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1841             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1842
1843             fjx0             = _mm256_setzero_ps();
1844             fjy0             = _mm256_setzero_ps();
1845             fjz0             = _mm256_setzero_ps();
1846             fjx1             = _mm256_setzero_ps();
1847             fjy1             = _mm256_setzero_ps();
1848             fjz1             = _mm256_setzero_ps();
1849             fjx2             = _mm256_setzero_ps();
1850             fjy2             = _mm256_setzero_ps();
1851             fjz2             = _mm256_setzero_ps();
1852             fjx3             = _mm256_setzero_ps();
1853             fjy3             = _mm256_setzero_ps();
1854             fjz3             = _mm256_setzero_ps();
1855
1856             /**************************
1857              * CALCULATE INTERACTIONS *
1858              **************************/
1859
1860             r00              = _mm256_mul_ps(rsq00,rinv00);
1861             r00              = _mm256_andnot_ps(dummy_mask,r00);
1862
1863             /* Calculate table index by multiplying r with table scale and truncate to integer */
1864             rt               = _mm256_mul_ps(r00,vftabscale);
1865             vfitab           = _mm256_cvttps_epi32(rt);
1866             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1867             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1868             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1869             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1870             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
1871             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
1872
1873             /* CUBIC SPLINE TABLE DISPERSION */
1874             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1875                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1876             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1877                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1878             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1879                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1880             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1881                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1882             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1883             Heps             = _mm256_mul_ps(vfeps,H);
1884             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1885             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1886             fvdw6            = _mm256_mul_ps(c6_00,FF);
1887
1888             /* CUBIC SPLINE TABLE REPULSION */
1889             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1890             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1891             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1892                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1893             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1894                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1895             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1896                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1897             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1898                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1899             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1900             Heps             = _mm256_mul_ps(vfeps,H);
1901             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1902             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1903             fvdw12           = _mm256_mul_ps(c12_00,FF);
1904             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1905
1906             fscal            = fvdw;
1907
1908             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1909
1910             /* Calculate temporary vectorial force */
1911             tx               = _mm256_mul_ps(fscal,dx00);
1912             ty               = _mm256_mul_ps(fscal,dy00);
1913             tz               = _mm256_mul_ps(fscal,dz00);
1914
1915             /* Update vectorial force */
1916             fix0             = _mm256_add_ps(fix0,tx);
1917             fiy0             = _mm256_add_ps(fiy0,ty);
1918             fiz0             = _mm256_add_ps(fiz0,tz);
1919
1920             fjx0             = _mm256_add_ps(fjx0,tx);
1921             fjy0             = _mm256_add_ps(fjy0,ty);
1922             fjz0             = _mm256_add_ps(fjz0,tz);
1923
1924             /**************************
1925              * CALCULATE INTERACTIONS *
1926              **************************/
1927
1928             /* REACTION-FIELD ELECTROSTATICS */
1929             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1930
1931             fscal            = felec;
1932
1933             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1934
1935             /* Calculate temporary vectorial force */
1936             tx               = _mm256_mul_ps(fscal,dx11);
1937             ty               = _mm256_mul_ps(fscal,dy11);
1938             tz               = _mm256_mul_ps(fscal,dz11);
1939
1940             /* Update vectorial force */
1941             fix1             = _mm256_add_ps(fix1,tx);
1942             fiy1             = _mm256_add_ps(fiy1,ty);
1943             fiz1             = _mm256_add_ps(fiz1,tz);
1944
1945             fjx1             = _mm256_add_ps(fjx1,tx);
1946             fjy1             = _mm256_add_ps(fjy1,ty);
1947             fjz1             = _mm256_add_ps(fjz1,tz);
1948
1949             /**************************
1950              * CALCULATE INTERACTIONS *
1951              **************************/
1952
1953             /* REACTION-FIELD ELECTROSTATICS */
1954             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1955
1956             fscal            = felec;
1957
1958             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1959
1960             /* Calculate temporary vectorial force */
1961             tx               = _mm256_mul_ps(fscal,dx12);
1962             ty               = _mm256_mul_ps(fscal,dy12);
1963             tz               = _mm256_mul_ps(fscal,dz12);
1964
1965             /* Update vectorial force */
1966             fix1             = _mm256_add_ps(fix1,tx);
1967             fiy1             = _mm256_add_ps(fiy1,ty);
1968             fiz1             = _mm256_add_ps(fiz1,tz);
1969
1970             fjx2             = _mm256_add_ps(fjx2,tx);
1971             fjy2             = _mm256_add_ps(fjy2,ty);
1972             fjz2             = _mm256_add_ps(fjz2,tz);
1973
1974             /**************************
1975              * CALCULATE INTERACTIONS *
1976              **************************/
1977
1978             /* REACTION-FIELD ELECTROSTATICS */
1979             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1980
1981             fscal            = felec;
1982
1983             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1984
1985             /* Calculate temporary vectorial force */
1986             tx               = _mm256_mul_ps(fscal,dx13);
1987             ty               = _mm256_mul_ps(fscal,dy13);
1988             tz               = _mm256_mul_ps(fscal,dz13);
1989
1990             /* Update vectorial force */
1991             fix1             = _mm256_add_ps(fix1,tx);
1992             fiy1             = _mm256_add_ps(fiy1,ty);
1993             fiz1             = _mm256_add_ps(fiz1,tz);
1994
1995             fjx3             = _mm256_add_ps(fjx3,tx);
1996             fjy3             = _mm256_add_ps(fjy3,ty);
1997             fjz3             = _mm256_add_ps(fjz3,tz);
1998
1999             /**************************
2000              * CALCULATE INTERACTIONS *
2001              **************************/
2002
2003             /* REACTION-FIELD ELECTROSTATICS */
2004             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
2005
2006             fscal            = felec;
2007
2008             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2009
2010             /* Calculate temporary vectorial force */
2011             tx               = _mm256_mul_ps(fscal,dx21);
2012             ty               = _mm256_mul_ps(fscal,dy21);
2013             tz               = _mm256_mul_ps(fscal,dz21);
2014
2015             /* Update vectorial force */
2016             fix2             = _mm256_add_ps(fix2,tx);
2017             fiy2             = _mm256_add_ps(fiy2,ty);
2018             fiz2             = _mm256_add_ps(fiz2,tz);
2019
2020             fjx1             = _mm256_add_ps(fjx1,tx);
2021             fjy1             = _mm256_add_ps(fjy1,ty);
2022             fjz1             = _mm256_add_ps(fjz1,tz);
2023
2024             /**************************
2025              * CALCULATE INTERACTIONS *
2026              **************************/
2027
2028             /* REACTION-FIELD ELECTROSTATICS */
2029             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
2030
2031             fscal            = felec;
2032
2033             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2034
2035             /* Calculate temporary vectorial force */
2036             tx               = _mm256_mul_ps(fscal,dx22);
2037             ty               = _mm256_mul_ps(fscal,dy22);
2038             tz               = _mm256_mul_ps(fscal,dz22);
2039
2040             /* Update vectorial force */
2041             fix2             = _mm256_add_ps(fix2,tx);
2042             fiy2             = _mm256_add_ps(fiy2,ty);
2043             fiz2             = _mm256_add_ps(fiz2,tz);
2044
2045             fjx2             = _mm256_add_ps(fjx2,tx);
2046             fjy2             = _mm256_add_ps(fjy2,ty);
2047             fjz2             = _mm256_add_ps(fjz2,tz);
2048
2049             /**************************
2050              * CALCULATE INTERACTIONS *
2051              **************************/
2052
2053             /* REACTION-FIELD ELECTROSTATICS */
2054             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
2055
2056             fscal            = felec;
2057
2058             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2059
2060             /* Calculate temporary vectorial force */
2061             tx               = _mm256_mul_ps(fscal,dx23);
2062             ty               = _mm256_mul_ps(fscal,dy23);
2063             tz               = _mm256_mul_ps(fscal,dz23);
2064
2065             /* Update vectorial force */
2066             fix2             = _mm256_add_ps(fix2,tx);
2067             fiy2             = _mm256_add_ps(fiy2,ty);
2068             fiz2             = _mm256_add_ps(fiz2,tz);
2069
2070             fjx3             = _mm256_add_ps(fjx3,tx);
2071             fjy3             = _mm256_add_ps(fjy3,ty);
2072             fjz3             = _mm256_add_ps(fjz3,tz);
2073
2074             /**************************
2075              * CALCULATE INTERACTIONS *
2076              **************************/
2077
2078             /* REACTION-FIELD ELECTROSTATICS */
2079             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
2080
2081             fscal            = felec;
2082
2083             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2084
2085             /* Calculate temporary vectorial force */
2086             tx               = _mm256_mul_ps(fscal,dx31);
2087             ty               = _mm256_mul_ps(fscal,dy31);
2088             tz               = _mm256_mul_ps(fscal,dz31);
2089
2090             /* Update vectorial force */
2091             fix3             = _mm256_add_ps(fix3,tx);
2092             fiy3             = _mm256_add_ps(fiy3,ty);
2093             fiz3             = _mm256_add_ps(fiz3,tz);
2094
2095             fjx1             = _mm256_add_ps(fjx1,tx);
2096             fjy1             = _mm256_add_ps(fjy1,ty);
2097             fjz1             = _mm256_add_ps(fjz1,tz);
2098
2099             /**************************
2100              * CALCULATE INTERACTIONS *
2101              **************************/
2102
2103             /* REACTION-FIELD ELECTROSTATICS */
2104             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
2105
2106             fscal            = felec;
2107
2108             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2109
2110             /* Calculate temporary vectorial force */
2111             tx               = _mm256_mul_ps(fscal,dx32);
2112             ty               = _mm256_mul_ps(fscal,dy32);
2113             tz               = _mm256_mul_ps(fscal,dz32);
2114
2115             /* Update vectorial force */
2116             fix3             = _mm256_add_ps(fix3,tx);
2117             fiy3             = _mm256_add_ps(fiy3,ty);
2118             fiz3             = _mm256_add_ps(fiz3,tz);
2119
2120             fjx2             = _mm256_add_ps(fjx2,tx);
2121             fjy2             = _mm256_add_ps(fjy2,ty);
2122             fjz2             = _mm256_add_ps(fjz2,tz);
2123
2124             /**************************
2125              * CALCULATE INTERACTIONS *
2126              **************************/
2127
2128             /* REACTION-FIELD ELECTROSTATICS */
2129             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
2130
2131             fscal            = felec;
2132
2133             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2134
2135             /* Calculate temporary vectorial force */
2136             tx               = _mm256_mul_ps(fscal,dx33);
2137             ty               = _mm256_mul_ps(fscal,dy33);
2138             tz               = _mm256_mul_ps(fscal,dz33);
2139
2140             /* Update vectorial force */
2141             fix3             = _mm256_add_ps(fix3,tx);
2142             fiy3             = _mm256_add_ps(fiy3,ty);
2143             fiz3             = _mm256_add_ps(fiz3,tz);
2144
2145             fjx3             = _mm256_add_ps(fjx3,tx);
2146             fjy3             = _mm256_add_ps(fjy3,ty);
2147             fjz3             = _mm256_add_ps(fjz3,tz);
2148
2149             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2150             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2151             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2152             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2153             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2154             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2155             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2156             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2157
2158             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2159                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2160                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2161
2162             /* Inner loop uses 295 flops */
2163         }
2164
2165         /* End of innermost loop */
2166
2167         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2168                                                  f+i_coord_offset,fshift+i_shift_offset);
2169
2170         /* Increment number of inner iterations */
2171         inneriter                  += j_index_end - j_index_start;
2172
2173         /* Outer loop uses 24 flops */
2174     }
2175
2176     /* Increment number of outer iterations */
2177     outeriter        += nri;
2178
2179     /* Update outer/inner flops */
2180
2181     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*295);
2182 }