Merge release-5-0 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_single
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            LJEwald
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrE,jnrF,jnrG,jnrH;
78     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
83     real             rcutoff_scalar;
84     real             *shiftvec,*fshift,*x,*f;
85     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
86     real             scratch[4*DIM];
87     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88     real *           vdwioffsetptr0;
89     real *           vdwgridioffsetptr0;
90     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
91     real *           vdwioffsetptr1;
92     real *           vdwgridioffsetptr1;
93     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
94     real *           vdwioffsetptr2;
95     real *           vdwgridioffsetptr2;
96     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
97     real *           vdwioffsetptr3;
98     real *           vdwgridioffsetptr3;
99     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
100     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
101     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
102     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
103     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
104     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
105     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
106     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
107     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
108     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
109     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
110     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
111     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
112     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
113     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
114     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
115     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
116     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
117     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
118     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
119     real             *charge;
120     int              nvdwtype;
121     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
122     int              *vdwtype;
123     real             *vdwparam;
124     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
125     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
126     __m256           c6grid_00;
127     __m256           c6grid_11;
128     __m256           c6grid_12;
129     __m256           c6grid_13;
130     __m256           c6grid_21;
131     __m256           c6grid_22;
132     __m256           c6grid_23;
133     __m256           c6grid_31;
134     __m256           c6grid_32;
135     __m256           c6grid_33;
136     real             *vdwgridparam;
137     __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
138     __m256           one_half  = _mm256_set1_ps(0.5);
139     __m256           minus_one = _mm256_set1_ps(-1.0);
140     __m256i          ewitab;
141     __m128i          ewitab_lo,ewitab_hi;
142     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
143     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
144     real             *ewtab;
145     __m256           dummy_mask,cutoff_mask;
146     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
147     __m256           one     = _mm256_set1_ps(1.0);
148     __m256           two     = _mm256_set1_ps(2.0);
149     x                = xx[0];
150     f                = ff[0];
151
152     nri              = nlist->nri;
153     iinr             = nlist->iinr;
154     jindex           = nlist->jindex;
155     jjnr             = nlist->jjnr;
156     shiftidx         = nlist->shift;
157     gid              = nlist->gid;
158     shiftvec         = fr->shift_vec[0];
159     fshift           = fr->fshift[0];
160     facel            = _mm256_set1_ps(fr->epsfac);
161     charge           = mdatoms->chargeA;
162     nvdwtype         = fr->ntype;
163     vdwparam         = fr->nbfp;
164     vdwtype          = mdatoms->typeA;
165     vdwgridparam     = fr->ljpme_c6grid;
166     sh_lj_ewald      = _mm256_set1_ps(fr->ic->sh_lj_ewald);
167     ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
168     ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
169
170     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
171     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
172     beta2            = _mm256_mul_ps(beta,beta);
173     beta3            = _mm256_mul_ps(beta,beta2);
174
175     ewtab            = fr->ic->tabq_coul_FDV0;
176     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
177     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
178
179     /* Setup water-specific parameters */
180     inr              = nlist->iinr[0];
181     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
182     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
183     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
184     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
185     vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
186
187     jq1              = _mm256_set1_ps(charge[inr+1]);
188     jq2              = _mm256_set1_ps(charge[inr+2]);
189     jq3              = _mm256_set1_ps(charge[inr+3]);
190     vdwjidx0A        = 2*vdwtype[inr+0];
191     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
192     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
193     c6grid_00        = _mm256_set1_ps(vdwgridioffsetptr0[vdwjidx0A]);
194     qq11             = _mm256_mul_ps(iq1,jq1);
195     qq12             = _mm256_mul_ps(iq1,jq2);
196     qq13             = _mm256_mul_ps(iq1,jq3);
197     qq21             = _mm256_mul_ps(iq2,jq1);
198     qq22             = _mm256_mul_ps(iq2,jq2);
199     qq23             = _mm256_mul_ps(iq2,jq3);
200     qq31             = _mm256_mul_ps(iq3,jq1);
201     qq32             = _mm256_mul_ps(iq3,jq2);
202     qq33             = _mm256_mul_ps(iq3,jq3);
203
204     /* Avoid stupid compiler warnings */
205     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
206     j_coord_offsetA = 0;
207     j_coord_offsetB = 0;
208     j_coord_offsetC = 0;
209     j_coord_offsetD = 0;
210     j_coord_offsetE = 0;
211     j_coord_offsetF = 0;
212     j_coord_offsetG = 0;
213     j_coord_offsetH = 0;
214
215     outeriter        = 0;
216     inneriter        = 0;
217
218     for(iidx=0;iidx<4*DIM;iidx++)
219     {
220         scratch[iidx] = 0.0;
221     }
222
223     /* Start outer loop over neighborlists */
224     for(iidx=0; iidx<nri; iidx++)
225     {
226         /* Load shift vector for this list */
227         i_shift_offset   = DIM*shiftidx[iidx];
228
229         /* Load limits for loop over neighbors */
230         j_index_start    = jindex[iidx];
231         j_index_end      = jindex[iidx+1];
232
233         /* Get outer coordinate index */
234         inr              = iinr[iidx];
235         i_coord_offset   = DIM*inr;
236
237         /* Load i particle coords and add shift vector */
238         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
239                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
240
241         fix0             = _mm256_setzero_ps();
242         fiy0             = _mm256_setzero_ps();
243         fiz0             = _mm256_setzero_ps();
244         fix1             = _mm256_setzero_ps();
245         fiy1             = _mm256_setzero_ps();
246         fiz1             = _mm256_setzero_ps();
247         fix2             = _mm256_setzero_ps();
248         fiy2             = _mm256_setzero_ps();
249         fiz2             = _mm256_setzero_ps();
250         fix3             = _mm256_setzero_ps();
251         fiy3             = _mm256_setzero_ps();
252         fiz3             = _mm256_setzero_ps();
253
254         /* Reset potential sums */
255         velecsum         = _mm256_setzero_ps();
256         vvdwsum          = _mm256_setzero_ps();
257
258         /* Start inner kernel loop */
259         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
260         {
261
262             /* Get j neighbor index, and coordinate index */
263             jnrA             = jjnr[jidx];
264             jnrB             = jjnr[jidx+1];
265             jnrC             = jjnr[jidx+2];
266             jnrD             = jjnr[jidx+3];
267             jnrE             = jjnr[jidx+4];
268             jnrF             = jjnr[jidx+5];
269             jnrG             = jjnr[jidx+6];
270             jnrH             = jjnr[jidx+7];
271             j_coord_offsetA  = DIM*jnrA;
272             j_coord_offsetB  = DIM*jnrB;
273             j_coord_offsetC  = DIM*jnrC;
274             j_coord_offsetD  = DIM*jnrD;
275             j_coord_offsetE  = DIM*jnrE;
276             j_coord_offsetF  = DIM*jnrF;
277             j_coord_offsetG  = DIM*jnrG;
278             j_coord_offsetH  = DIM*jnrH;
279
280             /* load j atom coordinates */
281             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
282                                                  x+j_coord_offsetC,x+j_coord_offsetD,
283                                                  x+j_coord_offsetE,x+j_coord_offsetF,
284                                                  x+j_coord_offsetG,x+j_coord_offsetH,
285                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
286                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
287
288             /* Calculate displacement vector */
289             dx00             = _mm256_sub_ps(ix0,jx0);
290             dy00             = _mm256_sub_ps(iy0,jy0);
291             dz00             = _mm256_sub_ps(iz0,jz0);
292             dx11             = _mm256_sub_ps(ix1,jx1);
293             dy11             = _mm256_sub_ps(iy1,jy1);
294             dz11             = _mm256_sub_ps(iz1,jz1);
295             dx12             = _mm256_sub_ps(ix1,jx2);
296             dy12             = _mm256_sub_ps(iy1,jy2);
297             dz12             = _mm256_sub_ps(iz1,jz2);
298             dx13             = _mm256_sub_ps(ix1,jx3);
299             dy13             = _mm256_sub_ps(iy1,jy3);
300             dz13             = _mm256_sub_ps(iz1,jz3);
301             dx21             = _mm256_sub_ps(ix2,jx1);
302             dy21             = _mm256_sub_ps(iy2,jy1);
303             dz21             = _mm256_sub_ps(iz2,jz1);
304             dx22             = _mm256_sub_ps(ix2,jx2);
305             dy22             = _mm256_sub_ps(iy2,jy2);
306             dz22             = _mm256_sub_ps(iz2,jz2);
307             dx23             = _mm256_sub_ps(ix2,jx3);
308             dy23             = _mm256_sub_ps(iy2,jy3);
309             dz23             = _mm256_sub_ps(iz2,jz3);
310             dx31             = _mm256_sub_ps(ix3,jx1);
311             dy31             = _mm256_sub_ps(iy3,jy1);
312             dz31             = _mm256_sub_ps(iz3,jz1);
313             dx32             = _mm256_sub_ps(ix3,jx2);
314             dy32             = _mm256_sub_ps(iy3,jy2);
315             dz32             = _mm256_sub_ps(iz3,jz2);
316             dx33             = _mm256_sub_ps(ix3,jx3);
317             dy33             = _mm256_sub_ps(iy3,jy3);
318             dz33             = _mm256_sub_ps(iz3,jz3);
319
320             /* Calculate squared distance and things based on it */
321             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
322             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
323             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
324             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
325             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
326             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
327             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
328             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
329             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
330             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
331
332             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
333             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
334             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
335             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
336             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
337             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
338             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
339             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
340             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
341             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
342
343             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
344             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
345             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
346             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
347             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
348             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
349             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
350             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
351             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
352             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
353
354             fjx0             = _mm256_setzero_ps();
355             fjy0             = _mm256_setzero_ps();
356             fjz0             = _mm256_setzero_ps();
357             fjx1             = _mm256_setzero_ps();
358             fjy1             = _mm256_setzero_ps();
359             fjz1             = _mm256_setzero_ps();
360             fjx2             = _mm256_setzero_ps();
361             fjy2             = _mm256_setzero_ps();
362             fjz2             = _mm256_setzero_ps();
363             fjx3             = _mm256_setzero_ps();
364             fjy3             = _mm256_setzero_ps();
365             fjz3             = _mm256_setzero_ps();
366
367             /**************************
368              * CALCULATE INTERACTIONS *
369              **************************/
370
371             r00              = _mm256_mul_ps(rsq00,rinv00);
372
373             /* Analytical LJ-PME */
374             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
375             ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
376             ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
377             exponent         = gmx_simd_exp_r(ewcljrsq);
378             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
379             poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
380             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
381             vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
382             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
383             vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
384             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
385             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
386
387             /* Update potential sum for this i atom from the interaction with this j atom. */
388             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
389
390             fscal            = fvdw;
391
392             /* Calculate temporary vectorial force */
393             tx               = _mm256_mul_ps(fscal,dx00);
394             ty               = _mm256_mul_ps(fscal,dy00);
395             tz               = _mm256_mul_ps(fscal,dz00);
396
397             /* Update vectorial force */
398             fix0             = _mm256_add_ps(fix0,tx);
399             fiy0             = _mm256_add_ps(fiy0,ty);
400             fiz0             = _mm256_add_ps(fiz0,tz);
401
402             fjx0             = _mm256_add_ps(fjx0,tx);
403             fjy0             = _mm256_add_ps(fjy0,ty);
404             fjz0             = _mm256_add_ps(fjz0,tz);
405
406             /**************************
407              * CALCULATE INTERACTIONS *
408              **************************/
409
410             r11              = _mm256_mul_ps(rsq11,rinv11);
411
412             /* EWALD ELECTROSTATICS */
413             
414             /* Analytical PME correction */
415             zeta2            = _mm256_mul_ps(beta2,rsq11);
416             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
417             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
418             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
419             felec            = _mm256_mul_ps(qq11,felec);
420             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
421             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
422             velec            = _mm256_sub_ps(rinv11,pmecorrV);
423             velec            = _mm256_mul_ps(qq11,velec);
424             
425             /* Update potential sum for this i atom from the interaction with this j atom. */
426             velecsum         = _mm256_add_ps(velecsum,velec);
427
428             fscal            = felec;
429
430             /* Calculate temporary vectorial force */
431             tx               = _mm256_mul_ps(fscal,dx11);
432             ty               = _mm256_mul_ps(fscal,dy11);
433             tz               = _mm256_mul_ps(fscal,dz11);
434
435             /* Update vectorial force */
436             fix1             = _mm256_add_ps(fix1,tx);
437             fiy1             = _mm256_add_ps(fiy1,ty);
438             fiz1             = _mm256_add_ps(fiz1,tz);
439
440             fjx1             = _mm256_add_ps(fjx1,tx);
441             fjy1             = _mm256_add_ps(fjy1,ty);
442             fjz1             = _mm256_add_ps(fjz1,tz);
443
444             /**************************
445              * CALCULATE INTERACTIONS *
446              **************************/
447
448             r12              = _mm256_mul_ps(rsq12,rinv12);
449
450             /* EWALD ELECTROSTATICS */
451             
452             /* Analytical PME correction */
453             zeta2            = _mm256_mul_ps(beta2,rsq12);
454             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
455             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
456             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
457             felec            = _mm256_mul_ps(qq12,felec);
458             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
459             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
460             velec            = _mm256_sub_ps(rinv12,pmecorrV);
461             velec            = _mm256_mul_ps(qq12,velec);
462             
463             /* Update potential sum for this i atom from the interaction with this j atom. */
464             velecsum         = _mm256_add_ps(velecsum,velec);
465
466             fscal            = felec;
467
468             /* Calculate temporary vectorial force */
469             tx               = _mm256_mul_ps(fscal,dx12);
470             ty               = _mm256_mul_ps(fscal,dy12);
471             tz               = _mm256_mul_ps(fscal,dz12);
472
473             /* Update vectorial force */
474             fix1             = _mm256_add_ps(fix1,tx);
475             fiy1             = _mm256_add_ps(fiy1,ty);
476             fiz1             = _mm256_add_ps(fiz1,tz);
477
478             fjx2             = _mm256_add_ps(fjx2,tx);
479             fjy2             = _mm256_add_ps(fjy2,ty);
480             fjz2             = _mm256_add_ps(fjz2,tz);
481
482             /**************************
483              * CALCULATE INTERACTIONS *
484              **************************/
485
486             r13              = _mm256_mul_ps(rsq13,rinv13);
487
488             /* EWALD ELECTROSTATICS */
489             
490             /* Analytical PME correction */
491             zeta2            = _mm256_mul_ps(beta2,rsq13);
492             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
493             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
494             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
495             felec            = _mm256_mul_ps(qq13,felec);
496             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
497             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
498             velec            = _mm256_sub_ps(rinv13,pmecorrV);
499             velec            = _mm256_mul_ps(qq13,velec);
500             
501             /* Update potential sum for this i atom from the interaction with this j atom. */
502             velecsum         = _mm256_add_ps(velecsum,velec);
503
504             fscal            = felec;
505
506             /* Calculate temporary vectorial force */
507             tx               = _mm256_mul_ps(fscal,dx13);
508             ty               = _mm256_mul_ps(fscal,dy13);
509             tz               = _mm256_mul_ps(fscal,dz13);
510
511             /* Update vectorial force */
512             fix1             = _mm256_add_ps(fix1,tx);
513             fiy1             = _mm256_add_ps(fiy1,ty);
514             fiz1             = _mm256_add_ps(fiz1,tz);
515
516             fjx3             = _mm256_add_ps(fjx3,tx);
517             fjy3             = _mm256_add_ps(fjy3,ty);
518             fjz3             = _mm256_add_ps(fjz3,tz);
519
520             /**************************
521              * CALCULATE INTERACTIONS *
522              **************************/
523
524             r21              = _mm256_mul_ps(rsq21,rinv21);
525
526             /* EWALD ELECTROSTATICS */
527             
528             /* Analytical PME correction */
529             zeta2            = _mm256_mul_ps(beta2,rsq21);
530             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
531             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
532             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
533             felec            = _mm256_mul_ps(qq21,felec);
534             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
535             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
536             velec            = _mm256_sub_ps(rinv21,pmecorrV);
537             velec            = _mm256_mul_ps(qq21,velec);
538             
539             /* Update potential sum for this i atom from the interaction with this j atom. */
540             velecsum         = _mm256_add_ps(velecsum,velec);
541
542             fscal            = felec;
543
544             /* Calculate temporary vectorial force */
545             tx               = _mm256_mul_ps(fscal,dx21);
546             ty               = _mm256_mul_ps(fscal,dy21);
547             tz               = _mm256_mul_ps(fscal,dz21);
548
549             /* Update vectorial force */
550             fix2             = _mm256_add_ps(fix2,tx);
551             fiy2             = _mm256_add_ps(fiy2,ty);
552             fiz2             = _mm256_add_ps(fiz2,tz);
553
554             fjx1             = _mm256_add_ps(fjx1,tx);
555             fjy1             = _mm256_add_ps(fjy1,ty);
556             fjz1             = _mm256_add_ps(fjz1,tz);
557
558             /**************************
559              * CALCULATE INTERACTIONS *
560              **************************/
561
562             r22              = _mm256_mul_ps(rsq22,rinv22);
563
564             /* EWALD ELECTROSTATICS */
565             
566             /* Analytical PME correction */
567             zeta2            = _mm256_mul_ps(beta2,rsq22);
568             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
569             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
570             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
571             felec            = _mm256_mul_ps(qq22,felec);
572             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
573             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
574             velec            = _mm256_sub_ps(rinv22,pmecorrV);
575             velec            = _mm256_mul_ps(qq22,velec);
576             
577             /* Update potential sum for this i atom from the interaction with this j atom. */
578             velecsum         = _mm256_add_ps(velecsum,velec);
579
580             fscal            = felec;
581
582             /* Calculate temporary vectorial force */
583             tx               = _mm256_mul_ps(fscal,dx22);
584             ty               = _mm256_mul_ps(fscal,dy22);
585             tz               = _mm256_mul_ps(fscal,dz22);
586
587             /* Update vectorial force */
588             fix2             = _mm256_add_ps(fix2,tx);
589             fiy2             = _mm256_add_ps(fiy2,ty);
590             fiz2             = _mm256_add_ps(fiz2,tz);
591
592             fjx2             = _mm256_add_ps(fjx2,tx);
593             fjy2             = _mm256_add_ps(fjy2,ty);
594             fjz2             = _mm256_add_ps(fjz2,tz);
595
596             /**************************
597              * CALCULATE INTERACTIONS *
598              **************************/
599
600             r23              = _mm256_mul_ps(rsq23,rinv23);
601
602             /* EWALD ELECTROSTATICS */
603             
604             /* Analytical PME correction */
605             zeta2            = _mm256_mul_ps(beta2,rsq23);
606             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
607             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
608             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
609             felec            = _mm256_mul_ps(qq23,felec);
610             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
611             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
612             velec            = _mm256_sub_ps(rinv23,pmecorrV);
613             velec            = _mm256_mul_ps(qq23,velec);
614             
615             /* Update potential sum for this i atom from the interaction with this j atom. */
616             velecsum         = _mm256_add_ps(velecsum,velec);
617
618             fscal            = felec;
619
620             /* Calculate temporary vectorial force */
621             tx               = _mm256_mul_ps(fscal,dx23);
622             ty               = _mm256_mul_ps(fscal,dy23);
623             tz               = _mm256_mul_ps(fscal,dz23);
624
625             /* Update vectorial force */
626             fix2             = _mm256_add_ps(fix2,tx);
627             fiy2             = _mm256_add_ps(fiy2,ty);
628             fiz2             = _mm256_add_ps(fiz2,tz);
629
630             fjx3             = _mm256_add_ps(fjx3,tx);
631             fjy3             = _mm256_add_ps(fjy3,ty);
632             fjz3             = _mm256_add_ps(fjz3,tz);
633
634             /**************************
635              * CALCULATE INTERACTIONS *
636              **************************/
637
638             r31              = _mm256_mul_ps(rsq31,rinv31);
639
640             /* EWALD ELECTROSTATICS */
641             
642             /* Analytical PME correction */
643             zeta2            = _mm256_mul_ps(beta2,rsq31);
644             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
645             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
646             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
647             felec            = _mm256_mul_ps(qq31,felec);
648             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
649             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
650             velec            = _mm256_sub_ps(rinv31,pmecorrV);
651             velec            = _mm256_mul_ps(qq31,velec);
652             
653             /* Update potential sum for this i atom from the interaction with this j atom. */
654             velecsum         = _mm256_add_ps(velecsum,velec);
655
656             fscal            = felec;
657
658             /* Calculate temporary vectorial force */
659             tx               = _mm256_mul_ps(fscal,dx31);
660             ty               = _mm256_mul_ps(fscal,dy31);
661             tz               = _mm256_mul_ps(fscal,dz31);
662
663             /* Update vectorial force */
664             fix3             = _mm256_add_ps(fix3,tx);
665             fiy3             = _mm256_add_ps(fiy3,ty);
666             fiz3             = _mm256_add_ps(fiz3,tz);
667
668             fjx1             = _mm256_add_ps(fjx1,tx);
669             fjy1             = _mm256_add_ps(fjy1,ty);
670             fjz1             = _mm256_add_ps(fjz1,tz);
671
672             /**************************
673              * CALCULATE INTERACTIONS *
674              **************************/
675
676             r32              = _mm256_mul_ps(rsq32,rinv32);
677
678             /* EWALD ELECTROSTATICS */
679             
680             /* Analytical PME correction */
681             zeta2            = _mm256_mul_ps(beta2,rsq32);
682             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
683             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
684             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
685             felec            = _mm256_mul_ps(qq32,felec);
686             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
687             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
688             velec            = _mm256_sub_ps(rinv32,pmecorrV);
689             velec            = _mm256_mul_ps(qq32,velec);
690             
691             /* Update potential sum for this i atom from the interaction with this j atom. */
692             velecsum         = _mm256_add_ps(velecsum,velec);
693
694             fscal            = felec;
695
696             /* Calculate temporary vectorial force */
697             tx               = _mm256_mul_ps(fscal,dx32);
698             ty               = _mm256_mul_ps(fscal,dy32);
699             tz               = _mm256_mul_ps(fscal,dz32);
700
701             /* Update vectorial force */
702             fix3             = _mm256_add_ps(fix3,tx);
703             fiy3             = _mm256_add_ps(fiy3,ty);
704             fiz3             = _mm256_add_ps(fiz3,tz);
705
706             fjx2             = _mm256_add_ps(fjx2,tx);
707             fjy2             = _mm256_add_ps(fjy2,ty);
708             fjz2             = _mm256_add_ps(fjz2,tz);
709
710             /**************************
711              * CALCULATE INTERACTIONS *
712              **************************/
713
714             r33              = _mm256_mul_ps(rsq33,rinv33);
715
716             /* EWALD ELECTROSTATICS */
717             
718             /* Analytical PME correction */
719             zeta2            = _mm256_mul_ps(beta2,rsq33);
720             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
721             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
722             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
723             felec            = _mm256_mul_ps(qq33,felec);
724             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
725             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
726             velec            = _mm256_sub_ps(rinv33,pmecorrV);
727             velec            = _mm256_mul_ps(qq33,velec);
728             
729             /* Update potential sum for this i atom from the interaction with this j atom. */
730             velecsum         = _mm256_add_ps(velecsum,velec);
731
732             fscal            = felec;
733
734             /* Calculate temporary vectorial force */
735             tx               = _mm256_mul_ps(fscal,dx33);
736             ty               = _mm256_mul_ps(fscal,dy33);
737             tz               = _mm256_mul_ps(fscal,dz33);
738
739             /* Update vectorial force */
740             fix3             = _mm256_add_ps(fix3,tx);
741             fiy3             = _mm256_add_ps(fiy3,ty);
742             fiz3             = _mm256_add_ps(fiz3,tz);
743
744             fjx3             = _mm256_add_ps(fjx3,tx);
745             fjy3             = _mm256_add_ps(fjy3,ty);
746             fjz3             = _mm256_add_ps(fjz3,tz);
747
748             fjptrA             = f+j_coord_offsetA;
749             fjptrB             = f+j_coord_offsetB;
750             fjptrC             = f+j_coord_offsetC;
751             fjptrD             = f+j_coord_offsetD;
752             fjptrE             = f+j_coord_offsetE;
753             fjptrF             = f+j_coord_offsetF;
754             fjptrG             = f+j_coord_offsetG;
755             fjptrH             = f+j_coord_offsetH;
756
757             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
758                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
759                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
760
761             /* Inner loop uses 810 flops */
762         }
763
764         if(jidx<j_index_end)
765         {
766
767             /* Get j neighbor index, and coordinate index */
768             jnrlistA         = jjnr[jidx];
769             jnrlistB         = jjnr[jidx+1];
770             jnrlistC         = jjnr[jidx+2];
771             jnrlistD         = jjnr[jidx+3];
772             jnrlistE         = jjnr[jidx+4];
773             jnrlistF         = jjnr[jidx+5];
774             jnrlistG         = jjnr[jidx+6];
775             jnrlistH         = jjnr[jidx+7];
776             /* Sign of each element will be negative for non-real atoms.
777              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
778              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
779              */
780             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
781                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
782                                             
783             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
784             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
785             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
786             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
787             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
788             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
789             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
790             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
791             j_coord_offsetA  = DIM*jnrA;
792             j_coord_offsetB  = DIM*jnrB;
793             j_coord_offsetC  = DIM*jnrC;
794             j_coord_offsetD  = DIM*jnrD;
795             j_coord_offsetE  = DIM*jnrE;
796             j_coord_offsetF  = DIM*jnrF;
797             j_coord_offsetG  = DIM*jnrG;
798             j_coord_offsetH  = DIM*jnrH;
799
800             /* load j atom coordinates */
801             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
802                                                  x+j_coord_offsetC,x+j_coord_offsetD,
803                                                  x+j_coord_offsetE,x+j_coord_offsetF,
804                                                  x+j_coord_offsetG,x+j_coord_offsetH,
805                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
806                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
807
808             /* Calculate displacement vector */
809             dx00             = _mm256_sub_ps(ix0,jx0);
810             dy00             = _mm256_sub_ps(iy0,jy0);
811             dz00             = _mm256_sub_ps(iz0,jz0);
812             dx11             = _mm256_sub_ps(ix1,jx1);
813             dy11             = _mm256_sub_ps(iy1,jy1);
814             dz11             = _mm256_sub_ps(iz1,jz1);
815             dx12             = _mm256_sub_ps(ix1,jx2);
816             dy12             = _mm256_sub_ps(iy1,jy2);
817             dz12             = _mm256_sub_ps(iz1,jz2);
818             dx13             = _mm256_sub_ps(ix1,jx3);
819             dy13             = _mm256_sub_ps(iy1,jy3);
820             dz13             = _mm256_sub_ps(iz1,jz3);
821             dx21             = _mm256_sub_ps(ix2,jx1);
822             dy21             = _mm256_sub_ps(iy2,jy1);
823             dz21             = _mm256_sub_ps(iz2,jz1);
824             dx22             = _mm256_sub_ps(ix2,jx2);
825             dy22             = _mm256_sub_ps(iy2,jy2);
826             dz22             = _mm256_sub_ps(iz2,jz2);
827             dx23             = _mm256_sub_ps(ix2,jx3);
828             dy23             = _mm256_sub_ps(iy2,jy3);
829             dz23             = _mm256_sub_ps(iz2,jz3);
830             dx31             = _mm256_sub_ps(ix3,jx1);
831             dy31             = _mm256_sub_ps(iy3,jy1);
832             dz31             = _mm256_sub_ps(iz3,jz1);
833             dx32             = _mm256_sub_ps(ix3,jx2);
834             dy32             = _mm256_sub_ps(iy3,jy2);
835             dz32             = _mm256_sub_ps(iz3,jz2);
836             dx33             = _mm256_sub_ps(ix3,jx3);
837             dy33             = _mm256_sub_ps(iy3,jy3);
838             dz33             = _mm256_sub_ps(iz3,jz3);
839
840             /* Calculate squared distance and things based on it */
841             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
842             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
843             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
844             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
845             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
846             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
847             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
848             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
849             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
850             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
851
852             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
853             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
854             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
855             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
856             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
857             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
858             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
859             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
860             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
861             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
862
863             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
864             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
865             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
866             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
867             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
868             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
869             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
870             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
871             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
872             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
873
874             fjx0             = _mm256_setzero_ps();
875             fjy0             = _mm256_setzero_ps();
876             fjz0             = _mm256_setzero_ps();
877             fjx1             = _mm256_setzero_ps();
878             fjy1             = _mm256_setzero_ps();
879             fjz1             = _mm256_setzero_ps();
880             fjx2             = _mm256_setzero_ps();
881             fjy2             = _mm256_setzero_ps();
882             fjz2             = _mm256_setzero_ps();
883             fjx3             = _mm256_setzero_ps();
884             fjy3             = _mm256_setzero_ps();
885             fjz3             = _mm256_setzero_ps();
886
887             /**************************
888              * CALCULATE INTERACTIONS *
889              **************************/
890
891             r00              = _mm256_mul_ps(rsq00,rinv00);
892             r00              = _mm256_andnot_ps(dummy_mask,r00);
893
894             /* Analytical LJ-PME */
895             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
896             ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
897             ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
898             exponent         = gmx_simd_exp_r(ewcljrsq);
899             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
900             poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
901             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
902             vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
903             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
904             vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
905             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
906             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
907
908             /* Update potential sum for this i atom from the interaction with this j atom. */
909             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
910             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
911
912             fscal            = fvdw;
913
914             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
915
916             /* Calculate temporary vectorial force */
917             tx               = _mm256_mul_ps(fscal,dx00);
918             ty               = _mm256_mul_ps(fscal,dy00);
919             tz               = _mm256_mul_ps(fscal,dz00);
920
921             /* Update vectorial force */
922             fix0             = _mm256_add_ps(fix0,tx);
923             fiy0             = _mm256_add_ps(fiy0,ty);
924             fiz0             = _mm256_add_ps(fiz0,tz);
925
926             fjx0             = _mm256_add_ps(fjx0,tx);
927             fjy0             = _mm256_add_ps(fjy0,ty);
928             fjz0             = _mm256_add_ps(fjz0,tz);
929
930             /**************************
931              * CALCULATE INTERACTIONS *
932              **************************/
933
934             r11              = _mm256_mul_ps(rsq11,rinv11);
935             r11              = _mm256_andnot_ps(dummy_mask,r11);
936
937             /* EWALD ELECTROSTATICS */
938             
939             /* Analytical PME correction */
940             zeta2            = _mm256_mul_ps(beta2,rsq11);
941             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
942             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
943             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
944             felec            = _mm256_mul_ps(qq11,felec);
945             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
946             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
947             velec            = _mm256_sub_ps(rinv11,pmecorrV);
948             velec            = _mm256_mul_ps(qq11,velec);
949             
950             /* Update potential sum for this i atom from the interaction with this j atom. */
951             velec            = _mm256_andnot_ps(dummy_mask,velec);
952             velecsum         = _mm256_add_ps(velecsum,velec);
953
954             fscal            = felec;
955
956             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
957
958             /* Calculate temporary vectorial force */
959             tx               = _mm256_mul_ps(fscal,dx11);
960             ty               = _mm256_mul_ps(fscal,dy11);
961             tz               = _mm256_mul_ps(fscal,dz11);
962
963             /* Update vectorial force */
964             fix1             = _mm256_add_ps(fix1,tx);
965             fiy1             = _mm256_add_ps(fiy1,ty);
966             fiz1             = _mm256_add_ps(fiz1,tz);
967
968             fjx1             = _mm256_add_ps(fjx1,tx);
969             fjy1             = _mm256_add_ps(fjy1,ty);
970             fjz1             = _mm256_add_ps(fjz1,tz);
971
972             /**************************
973              * CALCULATE INTERACTIONS *
974              **************************/
975
976             r12              = _mm256_mul_ps(rsq12,rinv12);
977             r12              = _mm256_andnot_ps(dummy_mask,r12);
978
979             /* EWALD ELECTROSTATICS */
980             
981             /* Analytical PME correction */
982             zeta2            = _mm256_mul_ps(beta2,rsq12);
983             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
984             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
985             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
986             felec            = _mm256_mul_ps(qq12,felec);
987             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
988             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
989             velec            = _mm256_sub_ps(rinv12,pmecorrV);
990             velec            = _mm256_mul_ps(qq12,velec);
991             
992             /* Update potential sum for this i atom from the interaction with this j atom. */
993             velec            = _mm256_andnot_ps(dummy_mask,velec);
994             velecsum         = _mm256_add_ps(velecsum,velec);
995
996             fscal            = felec;
997
998             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
999
1000             /* Calculate temporary vectorial force */
1001             tx               = _mm256_mul_ps(fscal,dx12);
1002             ty               = _mm256_mul_ps(fscal,dy12);
1003             tz               = _mm256_mul_ps(fscal,dz12);
1004
1005             /* Update vectorial force */
1006             fix1             = _mm256_add_ps(fix1,tx);
1007             fiy1             = _mm256_add_ps(fiy1,ty);
1008             fiz1             = _mm256_add_ps(fiz1,tz);
1009
1010             fjx2             = _mm256_add_ps(fjx2,tx);
1011             fjy2             = _mm256_add_ps(fjy2,ty);
1012             fjz2             = _mm256_add_ps(fjz2,tz);
1013
1014             /**************************
1015              * CALCULATE INTERACTIONS *
1016              **************************/
1017
1018             r13              = _mm256_mul_ps(rsq13,rinv13);
1019             r13              = _mm256_andnot_ps(dummy_mask,r13);
1020
1021             /* EWALD ELECTROSTATICS */
1022             
1023             /* Analytical PME correction */
1024             zeta2            = _mm256_mul_ps(beta2,rsq13);
1025             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
1026             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1027             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1028             felec            = _mm256_mul_ps(qq13,felec);
1029             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1030             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1031             velec            = _mm256_sub_ps(rinv13,pmecorrV);
1032             velec            = _mm256_mul_ps(qq13,velec);
1033             
1034             /* Update potential sum for this i atom from the interaction with this j atom. */
1035             velec            = _mm256_andnot_ps(dummy_mask,velec);
1036             velecsum         = _mm256_add_ps(velecsum,velec);
1037
1038             fscal            = felec;
1039
1040             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1041
1042             /* Calculate temporary vectorial force */
1043             tx               = _mm256_mul_ps(fscal,dx13);
1044             ty               = _mm256_mul_ps(fscal,dy13);
1045             tz               = _mm256_mul_ps(fscal,dz13);
1046
1047             /* Update vectorial force */
1048             fix1             = _mm256_add_ps(fix1,tx);
1049             fiy1             = _mm256_add_ps(fiy1,ty);
1050             fiz1             = _mm256_add_ps(fiz1,tz);
1051
1052             fjx3             = _mm256_add_ps(fjx3,tx);
1053             fjy3             = _mm256_add_ps(fjy3,ty);
1054             fjz3             = _mm256_add_ps(fjz3,tz);
1055
1056             /**************************
1057              * CALCULATE INTERACTIONS *
1058              **************************/
1059
1060             r21              = _mm256_mul_ps(rsq21,rinv21);
1061             r21              = _mm256_andnot_ps(dummy_mask,r21);
1062
1063             /* EWALD ELECTROSTATICS */
1064             
1065             /* Analytical PME correction */
1066             zeta2            = _mm256_mul_ps(beta2,rsq21);
1067             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1068             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1069             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1070             felec            = _mm256_mul_ps(qq21,felec);
1071             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1072             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1073             velec            = _mm256_sub_ps(rinv21,pmecorrV);
1074             velec            = _mm256_mul_ps(qq21,velec);
1075             
1076             /* Update potential sum for this i atom from the interaction with this j atom. */
1077             velec            = _mm256_andnot_ps(dummy_mask,velec);
1078             velecsum         = _mm256_add_ps(velecsum,velec);
1079
1080             fscal            = felec;
1081
1082             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1083
1084             /* Calculate temporary vectorial force */
1085             tx               = _mm256_mul_ps(fscal,dx21);
1086             ty               = _mm256_mul_ps(fscal,dy21);
1087             tz               = _mm256_mul_ps(fscal,dz21);
1088
1089             /* Update vectorial force */
1090             fix2             = _mm256_add_ps(fix2,tx);
1091             fiy2             = _mm256_add_ps(fiy2,ty);
1092             fiz2             = _mm256_add_ps(fiz2,tz);
1093
1094             fjx1             = _mm256_add_ps(fjx1,tx);
1095             fjy1             = _mm256_add_ps(fjy1,ty);
1096             fjz1             = _mm256_add_ps(fjz1,tz);
1097
1098             /**************************
1099              * CALCULATE INTERACTIONS *
1100              **************************/
1101
1102             r22              = _mm256_mul_ps(rsq22,rinv22);
1103             r22              = _mm256_andnot_ps(dummy_mask,r22);
1104
1105             /* EWALD ELECTROSTATICS */
1106             
1107             /* Analytical PME correction */
1108             zeta2            = _mm256_mul_ps(beta2,rsq22);
1109             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1110             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1111             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1112             felec            = _mm256_mul_ps(qq22,felec);
1113             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1114             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1115             velec            = _mm256_sub_ps(rinv22,pmecorrV);
1116             velec            = _mm256_mul_ps(qq22,velec);
1117             
1118             /* Update potential sum for this i atom from the interaction with this j atom. */
1119             velec            = _mm256_andnot_ps(dummy_mask,velec);
1120             velecsum         = _mm256_add_ps(velecsum,velec);
1121
1122             fscal            = felec;
1123
1124             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1125
1126             /* Calculate temporary vectorial force */
1127             tx               = _mm256_mul_ps(fscal,dx22);
1128             ty               = _mm256_mul_ps(fscal,dy22);
1129             tz               = _mm256_mul_ps(fscal,dz22);
1130
1131             /* Update vectorial force */
1132             fix2             = _mm256_add_ps(fix2,tx);
1133             fiy2             = _mm256_add_ps(fiy2,ty);
1134             fiz2             = _mm256_add_ps(fiz2,tz);
1135
1136             fjx2             = _mm256_add_ps(fjx2,tx);
1137             fjy2             = _mm256_add_ps(fjy2,ty);
1138             fjz2             = _mm256_add_ps(fjz2,tz);
1139
1140             /**************************
1141              * CALCULATE INTERACTIONS *
1142              **************************/
1143
1144             r23              = _mm256_mul_ps(rsq23,rinv23);
1145             r23              = _mm256_andnot_ps(dummy_mask,r23);
1146
1147             /* EWALD ELECTROSTATICS */
1148             
1149             /* Analytical PME correction */
1150             zeta2            = _mm256_mul_ps(beta2,rsq23);
1151             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
1152             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1153             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1154             felec            = _mm256_mul_ps(qq23,felec);
1155             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1156             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1157             velec            = _mm256_sub_ps(rinv23,pmecorrV);
1158             velec            = _mm256_mul_ps(qq23,velec);
1159             
1160             /* Update potential sum for this i atom from the interaction with this j atom. */
1161             velec            = _mm256_andnot_ps(dummy_mask,velec);
1162             velecsum         = _mm256_add_ps(velecsum,velec);
1163
1164             fscal            = felec;
1165
1166             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1167
1168             /* Calculate temporary vectorial force */
1169             tx               = _mm256_mul_ps(fscal,dx23);
1170             ty               = _mm256_mul_ps(fscal,dy23);
1171             tz               = _mm256_mul_ps(fscal,dz23);
1172
1173             /* Update vectorial force */
1174             fix2             = _mm256_add_ps(fix2,tx);
1175             fiy2             = _mm256_add_ps(fiy2,ty);
1176             fiz2             = _mm256_add_ps(fiz2,tz);
1177
1178             fjx3             = _mm256_add_ps(fjx3,tx);
1179             fjy3             = _mm256_add_ps(fjy3,ty);
1180             fjz3             = _mm256_add_ps(fjz3,tz);
1181
1182             /**************************
1183              * CALCULATE INTERACTIONS *
1184              **************************/
1185
1186             r31              = _mm256_mul_ps(rsq31,rinv31);
1187             r31              = _mm256_andnot_ps(dummy_mask,r31);
1188
1189             /* EWALD ELECTROSTATICS */
1190             
1191             /* Analytical PME correction */
1192             zeta2            = _mm256_mul_ps(beta2,rsq31);
1193             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
1194             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1195             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1196             felec            = _mm256_mul_ps(qq31,felec);
1197             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1198             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1199             velec            = _mm256_sub_ps(rinv31,pmecorrV);
1200             velec            = _mm256_mul_ps(qq31,velec);
1201             
1202             /* Update potential sum for this i atom from the interaction with this j atom. */
1203             velec            = _mm256_andnot_ps(dummy_mask,velec);
1204             velecsum         = _mm256_add_ps(velecsum,velec);
1205
1206             fscal            = felec;
1207
1208             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1209
1210             /* Calculate temporary vectorial force */
1211             tx               = _mm256_mul_ps(fscal,dx31);
1212             ty               = _mm256_mul_ps(fscal,dy31);
1213             tz               = _mm256_mul_ps(fscal,dz31);
1214
1215             /* Update vectorial force */
1216             fix3             = _mm256_add_ps(fix3,tx);
1217             fiy3             = _mm256_add_ps(fiy3,ty);
1218             fiz3             = _mm256_add_ps(fiz3,tz);
1219
1220             fjx1             = _mm256_add_ps(fjx1,tx);
1221             fjy1             = _mm256_add_ps(fjy1,ty);
1222             fjz1             = _mm256_add_ps(fjz1,tz);
1223
1224             /**************************
1225              * CALCULATE INTERACTIONS *
1226              **************************/
1227
1228             r32              = _mm256_mul_ps(rsq32,rinv32);
1229             r32              = _mm256_andnot_ps(dummy_mask,r32);
1230
1231             /* EWALD ELECTROSTATICS */
1232             
1233             /* Analytical PME correction */
1234             zeta2            = _mm256_mul_ps(beta2,rsq32);
1235             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
1236             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1237             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1238             felec            = _mm256_mul_ps(qq32,felec);
1239             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1240             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1241             velec            = _mm256_sub_ps(rinv32,pmecorrV);
1242             velec            = _mm256_mul_ps(qq32,velec);
1243             
1244             /* Update potential sum for this i atom from the interaction with this j atom. */
1245             velec            = _mm256_andnot_ps(dummy_mask,velec);
1246             velecsum         = _mm256_add_ps(velecsum,velec);
1247
1248             fscal            = felec;
1249
1250             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1251
1252             /* Calculate temporary vectorial force */
1253             tx               = _mm256_mul_ps(fscal,dx32);
1254             ty               = _mm256_mul_ps(fscal,dy32);
1255             tz               = _mm256_mul_ps(fscal,dz32);
1256
1257             /* Update vectorial force */
1258             fix3             = _mm256_add_ps(fix3,tx);
1259             fiy3             = _mm256_add_ps(fiy3,ty);
1260             fiz3             = _mm256_add_ps(fiz3,tz);
1261
1262             fjx2             = _mm256_add_ps(fjx2,tx);
1263             fjy2             = _mm256_add_ps(fjy2,ty);
1264             fjz2             = _mm256_add_ps(fjz2,tz);
1265
1266             /**************************
1267              * CALCULATE INTERACTIONS *
1268              **************************/
1269
1270             r33              = _mm256_mul_ps(rsq33,rinv33);
1271             r33              = _mm256_andnot_ps(dummy_mask,r33);
1272
1273             /* EWALD ELECTROSTATICS */
1274             
1275             /* Analytical PME correction */
1276             zeta2            = _mm256_mul_ps(beta2,rsq33);
1277             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
1278             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1279             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1280             felec            = _mm256_mul_ps(qq33,felec);
1281             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1282             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1283             velec            = _mm256_sub_ps(rinv33,pmecorrV);
1284             velec            = _mm256_mul_ps(qq33,velec);
1285             
1286             /* Update potential sum for this i atom from the interaction with this j atom. */
1287             velec            = _mm256_andnot_ps(dummy_mask,velec);
1288             velecsum         = _mm256_add_ps(velecsum,velec);
1289
1290             fscal            = felec;
1291
1292             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1293
1294             /* Calculate temporary vectorial force */
1295             tx               = _mm256_mul_ps(fscal,dx33);
1296             ty               = _mm256_mul_ps(fscal,dy33);
1297             tz               = _mm256_mul_ps(fscal,dz33);
1298
1299             /* Update vectorial force */
1300             fix3             = _mm256_add_ps(fix3,tx);
1301             fiy3             = _mm256_add_ps(fiy3,ty);
1302             fiz3             = _mm256_add_ps(fiz3,tz);
1303
1304             fjx3             = _mm256_add_ps(fjx3,tx);
1305             fjy3             = _mm256_add_ps(fjy3,ty);
1306             fjz3             = _mm256_add_ps(fjz3,tz);
1307
1308             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1309             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1310             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1311             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1312             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1313             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1314             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1315             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1316
1317             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1318                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1319                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1320
1321             /* Inner loop uses 820 flops */
1322         }
1323
1324         /* End of innermost loop */
1325
1326         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1327                                                  f+i_coord_offset,fshift+i_shift_offset);
1328
1329         ggid                        = gid[iidx];
1330         /* Update potential energies */
1331         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1332         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1333
1334         /* Increment number of inner iterations */
1335         inneriter                  += j_index_end - j_index_start;
1336
1337         /* Outer loop uses 26 flops */
1338     }
1339
1340     /* Increment number of outer iterations */
1341     outeriter        += nri;
1342
1343     /* Update outer/inner flops */
1344
1345     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*820);
1346 }
1347 /*
1348  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_single
1349  * Electrostatics interaction: Ewald
1350  * VdW interaction:            LJEwald
1351  * Geometry:                   Water4-Water4
1352  * Calculate force/pot:        Force
1353  */
1354 void
1355 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_single
1356                     (t_nblist                    * gmx_restrict       nlist,
1357                      rvec                        * gmx_restrict          xx,
1358                      rvec                        * gmx_restrict          ff,
1359                      t_forcerec                  * gmx_restrict          fr,
1360                      t_mdatoms                   * gmx_restrict     mdatoms,
1361                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1362                      t_nrnb                      * gmx_restrict        nrnb)
1363 {
1364     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1365      * just 0 for non-waters.
1366      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1367      * jnr indices corresponding to data put in the four positions in the SIMD register.
1368      */
1369     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1370     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1371     int              jnrA,jnrB,jnrC,jnrD;
1372     int              jnrE,jnrF,jnrG,jnrH;
1373     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1374     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1375     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1376     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1377     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1378     real             rcutoff_scalar;
1379     real             *shiftvec,*fshift,*x,*f;
1380     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1381     real             scratch[4*DIM];
1382     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1383     real *           vdwioffsetptr0;
1384     real *           vdwgridioffsetptr0;
1385     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1386     real *           vdwioffsetptr1;
1387     real *           vdwgridioffsetptr1;
1388     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1389     real *           vdwioffsetptr2;
1390     real *           vdwgridioffsetptr2;
1391     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1392     real *           vdwioffsetptr3;
1393     real *           vdwgridioffsetptr3;
1394     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1395     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1396     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1397     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1398     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1399     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1400     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1401     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1402     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1403     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1404     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1405     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1406     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1407     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1408     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1409     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1410     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1411     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1412     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1413     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1414     real             *charge;
1415     int              nvdwtype;
1416     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1417     int              *vdwtype;
1418     real             *vdwparam;
1419     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1420     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1421     __m256           c6grid_00;
1422     __m256           c6grid_11;
1423     __m256           c6grid_12;
1424     __m256           c6grid_13;
1425     __m256           c6grid_21;
1426     __m256           c6grid_22;
1427     __m256           c6grid_23;
1428     __m256           c6grid_31;
1429     __m256           c6grid_32;
1430     __m256           c6grid_33;
1431     real             *vdwgridparam;
1432     __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1433     __m256           one_half  = _mm256_set1_ps(0.5);
1434     __m256           minus_one = _mm256_set1_ps(-1.0);
1435     __m256i          ewitab;
1436     __m128i          ewitab_lo,ewitab_hi;
1437     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1438     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1439     real             *ewtab;
1440     __m256           dummy_mask,cutoff_mask;
1441     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1442     __m256           one     = _mm256_set1_ps(1.0);
1443     __m256           two     = _mm256_set1_ps(2.0);
1444     x                = xx[0];
1445     f                = ff[0];
1446
1447     nri              = nlist->nri;
1448     iinr             = nlist->iinr;
1449     jindex           = nlist->jindex;
1450     jjnr             = nlist->jjnr;
1451     shiftidx         = nlist->shift;
1452     gid              = nlist->gid;
1453     shiftvec         = fr->shift_vec[0];
1454     fshift           = fr->fshift[0];
1455     facel            = _mm256_set1_ps(fr->epsfac);
1456     charge           = mdatoms->chargeA;
1457     nvdwtype         = fr->ntype;
1458     vdwparam         = fr->nbfp;
1459     vdwtype          = mdatoms->typeA;
1460     vdwgridparam     = fr->ljpme_c6grid;
1461     sh_lj_ewald      = _mm256_set1_ps(fr->ic->sh_lj_ewald);
1462     ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
1463     ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
1464
1465     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
1466     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1467     beta2            = _mm256_mul_ps(beta,beta);
1468     beta3            = _mm256_mul_ps(beta,beta2);
1469
1470     ewtab            = fr->ic->tabq_coul_F;
1471     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
1472     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1473
1474     /* Setup water-specific parameters */
1475     inr              = nlist->iinr[0];
1476     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1477     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1478     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1479     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1480     vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
1481
1482     jq1              = _mm256_set1_ps(charge[inr+1]);
1483     jq2              = _mm256_set1_ps(charge[inr+2]);
1484     jq3              = _mm256_set1_ps(charge[inr+3]);
1485     vdwjidx0A        = 2*vdwtype[inr+0];
1486     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1487     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1488     c6grid_00        = _mm256_set1_ps(vdwgridioffsetptr0[vdwjidx0A]);
1489     qq11             = _mm256_mul_ps(iq1,jq1);
1490     qq12             = _mm256_mul_ps(iq1,jq2);
1491     qq13             = _mm256_mul_ps(iq1,jq3);
1492     qq21             = _mm256_mul_ps(iq2,jq1);
1493     qq22             = _mm256_mul_ps(iq2,jq2);
1494     qq23             = _mm256_mul_ps(iq2,jq3);
1495     qq31             = _mm256_mul_ps(iq3,jq1);
1496     qq32             = _mm256_mul_ps(iq3,jq2);
1497     qq33             = _mm256_mul_ps(iq3,jq3);
1498
1499     /* Avoid stupid compiler warnings */
1500     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1501     j_coord_offsetA = 0;
1502     j_coord_offsetB = 0;
1503     j_coord_offsetC = 0;
1504     j_coord_offsetD = 0;
1505     j_coord_offsetE = 0;
1506     j_coord_offsetF = 0;
1507     j_coord_offsetG = 0;
1508     j_coord_offsetH = 0;
1509
1510     outeriter        = 0;
1511     inneriter        = 0;
1512
1513     for(iidx=0;iidx<4*DIM;iidx++)
1514     {
1515         scratch[iidx] = 0.0;
1516     }
1517
1518     /* Start outer loop over neighborlists */
1519     for(iidx=0; iidx<nri; iidx++)
1520     {
1521         /* Load shift vector for this list */
1522         i_shift_offset   = DIM*shiftidx[iidx];
1523
1524         /* Load limits for loop over neighbors */
1525         j_index_start    = jindex[iidx];
1526         j_index_end      = jindex[iidx+1];
1527
1528         /* Get outer coordinate index */
1529         inr              = iinr[iidx];
1530         i_coord_offset   = DIM*inr;
1531
1532         /* Load i particle coords and add shift vector */
1533         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1534                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1535
1536         fix0             = _mm256_setzero_ps();
1537         fiy0             = _mm256_setzero_ps();
1538         fiz0             = _mm256_setzero_ps();
1539         fix1             = _mm256_setzero_ps();
1540         fiy1             = _mm256_setzero_ps();
1541         fiz1             = _mm256_setzero_ps();
1542         fix2             = _mm256_setzero_ps();
1543         fiy2             = _mm256_setzero_ps();
1544         fiz2             = _mm256_setzero_ps();
1545         fix3             = _mm256_setzero_ps();
1546         fiy3             = _mm256_setzero_ps();
1547         fiz3             = _mm256_setzero_ps();
1548
1549         /* Start inner kernel loop */
1550         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1551         {
1552
1553             /* Get j neighbor index, and coordinate index */
1554             jnrA             = jjnr[jidx];
1555             jnrB             = jjnr[jidx+1];
1556             jnrC             = jjnr[jidx+2];
1557             jnrD             = jjnr[jidx+3];
1558             jnrE             = jjnr[jidx+4];
1559             jnrF             = jjnr[jidx+5];
1560             jnrG             = jjnr[jidx+6];
1561             jnrH             = jjnr[jidx+7];
1562             j_coord_offsetA  = DIM*jnrA;
1563             j_coord_offsetB  = DIM*jnrB;
1564             j_coord_offsetC  = DIM*jnrC;
1565             j_coord_offsetD  = DIM*jnrD;
1566             j_coord_offsetE  = DIM*jnrE;
1567             j_coord_offsetF  = DIM*jnrF;
1568             j_coord_offsetG  = DIM*jnrG;
1569             j_coord_offsetH  = DIM*jnrH;
1570
1571             /* load j atom coordinates */
1572             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1573                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1574                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1575                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1576                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1577                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1578
1579             /* Calculate displacement vector */
1580             dx00             = _mm256_sub_ps(ix0,jx0);
1581             dy00             = _mm256_sub_ps(iy0,jy0);
1582             dz00             = _mm256_sub_ps(iz0,jz0);
1583             dx11             = _mm256_sub_ps(ix1,jx1);
1584             dy11             = _mm256_sub_ps(iy1,jy1);
1585             dz11             = _mm256_sub_ps(iz1,jz1);
1586             dx12             = _mm256_sub_ps(ix1,jx2);
1587             dy12             = _mm256_sub_ps(iy1,jy2);
1588             dz12             = _mm256_sub_ps(iz1,jz2);
1589             dx13             = _mm256_sub_ps(ix1,jx3);
1590             dy13             = _mm256_sub_ps(iy1,jy3);
1591             dz13             = _mm256_sub_ps(iz1,jz3);
1592             dx21             = _mm256_sub_ps(ix2,jx1);
1593             dy21             = _mm256_sub_ps(iy2,jy1);
1594             dz21             = _mm256_sub_ps(iz2,jz1);
1595             dx22             = _mm256_sub_ps(ix2,jx2);
1596             dy22             = _mm256_sub_ps(iy2,jy2);
1597             dz22             = _mm256_sub_ps(iz2,jz2);
1598             dx23             = _mm256_sub_ps(ix2,jx3);
1599             dy23             = _mm256_sub_ps(iy2,jy3);
1600             dz23             = _mm256_sub_ps(iz2,jz3);
1601             dx31             = _mm256_sub_ps(ix3,jx1);
1602             dy31             = _mm256_sub_ps(iy3,jy1);
1603             dz31             = _mm256_sub_ps(iz3,jz1);
1604             dx32             = _mm256_sub_ps(ix3,jx2);
1605             dy32             = _mm256_sub_ps(iy3,jy2);
1606             dz32             = _mm256_sub_ps(iz3,jz2);
1607             dx33             = _mm256_sub_ps(ix3,jx3);
1608             dy33             = _mm256_sub_ps(iy3,jy3);
1609             dz33             = _mm256_sub_ps(iz3,jz3);
1610
1611             /* Calculate squared distance and things based on it */
1612             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1613             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1614             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1615             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1616             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1617             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1618             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1619             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1620             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1621             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1622
1623             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1624             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1625             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1626             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1627             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1628             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1629             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1630             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1631             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1632             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1633
1634             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1635             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1636             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1637             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1638             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1639             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1640             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1641             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1642             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1643             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1644
1645             fjx0             = _mm256_setzero_ps();
1646             fjy0             = _mm256_setzero_ps();
1647             fjz0             = _mm256_setzero_ps();
1648             fjx1             = _mm256_setzero_ps();
1649             fjy1             = _mm256_setzero_ps();
1650             fjz1             = _mm256_setzero_ps();
1651             fjx2             = _mm256_setzero_ps();
1652             fjy2             = _mm256_setzero_ps();
1653             fjz2             = _mm256_setzero_ps();
1654             fjx3             = _mm256_setzero_ps();
1655             fjy3             = _mm256_setzero_ps();
1656             fjz3             = _mm256_setzero_ps();
1657
1658             /**************************
1659              * CALCULATE INTERACTIONS *
1660              **************************/
1661
1662             r00              = _mm256_mul_ps(rsq00,rinv00);
1663
1664             /* Analytical LJ-PME */
1665             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1666             ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
1667             ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
1668             exponent         = gmx_simd_exp_r(ewcljrsq);
1669             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1670             poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1671             /* f6A = 6 * C6grid * (1 - poly) */
1672             f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
1673             /* f6B = C6grid * exponent * beta^6 */
1674             f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
1675             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1676             fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1677
1678             fscal            = fvdw;
1679
1680             /* Calculate temporary vectorial force */
1681             tx               = _mm256_mul_ps(fscal,dx00);
1682             ty               = _mm256_mul_ps(fscal,dy00);
1683             tz               = _mm256_mul_ps(fscal,dz00);
1684
1685             /* Update vectorial force */
1686             fix0             = _mm256_add_ps(fix0,tx);
1687             fiy0             = _mm256_add_ps(fiy0,ty);
1688             fiz0             = _mm256_add_ps(fiz0,tz);
1689
1690             fjx0             = _mm256_add_ps(fjx0,tx);
1691             fjy0             = _mm256_add_ps(fjy0,ty);
1692             fjz0             = _mm256_add_ps(fjz0,tz);
1693
1694             /**************************
1695              * CALCULATE INTERACTIONS *
1696              **************************/
1697
1698             r11              = _mm256_mul_ps(rsq11,rinv11);
1699
1700             /* EWALD ELECTROSTATICS */
1701             
1702             /* Analytical PME correction */
1703             zeta2            = _mm256_mul_ps(beta2,rsq11);
1704             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1705             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1706             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1707             felec            = _mm256_mul_ps(qq11,felec);
1708             
1709             fscal            = felec;
1710
1711             /* Calculate temporary vectorial force */
1712             tx               = _mm256_mul_ps(fscal,dx11);
1713             ty               = _mm256_mul_ps(fscal,dy11);
1714             tz               = _mm256_mul_ps(fscal,dz11);
1715
1716             /* Update vectorial force */
1717             fix1             = _mm256_add_ps(fix1,tx);
1718             fiy1             = _mm256_add_ps(fiy1,ty);
1719             fiz1             = _mm256_add_ps(fiz1,tz);
1720
1721             fjx1             = _mm256_add_ps(fjx1,tx);
1722             fjy1             = _mm256_add_ps(fjy1,ty);
1723             fjz1             = _mm256_add_ps(fjz1,tz);
1724
1725             /**************************
1726              * CALCULATE INTERACTIONS *
1727              **************************/
1728
1729             r12              = _mm256_mul_ps(rsq12,rinv12);
1730
1731             /* EWALD ELECTROSTATICS */
1732             
1733             /* Analytical PME correction */
1734             zeta2            = _mm256_mul_ps(beta2,rsq12);
1735             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1736             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1737             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1738             felec            = _mm256_mul_ps(qq12,felec);
1739             
1740             fscal            = felec;
1741
1742             /* Calculate temporary vectorial force */
1743             tx               = _mm256_mul_ps(fscal,dx12);
1744             ty               = _mm256_mul_ps(fscal,dy12);
1745             tz               = _mm256_mul_ps(fscal,dz12);
1746
1747             /* Update vectorial force */
1748             fix1             = _mm256_add_ps(fix1,tx);
1749             fiy1             = _mm256_add_ps(fiy1,ty);
1750             fiz1             = _mm256_add_ps(fiz1,tz);
1751
1752             fjx2             = _mm256_add_ps(fjx2,tx);
1753             fjy2             = _mm256_add_ps(fjy2,ty);
1754             fjz2             = _mm256_add_ps(fjz2,tz);
1755
1756             /**************************
1757              * CALCULATE INTERACTIONS *
1758              **************************/
1759
1760             r13              = _mm256_mul_ps(rsq13,rinv13);
1761
1762             /* EWALD ELECTROSTATICS */
1763             
1764             /* Analytical PME correction */
1765             zeta2            = _mm256_mul_ps(beta2,rsq13);
1766             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
1767             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1768             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1769             felec            = _mm256_mul_ps(qq13,felec);
1770             
1771             fscal            = felec;
1772
1773             /* Calculate temporary vectorial force */
1774             tx               = _mm256_mul_ps(fscal,dx13);
1775             ty               = _mm256_mul_ps(fscal,dy13);
1776             tz               = _mm256_mul_ps(fscal,dz13);
1777
1778             /* Update vectorial force */
1779             fix1             = _mm256_add_ps(fix1,tx);
1780             fiy1             = _mm256_add_ps(fiy1,ty);
1781             fiz1             = _mm256_add_ps(fiz1,tz);
1782
1783             fjx3             = _mm256_add_ps(fjx3,tx);
1784             fjy3             = _mm256_add_ps(fjy3,ty);
1785             fjz3             = _mm256_add_ps(fjz3,tz);
1786
1787             /**************************
1788              * CALCULATE INTERACTIONS *
1789              **************************/
1790
1791             r21              = _mm256_mul_ps(rsq21,rinv21);
1792
1793             /* EWALD ELECTROSTATICS */
1794             
1795             /* Analytical PME correction */
1796             zeta2            = _mm256_mul_ps(beta2,rsq21);
1797             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1798             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1799             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1800             felec            = _mm256_mul_ps(qq21,felec);
1801             
1802             fscal            = felec;
1803
1804             /* Calculate temporary vectorial force */
1805             tx               = _mm256_mul_ps(fscal,dx21);
1806             ty               = _mm256_mul_ps(fscal,dy21);
1807             tz               = _mm256_mul_ps(fscal,dz21);
1808
1809             /* Update vectorial force */
1810             fix2             = _mm256_add_ps(fix2,tx);
1811             fiy2             = _mm256_add_ps(fiy2,ty);
1812             fiz2             = _mm256_add_ps(fiz2,tz);
1813
1814             fjx1             = _mm256_add_ps(fjx1,tx);
1815             fjy1             = _mm256_add_ps(fjy1,ty);
1816             fjz1             = _mm256_add_ps(fjz1,tz);
1817
1818             /**************************
1819              * CALCULATE INTERACTIONS *
1820              **************************/
1821
1822             r22              = _mm256_mul_ps(rsq22,rinv22);
1823
1824             /* EWALD ELECTROSTATICS */
1825             
1826             /* Analytical PME correction */
1827             zeta2            = _mm256_mul_ps(beta2,rsq22);
1828             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1829             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1830             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1831             felec            = _mm256_mul_ps(qq22,felec);
1832             
1833             fscal            = felec;
1834
1835             /* Calculate temporary vectorial force */
1836             tx               = _mm256_mul_ps(fscal,dx22);
1837             ty               = _mm256_mul_ps(fscal,dy22);
1838             tz               = _mm256_mul_ps(fscal,dz22);
1839
1840             /* Update vectorial force */
1841             fix2             = _mm256_add_ps(fix2,tx);
1842             fiy2             = _mm256_add_ps(fiy2,ty);
1843             fiz2             = _mm256_add_ps(fiz2,tz);
1844
1845             fjx2             = _mm256_add_ps(fjx2,tx);
1846             fjy2             = _mm256_add_ps(fjy2,ty);
1847             fjz2             = _mm256_add_ps(fjz2,tz);
1848
1849             /**************************
1850              * CALCULATE INTERACTIONS *
1851              **************************/
1852
1853             r23              = _mm256_mul_ps(rsq23,rinv23);
1854
1855             /* EWALD ELECTROSTATICS */
1856             
1857             /* Analytical PME correction */
1858             zeta2            = _mm256_mul_ps(beta2,rsq23);
1859             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
1860             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1861             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1862             felec            = _mm256_mul_ps(qq23,felec);
1863             
1864             fscal            = felec;
1865
1866             /* Calculate temporary vectorial force */
1867             tx               = _mm256_mul_ps(fscal,dx23);
1868             ty               = _mm256_mul_ps(fscal,dy23);
1869             tz               = _mm256_mul_ps(fscal,dz23);
1870
1871             /* Update vectorial force */
1872             fix2             = _mm256_add_ps(fix2,tx);
1873             fiy2             = _mm256_add_ps(fiy2,ty);
1874             fiz2             = _mm256_add_ps(fiz2,tz);
1875
1876             fjx3             = _mm256_add_ps(fjx3,tx);
1877             fjy3             = _mm256_add_ps(fjy3,ty);
1878             fjz3             = _mm256_add_ps(fjz3,tz);
1879
1880             /**************************
1881              * CALCULATE INTERACTIONS *
1882              **************************/
1883
1884             r31              = _mm256_mul_ps(rsq31,rinv31);
1885
1886             /* EWALD ELECTROSTATICS */
1887             
1888             /* Analytical PME correction */
1889             zeta2            = _mm256_mul_ps(beta2,rsq31);
1890             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
1891             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1892             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1893             felec            = _mm256_mul_ps(qq31,felec);
1894             
1895             fscal            = felec;
1896
1897             /* Calculate temporary vectorial force */
1898             tx               = _mm256_mul_ps(fscal,dx31);
1899             ty               = _mm256_mul_ps(fscal,dy31);
1900             tz               = _mm256_mul_ps(fscal,dz31);
1901
1902             /* Update vectorial force */
1903             fix3             = _mm256_add_ps(fix3,tx);
1904             fiy3             = _mm256_add_ps(fiy3,ty);
1905             fiz3             = _mm256_add_ps(fiz3,tz);
1906
1907             fjx1             = _mm256_add_ps(fjx1,tx);
1908             fjy1             = _mm256_add_ps(fjy1,ty);
1909             fjz1             = _mm256_add_ps(fjz1,tz);
1910
1911             /**************************
1912              * CALCULATE INTERACTIONS *
1913              **************************/
1914
1915             r32              = _mm256_mul_ps(rsq32,rinv32);
1916
1917             /* EWALD ELECTROSTATICS */
1918             
1919             /* Analytical PME correction */
1920             zeta2            = _mm256_mul_ps(beta2,rsq32);
1921             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
1922             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1923             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1924             felec            = _mm256_mul_ps(qq32,felec);
1925             
1926             fscal            = felec;
1927
1928             /* Calculate temporary vectorial force */
1929             tx               = _mm256_mul_ps(fscal,dx32);
1930             ty               = _mm256_mul_ps(fscal,dy32);
1931             tz               = _mm256_mul_ps(fscal,dz32);
1932
1933             /* Update vectorial force */
1934             fix3             = _mm256_add_ps(fix3,tx);
1935             fiy3             = _mm256_add_ps(fiy3,ty);
1936             fiz3             = _mm256_add_ps(fiz3,tz);
1937
1938             fjx2             = _mm256_add_ps(fjx2,tx);
1939             fjy2             = _mm256_add_ps(fjy2,ty);
1940             fjz2             = _mm256_add_ps(fjz2,tz);
1941
1942             /**************************
1943              * CALCULATE INTERACTIONS *
1944              **************************/
1945
1946             r33              = _mm256_mul_ps(rsq33,rinv33);
1947
1948             /* EWALD ELECTROSTATICS */
1949             
1950             /* Analytical PME correction */
1951             zeta2            = _mm256_mul_ps(beta2,rsq33);
1952             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
1953             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1954             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1955             felec            = _mm256_mul_ps(qq33,felec);
1956             
1957             fscal            = felec;
1958
1959             /* Calculate temporary vectorial force */
1960             tx               = _mm256_mul_ps(fscal,dx33);
1961             ty               = _mm256_mul_ps(fscal,dy33);
1962             tz               = _mm256_mul_ps(fscal,dz33);
1963
1964             /* Update vectorial force */
1965             fix3             = _mm256_add_ps(fix3,tx);
1966             fiy3             = _mm256_add_ps(fiy3,ty);
1967             fiz3             = _mm256_add_ps(fiz3,tz);
1968
1969             fjx3             = _mm256_add_ps(fjx3,tx);
1970             fjy3             = _mm256_add_ps(fjy3,ty);
1971             fjz3             = _mm256_add_ps(fjz3,tz);
1972
1973             fjptrA             = f+j_coord_offsetA;
1974             fjptrB             = f+j_coord_offsetB;
1975             fjptrC             = f+j_coord_offsetC;
1976             fjptrD             = f+j_coord_offsetD;
1977             fjptrE             = f+j_coord_offsetE;
1978             fjptrF             = f+j_coord_offsetF;
1979             fjptrG             = f+j_coord_offsetG;
1980             fjptrH             = f+j_coord_offsetH;
1981
1982             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1983                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1984                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1985
1986             /* Inner loop uses 553 flops */
1987         }
1988
1989         if(jidx<j_index_end)
1990         {
1991
1992             /* Get j neighbor index, and coordinate index */
1993             jnrlistA         = jjnr[jidx];
1994             jnrlistB         = jjnr[jidx+1];
1995             jnrlistC         = jjnr[jidx+2];
1996             jnrlistD         = jjnr[jidx+3];
1997             jnrlistE         = jjnr[jidx+4];
1998             jnrlistF         = jjnr[jidx+5];
1999             jnrlistG         = jjnr[jidx+6];
2000             jnrlistH         = jjnr[jidx+7];
2001             /* Sign of each element will be negative for non-real atoms.
2002              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2003              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2004              */
2005             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2006                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2007                                             
2008             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2009             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2010             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2011             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2012             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
2013             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
2014             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
2015             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
2016             j_coord_offsetA  = DIM*jnrA;
2017             j_coord_offsetB  = DIM*jnrB;
2018             j_coord_offsetC  = DIM*jnrC;
2019             j_coord_offsetD  = DIM*jnrD;
2020             j_coord_offsetE  = DIM*jnrE;
2021             j_coord_offsetF  = DIM*jnrF;
2022             j_coord_offsetG  = DIM*jnrG;
2023             j_coord_offsetH  = DIM*jnrH;
2024
2025             /* load j atom coordinates */
2026             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2027                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2028                                                  x+j_coord_offsetE,x+j_coord_offsetF,
2029                                                  x+j_coord_offsetG,x+j_coord_offsetH,
2030                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2031                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
2032
2033             /* Calculate displacement vector */
2034             dx00             = _mm256_sub_ps(ix0,jx0);
2035             dy00             = _mm256_sub_ps(iy0,jy0);
2036             dz00             = _mm256_sub_ps(iz0,jz0);
2037             dx11             = _mm256_sub_ps(ix1,jx1);
2038             dy11             = _mm256_sub_ps(iy1,jy1);
2039             dz11             = _mm256_sub_ps(iz1,jz1);
2040             dx12             = _mm256_sub_ps(ix1,jx2);
2041             dy12             = _mm256_sub_ps(iy1,jy2);
2042             dz12             = _mm256_sub_ps(iz1,jz2);
2043             dx13             = _mm256_sub_ps(ix1,jx3);
2044             dy13             = _mm256_sub_ps(iy1,jy3);
2045             dz13             = _mm256_sub_ps(iz1,jz3);
2046             dx21             = _mm256_sub_ps(ix2,jx1);
2047             dy21             = _mm256_sub_ps(iy2,jy1);
2048             dz21             = _mm256_sub_ps(iz2,jz1);
2049             dx22             = _mm256_sub_ps(ix2,jx2);
2050             dy22             = _mm256_sub_ps(iy2,jy2);
2051             dz22             = _mm256_sub_ps(iz2,jz2);
2052             dx23             = _mm256_sub_ps(ix2,jx3);
2053             dy23             = _mm256_sub_ps(iy2,jy3);
2054             dz23             = _mm256_sub_ps(iz2,jz3);
2055             dx31             = _mm256_sub_ps(ix3,jx1);
2056             dy31             = _mm256_sub_ps(iy3,jy1);
2057             dz31             = _mm256_sub_ps(iz3,jz1);
2058             dx32             = _mm256_sub_ps(ix3,jx2);
2059             dy32             = _mm256_sub_ps(iy3,jy2);
2060             dz32             = _mm256_sub_ps(iz3,jz2);
2061             dx33             = _mm256_sub_ps(ix3,jx3);
2062             dy33             = _mm256_sub_ps(iy3,jy3);
2063             dz33             = _mm256_sub_ps(iz3,jz3);
2064
2065             /* Calculate squared distance and things based on it */
2066             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2067             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2068             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2069             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2070             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2071             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2072             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2073             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2074             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2075             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2076
2077             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
2078             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
2079             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
2080             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
2081             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
2082             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
2083             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
2084             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
2085             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
2086             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
2087
2088             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
2089             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
2090             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
2091             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
2092             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
2093             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
2094             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
2095             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
2096             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
2097             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
2098
2099             fjx0             = _mm256_setzero_ps();
2100             fjy0             = _mm256_setzero_ps();
2101             fjz0             = _mm256_setzero_ps();
2102             fjx1             = _mm256_setzero_ps();
2103             fjy1             = _mm256_setzero_ps();
2104             fjz1             = _mm256_setzero_ps();
2105             fjx2             = _mm256_setzero_ps();
2106             fjy2             = _mm256_setzero_ps();
2107             fjz2             = _mm256_setzero_ps();
2108             fjx3             = _mm256_setzero_ps();
2109             fjy3             = _mm256_setzero_ps();
2110             fjz3             = _mm256_setzero_ps();
2111
2112             /**************************
2113              * CALCULATE INTERACTIONS *
2114              **************************/
2115
2116             r00              = _mm256_mul_ps(rsq00,rinv00);
2117             r00              = _mm256_andnot_ps(dummy_mask,r00);
2118
2119             /* Analytical LJ-PME */
2120             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2121             ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
2122             ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
2123             exponent         = gmx_simd_exp_r(ewcljrsq);
2124             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2125             poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
2126             /* f6A = 6 * C6grid * (1 - poly) */
2127             f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
2128             /* f6B = C6grid * exponent * beta^6 */
2129             f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
2130             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2131             fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2132
2133             fscal            = fvdw;
2134
2135             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2136
2137             /* Calculate temporary vectorial force */
2138             tx               = _mm256_mul_ps(fscal,dx00);
2139             ty               = _mm256_mul_ps(fscal,dy00);
2140             tz               = _mm256_mul_ps(fscal,dz00);
2141
2142             /* Update vectorial force */
2143             fix0             = _mm256_add_ps(fix0,tx);
2144             fiy0             = _mm256_add_ps(fiy0,ty);
2145             fiz0             = _mm256_add_ps(fiz0,tz);
2146
2147             fjx0             = _mm256_add_ps(fjx0,tx);
2148             fjy0             = _mm256_add_ps(fjy0,ty);
2149             fjz0             = _mm256_add_ps(fjz0,tz);
2150
2151             /**************************
2152              * CALCULATE INTERACTIONS *
2153              **************************/
2154
2155             r11              = _mm256_mul_ps(rsq11,rinv11);
2156             r11              = _mm256_andnot_ps(dummy_mask,r11);
2157
2158             /* EWALD ELECTROSTATICS */
2159             
2160             /* Analytical PME correction */
2161             zeta2            = _mm256_mul_ps(beta2,rsq11);
2162             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
2163             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2164             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2165             felec            = _mm256_mul_ps(qq11,felec);
2166             
2167             fscal            = felec;
2168
2169             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2170
2171             /* Calculate temporary vectorial force */
2172             tx               = _mm256_mul_ps(fscal,dx11);
2173             ty               = _mm256_mul_ps(fscal,dy11);
2174             tz               = _mm256_mul_ps(fscal,dz11);
2175
2176             /* Update vectorial force */
2177             fix1             = _mm256_add_ps(fix1,tx);
2178             fiy1             = _mm256_add_ps(fiy1,ty);
2179             fiz1             = _mm256_add_ps(fiz1,tz);
2180
2181             fjx1             = _mm256_add_ps(fjx1,tx);
2182             fjy1             = _mm256_add_ps(fjy1,ty);
2183             fjz1             = _mm256_add_ps(fjz1,tz);
2184
2185             /**************************
2186              * CALCULATE INTERACTIONS *
2187              **************************/
2188
2189             r12              = _mm256_mul_ps(rsq12,rinv12);
2190             r12              = _mm256_andnot_ps(dummy_mask,r12);
2191
2192             /* EWALD ELECTROSTATICS */
2193             
2194             /* Analytical PME correction */
2195             zeta2            = _mm256_mul_ps(beta2,rsq12);
2196             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
2197             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2198             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2199             felec            = _mm256_mul_ps(qq12,felec);
2200             
2201             fscal            = felec;
2202
2203             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2204
2205             /* Calculate temporary vectorial force */
2206             tx               = _mm256_mul_ps(fscal,dx12);
2207             ty               = _mm256_mul_ps(fscal,dy12);
2208             tz               = _mm256_mul_ps(fscal,dz12);
2209
2210             /* Update vectorial force */
2211             fix1             = _mm256_add_ps(fix1,tx);
2212             fiy1             = _mm256_add_ps(fiy1,ty);
2213             fiz1             = _mm256_add_ps(fiz1,tz);
2214
2215             fjx2             = _mm256_add_ps(fjx2,tx);
2216             fjy2             = _mm256_add_ps(fjy2,ty);
2217             fjz2             = _mm256_add_ps(fjz2,tz);
2218
2219             /**************************
2220              * CALCULATE INTERACTIONS *
2221              **************************/
2222
2223             r13              = _mm256_mul_ps(rsq13,rinv13);
2224             r13              = _mm256_andnot_ps(dummy_mask,r13);
2225
2226             /* EWALD ELECTROSTATICS */
2227             
2228             /* Analytical PME correction */
2229             zeta2            = _mm256_mul_ps(beta2,rsq13);
2230             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
2231             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2232             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2233             felec            = _mm256_mul_ps(qq13,felec);
2234             
2235             fscal            = felec;
2236
2237             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2238
2239             /* Calculate temporary vectorial force */
2240             tx               = _mm256_mul_ps(fscal,dx13);
2241             ty               = _mm256_mul_ps(fscal,dy13);
2242             tz               = _mm256_mul_ps(fscal,dz13);
2243
2244             /* Update vectorial force */
2245             fix1             = _mm256_add_ps(fix1,tx);
2246             fiy1             = _mm256_add_ps(fiy1,ty);
2247             fiz1             = _mm256_add_ps(fiz1,tz);
2248
2249             fjx3             = _mm256_add_ps(fjx3,tx);
2250             fjy3             = _mm256_add_ps(fjy3,ty);
2251             fjz3             = _mm256_add_ps(fjz3,tz);
2252
2253             /**************************
2254              * CALCULATE INTERACTIONS *
2255              **************************/
2256
2257             r21              = _mm256_mul_ps(rsq21,rinv21);
2258             r21              = _mm256_andnot_ps(dummy_mask,r21);
2259
2260             /* EWALD ELECTROSTATICS */
2261             
2262             /* Analytical PME correction */
2263             zeta2            = _mm256_mul_ps(beta2,rsq21);
2264             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
2265             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2266             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2267             felec            = _mm256_mul_ps(qq21,felec);
2268             
2269             fscal            = felec;
2270
2271             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2272
2273             /* Calculate temporary vectorial force */
2274             tx               = _mm256_mul_ps(fscal,dx21);
2275             ty               = _mm256_mul_ps(fscal,dy21);
2276             tz               = _mm256_mul_ps(fscal,dz21);
2277
2278             /* Update vectorial force */
2279             fix2             = _mm256_add_ps(fix2,tx);
2280             fiy2             = _mm256_add_ps(fiy2,ty);
2281             fiz2             = _mm256_add_ps(fiz2,tz);
2282
2283             fjx1             = _mm256_add_ps(fjx1,tx);
2284             fjy1             = _mm256_add_ps(fjy1,ty);
2285             fjz1             = _mm256_add_ps(fjz1,tz);
2286
2287             /**************************
2288              * CALCULATE INTERACTIONS *
2289              **************************/
2290
2291             r22              = _mm256_mul_ps(rsq22,rinv22);
2292             r22              = _mm256_andnot_ps(dummy_mask,r22);
2293
2294             /* EWALD ELECTROSTATICS */
2295             
2296             /* Analytical PME correction */
2297             zeta2            = _mm256_mul_ps(beta2,rsq22);
2298             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2299             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2300             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2301             felec            = _mm256_mul_ps(qq22,felec);
2302             
2303             fscal            = felec;
2304
2305             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2306
2307             /* Calculate temporary vectorial force */
2308             tx               = _mm256_mul_ps(fscal,dx22);
2309             ty               = _mm256_mul_ps(fscal,dy22);
2310             tz               = _mm256_mul_ps(fscal,dz22);
2311
2312             /* Update vectorial force */
2313             fix2             = _mm256_add_ps(fix2,tx);
2314             fiy2             = _mm256_add_ps(fiy2,ty);
2315             fiz2             = _mm256_add_ps(fiz2,tz);
2316
2317             fjx2             = _mm256_add_ps(fjx2,tx);
2318             fjy2             = _mm256_add_ps(fjy2,ty);
2319             fjz2             = _mm256_add_ps(fjz2,tz);
2320
2321             /**************************
2322              * CALCULATE INTERACTIONS *
2323              **************************/
2324
2325             r23              = _mm256_mul_ps(rsq23,rinv23);
2326             r23              = _mm256_andnot_ps(dummy_mask,r23);
2327
2328             /* EWALD ELECTROSTATICS */
2329             
2330             /* Analytical PME correction */
2331             zeta2            = _mm256_mul_ps(beta2,rsq23);
2332             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
2333             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2334             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2335             felec            = _mm256_mul_ps(qq23,felec);
2336             
2337             fscal            = felec;
2338
2339             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2340
2341             /* Calculate temporary vectorial force */
2342             tx               = _mm256_mul_ps(fscal,dx23);
2343             ty               = _mm256_mul_ps(fscal,dy23);
2344             tz               = _mm256_mul_ps(fscal,dz23);
2345
2346             /* Update vectorial force */
2347             fix2             = _mm256_add_ps(fix2,tx);
2348             fiy2             = _mm256_add_ps(fiy2,ty);
2349             fiz2             = _mm256_add_ps(fiz2,tz);
2350
2351             fjx3             = _mm256_add_ps(fjx3,tx);
2352             fjy3             = _mm256_add_ps(fjy3,ty);
2353             fjz3             = _mm256_add_ps(fjz3,tz);
2354
2355             /**************************
2356              * CALCULATE INTERACTIONS *
2357              **************************/
2358
2359             r31              = _mm256_mul_ps(rsq31,rinv31);
2360             r31              = _mm256_andnot_ps(dummy_mask,r31);
2361
2362             /* EWALD ELECTROSTATICS */
2363             
2364             /* Analytical PME correction */
2365             zeta2            = _mm256_mul_ps(beta2,rsq31);
2366             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
2367             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2368             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2369             felec            = _mm256_mul_ps(qq31,felec);
2370             
2371             fscal            = felec;
2372
2373             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2374
2375             /* Calculate temporary vectorial force */
2376             tx               = _mm256_mul_ps(fscal,dx31);
2377             ty               = _mm256_mul_ps(fscal,dy31);
2378             tz               = _mm256_mul_ps(fscal,dz31);
2379
2380             /* Update vectorial force */
2381             fix3             = _mm256_add_ps(fix3,tx);
2382             fiy3             = _mm256_add_ps(fiy3,ty);
2383             fiz3             = _mm256_add_ps(fiz3,tz);
2384
2385             fjx1             = _mm256_add_ps(fjx1,tx);
2386             fjy1             = _mm256_add_ps(fjy1,ty);
2387             fjz1             = _mm256_add_ps(fjz1,tz);
2388
2389             /**************************
2390              * CALCULATE INTERACTIONS *
2391              **************************/
2392
2393             r32              = _mm256_mul_ps(rsq32,rinv32);
2394             r32              = _mm256_andnot_ps(dummy_mask,r32);
2395
2396             /* EWALD ELECTROSTATICS */
2397             
2398             /* Analytical PME correction */
2399             zeta2            = _mm256_mul_ps(beta2,rsq32);
2400             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
2401             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2402             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2403             felec            = _mm256_mul_ps(qq32,felec);
2404             
2405             fscal            = felec;
2406
2407             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2408
2409             /* Calculate temporary vectorial force */
2410             tx               = _mm256_mul_ps(fscal,dx32);
2411             ty               = _mm256_mul_ps(fscal,dy32);
2412             tz               = _mm256_mul_ps(fscal,dz32);
2413
2414             /* Update vectorial force */
2415             fix3             = _mm256_add_ps(fix3,tx);
2416             fiy3             = _mm256_add_ps(fiy3,ty);
2417             fiz3             = _mm256_add_ps(fiz3,tz);
2418
2419             fjx2             = _mm256_add_ps(fjx2,tx);
2420             fjy2             = _mm256_add_ps(fjy2,ty);
2421             fjz2             = _mm256_add_ps(fjz2,tz);
2422
2423             /**************************
2424              * CALCULATE INTERACTIONS *
2425              **************************/
2426
2427             r33              = _mm256_mul_ps(rsq33,rinv33);
2428             r33              = _mm256_andnot_ps(dummy_mask,r33);
2429
2430             /* EWALD ELECTROSTATICS */
2431             
2432             /* Analytical PME correction */
2433             zeta2            = _mm256_mul_ps(beta2,rsq33);
2434             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
2435             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2436             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2437             felec            = _mm256_mul_ps(qq33,felec);
2438             
2439             fscal            = felec;
2440
2441             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2442
2443             /* Calculate temporary vectorial force */
2444             tx               = _mm256_mul_ps(fscal,dx33);
2445             ty               = _mm256_mul_ps(fscal,dy33);
2446             tz               = _mm256_mul_ps(fscal,dz33);
2447
2448             /* Update vectorial force */
2449             fix3             = _mm256_add_ps(fix3,tx);
2450             fiy3             = _mm256_add_ps(fiy3,ty);
2451             fiz3             = _mm256_add_ps(fiz3,tz);
2452
2453             fjx3             = _mm256_add_ps(fjx3,tx);
2454             fjy3             = _mm256_add_ps(fjy3,ty);
2455             fjz3             = _mm256_add_ps(fjz3,tz);
2456
2457             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2458             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2459             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2460             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2461             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2462             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2463             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2464             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2465
2466             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2467                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2468                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2469
2470             /* Inner loop uses 563 flops */
2471         }
2472
2473         /* End of innermost loop */
2474
2475         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2476                                                  f+i_coord_offset,fshift+i_shift_offset);
2477
2478         /* Increment number of inner iterations */
2479         inneriter                  += j_index_end - j_index_start;
2480
2481         /* Outer loop uses 24 flops */
2482     }
2483
2484     /* Increment number of outer iterations */
2485     outeriter        += nri;
2486
2487     /* Update outer/inner flops */
2488
2489     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*563);
2490 }